diff options
| author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-01-17 16:15:55 -0500 |
|---|---|---|
| committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-01-17 16:15:55 -0500 |
| commit | 8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch) | |
| tree | a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/ipv4 | |
| parent | 406089d01562f1e2bf9f089fd7637009ebaad589 (diff) | |
Patched in Tegra support.
Diffstat (limited to 'net/ipv4')
92 files changed, 5920 insertions, 11321 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 5a19aeb8609..cbb505ba932 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
| @@ -163,6 +163,8 @@ config IP_PNP_RARP | |||
| 163 | operating on your network. Read | 163 | operating on your network. Read |
| 164 | <file:Documentation/filesystems/nfs/nfsroot.txt> for details. | 164 | <file:Documentation/filesystems/nfs/nfsroot.txt> for details. |
| 165 | 165 | ||
| 166 | # not yet ready.. | ||
| 167 | # bool ' IP: ARP support' CONFIG_IP_PNP_ARP | ||
| 166 | config NET_IPIP | 168 | config NET_IPIP |
| 167 | tristate "IP: tunneling" | 169 | tristate "IP: tunneling" |
| 168 | select INET_TUNNEL | 170 | select INET_TUNNEL |
| @@ -262,8 +264,8 @@ config ARPD | |||
| 262 | bool "IP: ARP daemon support" | 264 | bool "IP: ARP daemon support" |
| 263 | ---help--- | 265 | ---help--- |
| 264 | The kernel maintains an internal cache which maps IP addresses to | 266 | The kernel maintains an internal cache which maps IP addresses to |
| 265 | hardware addresses on the local network, so that Ethernet | 267 | hardware addresses on the local network, so that Ethernet/Token Ring/ |
| 266 | frames are sent to the proper address on the physical networking | 268 | etc. frames are sent to the proper address on the physical networking |
| 267 | layer. Normally, kernel uses the ARP protocol to resolve these | 269 | layer. Normally, kernel uses the ARP protocol to resolve these |
| 268 | mappings. | 270 | mappings. |
| 269 | 271 | ||
| @@ -310,20 +312,9 @@ config SYN_COOKIES | |||
| 310 | 312 | ||
| 311 | If unsure, say N. | 313 | If unsure, say N. |
| 312 | 314 | ||
| 313 | config NET_IPVTI | ||
| 314 | tristate "Virtual (secure) IP: tunneling" | ||
| 315 | select INET_TUNNEL | ||
| 316 | depends on INET_XFRM_MODE_TUNNEL | ||
| 317 | ---help--- | ||
| 318 | Tunneling means encapsulating data of one protocol type within | ||
| 319 | another protocol and sending it over a channel that understands the | ||
| 320 | encapsulating protocol. This can be used with xfrm mode tunnel to give | ||
| 321 | the notion of a secure tunnel for IPSEC and then use routing protocol | ||
| 322 | on top. | ||
| 323 | |||
| 324 | config INET_AH | 315 | config INET_AH |
| 325 | tristate "IP: AH transformation" | 316 | tristate "IP: AH transformation" |
| 326 | select XFRM_ALGO | 317 | select XFRM |
| 327 | select CRYPTO | 318 | select CRYPTO |
| 328 | select CRYPTO_HMAC | 319 | select CRYPTO_HMAC |
| 329 | select CRYPTO_MD5 | 320 | select CRYPTO_MD5 |
| @@ -335,7 +326,7 @@ config INET_AH | |||
| 335 | 326 | ||
| 336 | config INET_ESP | 327 | config INET_ESP |
| 337 | tristate "IP: ESP transformation" | 328 | tristate "IP: ESP transformation" |
| 338 | select XFRM_ALGO | 329 | select XFRM |
| 339 | select CRYPTO | 330 | select CRYPTO |
| 340 | select CRYPTO_AUTHENC | 331 | select CRYPTO_AUTHENC |
| 341 | select CRYPTO_HMAC | 332 | select CRYPTO_HMAC |
| @@ -418,14 +409,6 @@ config INET_TCP_DIAG | |||
| 418 | depends on INET_DIAG | 409 | depends on INET_DIAG |
| 419 | def_tristate INET_DIAG | 410 | def_tristate INET_DIAG |
| 420 | 411 | ||
| 421 | config INET_UDP_DIAG | ||
| 422 | tristate "UDP: socket monitoring interface" | ||
| 423 | depends on INET_DIAG && (IPV6 || IPV6=n) | ||
| 424 | default n | ||
| 425 | ---help--- | ||
| 426 | Support for UDP socket monitoring interface used by the ss tool. | ||
| 427 | If unsure, say Y. | ||
| 428 | |||
| 429 | menuconfig TCP_CONG_ADVANCED | 412 | menuconfig TCP_CONG_ADVANCED |
| 430 | bool "TCP: advanced congestion control" | 413 | bool "TCP: advanced congestion control" |
| 431 | ---help--- | 414 | ---help--- |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 15ca63ec604..681084d76a9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
| @@ -7,20 +7,20 @@ obj-y := route.o inetpeer.o protocol.o \ | |||
| 7 | ip_output.o ip_sockglue.o inet_hashtables.o \ | 7 | ip_output.o ip_sockglue.o inet_hashtables.o \ |
| 8 | inet_timewait_sock.o inet_connection_sock.o \ | 8 | inet_timewait_sock.o inet_connection_sock.o \ |
| 9 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ | 9 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ |
| 10 | tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ | 10 | tcp_minisocks.o tcp_cong.o \ |
| 11 | datagram.o raw.o udp.o udplite.o \ | 11 | datagram.o raw.o udp.o udplite.o \ |
| 12 | arp.o icmp.o devinet.o af_inet.o igmp.o \ | 12 | arp.o icmp.o devinet.o af_inet.o igmp.o \ |
| 13 | fib_frontend.o fib_semantics.o fib_trie.o \ | 13 | fib_frontend.o fib_semantics.o fib_trie.o \ |
| 14 | inet_fragment.o ping.o | 14 | inet_fragment.o ping.o |
| 15 | 15 | ||
| 16 | obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o | 16 | obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o |
| 17 | obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o | ||
| 17 | obj-$(CONFIG_PROC_FS) += proc.o | 18 | obj-$(CONFIG_PROC_FS) += proc.o |
| 18 | obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o | 19 | obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o |
| 19 | obj-$(CONFIG_IP_MROUTE) += ipmr.o | 20 | obj-$(CONFIG_IP_MROUTE) += ipmr.o |
| 20 | obj-$(CONFIG_NET_IPIP) += ipip.o | 21 | obj-$(CONFIG_NET_IPIP) += ipip.o |
| 21 | obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o | 22 | obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o |
| 22 | obj-$(CONFIG_NET_IPGRE) += ip_gre.o | 23 | obj-$(CONFIG_NET_IPGRE) += ip_gre.o |
| 23 | obj-$(CONFIG_NET_IPVTI) += ip_vti.o | ||
| 24 | obj-$(CONFIG_SYN_COOKIES) += syncookies.o | 24 | obj-$(CONFIG_SYN_COOKIES) += syncookies.o |
| 25 | obj-$(CONFIG_INET_AH) += ah4.o | 25 | obj-$(CONFIG_INET_AH) += ah4.o |
| 26 | obj-$(CONFIG_INET_ESP) += esp4.o | 26 | obj-$(CONFIG_INET_ESP) += esp4.o |
| @@ -35,7 +35,6 @@ obj-$(CONFIG_IP_PNP) += ipconfig.o | |||
| 35 | obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ | 35 | obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ |
| 36 | obj-$(CONFIG_INET_DIAG) += inet_diag.o | 36 | obj-$(CONFIG_INET_DIAG) += inet_diag.o |
| 37 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o | 37 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o |
| 38 | obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o | ||
| 39 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o | 38 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o |
| 40 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | 39 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o |
| 41 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o | 40 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o |
| @@ -49,7 +48,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o | |||
| 49 | obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o | 48 | obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o |
| 50 | obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o | 49 | obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o |
| 51 | obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o | 50 | obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o |
| 52 | obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o | ||
| 53 | obj-$(CONFIG_NETLABEL) += cipso_ipv4.o | 51 | obj-$(CONFIG_NETLABEL) += cipso_ipv4.o |
| 54 | 52 | ||
| 55 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ | 53 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 24b384b7903..bf488051a8d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
| @@ -65,8 +65,6 @@ | |||
| 65 | * 2 of the License, or (at your option) any later version. | 65 | * 2 of the License, or (at your option) any later version. |
| 66 | */ | 66 | */ |
| 67 | 67 | ||
| 68 | #define pr_fmt(fmt) "IPv4: " fmt | ||
| 69 | |||
| 70 | #include <linux/err.h> | 68 | #include <linux/err.h> |
| 71 | #include <linux/errno.h> | 69 | #include <linux/errno.h> |
| 72 | #include <linux/types.h> | 70 | #include <linux/types.h> |
| @@ -91,6 +89,7 @@ | |||
| 91 | #include <linux/slab.h> | 89 | #include <linux/slab.h> |
| 92 | 90 | ||
| 93 | #include <asm/uaccess.h> | 91 | #include <asm/uaccess.h> |
| 92 | #include <asm/system.h> | ||
| 94 | 93 | ||
| 95 | #include <linux/inet.h> | 94 | #include <linux/inet.h> |
| 96 | #include <linux/igmp.h> | 95 | #include <linux/igmp.h> |
| @@ -119,6 +118,19 @@ | |||
| 119 | #include <linux/mroute.h> | 118 | #include <linux/mroute.h> |
| 120 | #endif | 119 | #endif |
| 121 | 120 | ||
| 121 | #ifdef CONFIG_ANDROID_PARANOID_NETWORK | ||
| 122 | #include <linux/android_aid.h> | ||
| 123 | |||
| 124 | static inline int current_has_network(void) | ||
| 125 | { | ||
| 126 | return in_egroup_p(AID_INET) || capable(CAP_NET_RAW); | ||
| 127 | } | ||
| 128 | #else | ||
| 129 | static inline int current_has_network(void) | ||
| 130 | { | ||
| 131 | return 1; | ||
| 132 | } | ||
| 133 | #endif | ||
| 122 | 134 | ||
| 123 | /* The inetsw table contains everything that inet_create needs to | 135 | /* The inetsw table contains everything that inet_create needs to |
| 124 | * build a new socket. | 136 | * build a new socket. |
| @@ -157,7 +169,6 @@ void inet_sock_destruct(struct sock *sk) | |||
| 157 | 169 | ||
| 158 | kfree(rcu_dereference_protected(inet->inet_opt, 1)); | 170 | kfree(rcu_dereference_protected(inet->inet_opt, 1)); |
| 159 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); | 171 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); |
| 160 | dst_release(sk->sk_rx_dst); | ||
| 161 | sk_refcnt_debug_dec(sk); | 172 | sk_refcnt_debug_dec(sk); |
| 162 | } | 173 | } |
| 163 | EXPORT_SYMBOL(inet_sock_destruct); | 174 | EXPORT_SYMBOL(inet_sock_destruct); |
| @@ -212,26 +223,6 @@ int inet_listen(struct socket *sock, int backlog) | |||
| 212 | * we can only allow the backlog to be adjusted. | 223 | * we can only allow the backlog to be adjusted. |
| 213 | */ | 224 | */ |
| 214 | if (old_state != TCP_LISTEN) { | 225 | if (old_state != TCP_LISTEN) { |
| 215 | /* Check special setups for testing purpose to enable TFO w/o | ||
| 216 | * requiring TCP_FASTOPEN sockopt. | ||
| 217 | * Note that only TCP sockets (SOCK_STREAM) will reach here. | ||
| 218 | * Also fastopenq may already been allocated because this | ||
| 219 | * socket was in TCP_LISTEN state previously but was | ||
| 220 | * shutdown() (rather than close()). | ||
| 221 | */ | ||
| 222 | if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && | ||
| 223 | inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { | ||
| 224 | if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) | ||
| 225 | err = fastopen_init_queue(sk, backlog); | ||
| 226 | else if ((sysctl_tcp_fastopen & | ||
| 227 | TFO_SERVER_WO_SOCKOPT2) != 0) | ||
| 228 | err = fastopen_init_queue(sk, | ||
| 229 | ((uint)sysctl_tcp_fastopen) >> 16); | ||
| 230 | else | ||
| 231 | err = 0; | ||
| 232 | if (err) | ||
| 233 | goto out; | ||
| 234 | } | ||
| 235 | err = inet_csk_listen_start(sk, backlog); | 226 | err = inet_csk_listen_start(sk, backlog); |
| 236 | if (err) | 227 | if (err) |
| 237 | goto out; | 228 | goto out; |
| @@ -263,21 +254,24 @@ void build_ehash_secret(void) | |||
| 263 | } | 254 | } |
| 264 | EXPORT_SYMBOL(build_ehash_secret); | 255 | EXPORT_SYMBOL(build_ehash_secret); |
| 265 | 256 | ||
| 266 | static inline int inet_netns_ok(struct net *net, __u8 protocol) | 257 | static inline int inet_netns_ok(struct net *net, int protocol) |
| 267 | { | 258 | { |
| 259 | int hash; | ||
| 268 | const struct net_protocol *ipprot; | 260 | const struct net_protocol *ipprot; |
| 269 | 261 | ||
| 270 | if (net_eq(net, &init_net)) | 262 | if (net_eq(net, &init_net)) |
| 271 | return 1; | 263 | return 1; |
| 272 | 264 | ||
| 273 | ipprot = rcu_dereference(inet_protos[protocol]); | 265 | hash = protocol & (MAX_INET_PROTOS - 1); |
| 274 | if (ipprot == NULL) { | 266 | ipprot = rcu_dereference(inet_protos[hash]); |
| 267 | |||
| 268 | if (ipprot == NULL) | ||
| 275 | /* raw IP is OK */ | 269 | /* raw IP is OK */ |
| 276 | return 1; | 270 | return 1; |
| 277 | } | ||
| 278 | return ipprot->netns_ok; | 271 | return ipprot->netns_ok; |
| 279 | } | 272 | } |
| 280 | 273 | ||
| 274 | |||
| 281 | /* | 275 | /* |
| 282 | * Create an inet socket. | 276 | * Create an inet socket. |
| 283 | */ | 277 | */ |
| @@ -294,6 +288,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, | |||
| 294 | int try_loading_module = 0; | 288 | int try_loading_module = 0; |
| 295 | int err; | 289 | int err; |
| 296 | 290 | ||
| 291 | if (!current_has_network()) | ||
| 292 | return -EACCES; | ||
| 293 | |||
| 297 | if (unlikely(!inet_ehash_secret)) | 294 | if (unlikely(!inet_ehash_secret)) |
| 298 | if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) | 295 | if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) |
| 299 | build_ehash_secret(); | 296 | build_ehash_secret(); |
| @@ -346,8 +343,7 @@ lookup_protocol: | |||
| 346 | } | 343 | } |
| 347 | 344 | ||
| 348 | err = -EPERM; | 345 | err = -EPERM; |
| 349 | if (sock->type == SOCK_RAW && !kern && | 346 | if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) |
| 350 | !ns_capable(net->user_ns, CAP_NET_RAW)) | ||
| 351 | goto out_rcu_unlock; | 347 | goto out_rcu_unlock; |
| 352 | 348 | ||
| 353 | err = -EAFNOSUPPORT; | 349 | err = -EAFNOSUPPORT; |
| @@ -370,7 +366,7 @@ lookup_protocol: | |||
| 370 | err = 0; | 366 | err = 0; |
| 371 | sk->sk_no_check = answer_no_check; | 367 | sk->sk_no_check = answer_no_check; |
| 372 | if (INET_PROTOSW_REUSE & answer_flags) | 368 | if (INET_PROTOSW_REUSE & answer_flags) |
| 373 | sk->sk_reuse = SK_CAN_REUSE; | 369 | sk->sk_reuse = 1; |
| 374 | 370 | ||
| 375 | inet = inet_sk(sk); | 371 | inet = inet_sk(sk); |
| 376 | inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; | 372 | inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; |
| @@ -402,7 +398,6 @@ lookup_protocol: | |||
| 402 | inet->mc_all = 1; | 398 | inet->mc_all = 1; |
| 403 | inet->mc_index = 0; | 399 | inet->mc_index = 0; |
| 404 | inet->mc_list = NULL; | 400 | inet->mc_list = NULL; |
| 405 | inet->rcv_tos = 0; | ||
| 406 | 401 | ||
| 407 | sk_refcnt_debug_inc(sk); | 402 | sk_refcnt_debug_inc(sk); |
| 408 | 403 | ||
| @@ -474,7 +469,6 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
| 474 | struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; | 469 | struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; |
| 475 | struct sock *sk = sock->sk; | 470 | struct sock *sk = sock->sk; |
| 476 | struct inet_sock *inet = inet_sk(sk); | 471 | struct inet_sock *inet = inet_sk(sk); |
| 477 | struct net *net = sock_net(sk); | ||
| 478 | unsigned short snum; | 472 | unsigned short snum; |
| 479 | int chk_addr_ret; | 473 | int chk_addr_ret; |
| 480 | int err; | 474 | int err; |
| @@ -498,7 +492,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
| 498 | goto out; | 492 | goto out; |
| 499 | } | 493 | } |
| 500 | 494 | ||
| 501 | chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); | 495 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); |
| 502 | 496 | ||
| 503 | /* Not specified by any standard per-se, however it breaks too | 497 | /* Not specified by any standard per-se, however it breaks too |
| 504 | * many applications when removed. It is unfortunate since | 498 | * many applications when removed. It is unfortunate since |
| @@ -518,8 +512,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
| 518 | 512 | ||
| 519 | snum = ntohs(addr->sin_port); | 513 | snum = ntohs(addr->sin_port); |
| 520 | err = -EACCES; | 514 | err = -EACCES; |
| 521 | if (snum && snum < PROT_SOCK && | 515 | if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) |
| 522 | !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) | ||
| 523 | goto out; | 516 | goto out; |
| 524 | 517 | ||
| 525 | /* We keep a pair of addresses. rcv_saddr is the one | 518 | /* We keep a pair of addresses. rcv_saddr is the one |
| @@ -563,7 +556,7 @@ out: | |||
| 563 | } | 556 | } |
| 564 | EXPORT_SYMBOL(inet_bind); | 557 | EXPORT_SYMBOL(inet_bind); |
| 565 | 558 | ||
| 566 | int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, | 559 | int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, |
| 567 | int addr_len, int flags) | 560 | int addr_len, int flags) |
| 568 | { | 561 | { |
| 569 | struct sock *sk = sock->sk; | 562 | struct sock *sk = sock->sk; |
| @@ -575,16 +568,15 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, | |||
| 575 | 568 | ||
| 576 | if (!inet_sk(sk)->inet_num && inet_autobind(sk)) | 569 | if (!inet_sk(sk)->inet_num && inet_autobind(sk)) |
| 577 | return -EAGAIN; | 570 | return -EAGAIN; |
| 578 | return sk->sk_prot->connect(sk, uaddr, addr_len); | 571 | return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); |
| 579 | } | 572 | } |
| 580 | EXPORT_SYMBOL(inet_dgram_connect); | 573 | EXPORT_SYMBOL(inet_dgram_connect); |
| 581 | 574 | ||
| 582 | static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) | 575 | static long inet_wait_for_connect(struct sock *sk, long timeo) |
| 583 | { | 576 | { |
| 584 | DEFINE_WAIT(wait); | 577 | DEFINE_WAIT(wait); |
| 585 | 578 | ||
| 586 | prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); | 579 | prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); |
| 587 | sk->sk_write_pending += writebias; | ||
| 588 | 580 | ||
| 589 | /* Basic assumption: if someone sets sk->sk_err, he _must_ | 581 | /* Basic assumption: if someone sets sk->sk_err, he _must_ |
| 590 | * change state of the socket from TCP_SYN_*. | 582 | * change state of the socket from TCP_SYN_*. |
| @@ -600,7 +592,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) | |||
| 600 | prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); | 592 | prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); |
| 601 | } | 593 | } |
| 602 | finish_wait(sk_sleep(sk), &wait); | 594 | finish_wait(sk_sleep(sk), &wait); |
| 603 | sk->sk_write_pending -= writebias; | ||
| 604 | return timeo; | 595 | return timeo; |
| 605 | } | 596 | } |
| 606 | 597 | ||
| @@ -608,8 +599,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) | |||
| 608 | * Connect to a remote host. There is regrettably still a little | 599 | * Connect to a remote host. There is regrettably still a little |
| 609 | * TCP 'magic' in here. | 600 | * TCP 'magic' in here. |
| 610 | */ | 601 | */ |
| 611 | int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | 602 | int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, |
| 612 | int addr_len, int flags) | 603 | int addr_len, int flags) |
| 613 | { | 604 | { |
| 614 | struct sock *sk = sock->sk; | 605 | struct sock *sk = sock->sk; |
| 615 | int err; | 606 | int err; |
| @@ -618,6 +609,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | |||
| 618 | if (addr_len < sizeof(uaddr->sa_family)) | 609 | if (addr_len < sizeof(uaddr->sa_family)) |
| 619 | return -EINVAL; | 610 | return -EINVAL; |
| 620 | 611 | ||
| 612 | lock_sock(sk); | ||
| 613 | |||
| 621 | if (uaddr->sa_family == AF_UNSPEC) { | 614 | if (uaddr->sa_family == AF_UNSPEC) { |
| 622 | err = sk->sk_prot->disconnect(sk, flags); | 615 | err = sk->sk_prot->disconnect(sk, flags); |
| 623 | sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; | 616 | sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; |
| @@ -657,12 +650,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | |||
| 657 | timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); | 650 | timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); |
| 658 | 651 | ||
| 659 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 652 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
| 660 | int writebias = (sk->sk_protocol == IPPROTO_TCP) && | ||
| 661 | tcp_sk(sk)->fastopen_req && | ||
| 662 | tcp_sk(sk)->fastopen_req->data ? 1 : 0; | ||
| 663 | |||
| 664 | /* Error code is set above */ | 653 | /* Error code is set above */ |
| 665 | if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) | 654 | if (!timeo || !inet_wait_for_connect(sk, timeo)) |
| 666 | goto out; | 655 | goto out; |
| 667 | 656 | ||
| 668 | err = sock_intr_errno(timeo); | 657 | err = sock_intr_errno(timeo); |
| @@ -684,6 +673,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | |||
| 684 | sock->state = SS_CONNECTED; | 673 | sock->state = SS_CONNECTED; |
| 685 | err = 0; | 674 | err = 0; |
| 686 | out: | 675 | out: |
| 676 | release_sock(sk); | ||
| 687 | return err; | 677 | return err; |
| 688 | 678 | ||
| 689 | sock_error: | 679 | sock_error: |
| @@ -693,18 +683,6 @@ sock_error: | |||
| 693 | sock->state = SS_DISCONNECTING; | 683 | sock->state = SS_DISCONNECTING; |
| 694 | goto out; | 684 | goto out; |
| 695 | } | 685 | } |
| 696 | EXPORT_SYMBOL(__inet_stream_connect); | ||
| 697 | |||
| 698 | int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, | ||
| 699 | int addr_len, int flags) | ||
| 700 | { | ||
| 701 | int err; | ||
| 702 | |||
| 703 | lock_sock(sock->sk); | ||
| 704 | err = __inet_stream_connect(sock, uaddr, addr_len, flags); | ||
| 705 | release_sock(sock->sk); | ||
| 706 | return err; | ||
| 707 | } | ||
| 708 | EXPORT_SYMBOL(inet_stream_connect); | 686 | EXPORT_SYMBOL(inet_stream_connect); |
| 709 | 687 | ||
| 710 | /* | 688 | /* |
| @@ -724,8 +702,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) | |||
| 724 | 702 | ||
| 725 | sock_rps_record_flow(sk2); | 703 | sock_rps_record_flow(sk2); |
| 726 | WARN_ON(!((1 << sk2->sk_state) & | 704 | WARN_ON(!((1 << sk2->sk_state) & |
| 727 | (TCPF_ESTABLISHED | TCPF_SYN_RECV | | 705 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); |
| 728 | TCPF_CLOSE_WAIT | TCPF_CLOSE))); | ||
| 729 | 706 | ||
| 730 | sock_graft(sk2, newsock); | 707 | sock_graft(sk2, newsock); |
| 731 | 708 | ||
| @@ -919,6 +896,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) | |||
| 919 | case SIOCSIFPFLAGS: | 896 | case SIOCSIFPFLAGS: |
| 920 | case SIOCGIFPFLAGS: | 897 | case SIOCGIFPFLAGS: |
| 921 | case SIOCSIFFLAGS: | 898 | case SIOCSIFFLAGS: |
| 899 | case SIOCKILLADDR: | ||
| 922 | err = devinet_ioctl(net, cmd, (void __user *)arg); | 900 | err = devinet_ioctl(net, cmd, (void __user *)arg); |
| 923 | break; | 901 | break; |
| 924 | default: | 902 | default: |
| @@ -933,7 +911,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) | |||
| 933 | EXPORT_SYMBOL(inet_ioctl); | 911 | EXPORT_SYMBOL(inet_ioctl); |
| 934 | 912 | ||
| 935 | #ifdef CONFIG_COMPAT | 913 | #ifdef CONFIG_COMPAT |
| 936 | static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) | 914 | int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) |
| 937 | { | 915 | { |
| 938 | struct sock *sk = sock->sk; | 916 | struct sock *sk = sock->sk; |
| 939 | int err = -ENOIOCTLCMD; | 917 | int err = -ENOIOCTLCMD; |
| @@ -1124,11 +1102,13 @@ out: | |||
| 1124 | return; | 1102 | return; |
| 1125 | 1103 | ||
| 1126 | out_permanent: | 1104 | out_permanent: |
| 1127 | pr_err("Attempt to override permanent protocol %d\n", protocol); | 1105 | printk(KERN_ERR "Attempt to override permanent protocol %d.\n", |
| 1106 | protocol); | ||
| 1128 | goto out; | 1107 | goto out; |
| 1129 | 1108 | ||
| 1130 | out_illegal: | 1109 | out_illegal: |
| 1131 | pr_err("Ignoring attempt to register invalid socket type %d\n", | 1110 | printk(KERN_ERR |
| 1111 | "Ignoring attempt to register invalid socket type %d.\n", | ||
| 1132 | p->type); | 1112 | p->type); |
| 1133 | goto out; | 1113 | goto out; |
| 1134 | } | 1114 | } |
| @@ -1137,7 +1117,8 @@ EXPORT_SYMBOL(inet_register_protosw); | |||
| 1137 | void inet_unregister_protosw(struct inet_protosw *p) | 1117 | void inet_unregister_protosw(struct inet_protosw *p) |
| 1138 | { | 1118 | { |
| 1139 | if (INET_PROTOSW_PERMANENT & p->flags) { | 1119 | if (INET_PROTOSW_PERMANENT & p->flags) { |
| 1140 | pr_err("Attempt to unregister permanent protocol %d\n", | 1120 | printk(KERN_ERR |
| 1121 | "Attempt to unregister permanent protocol %d.\n", | ||
| 1141 | p->protocol); | 1122 | p->protocol); |
| 1142 | } else { | 1123 | } else { |
| 1143 | spin_lock_bh(&inetsw_lock); | 1124 | spin_lock_bh(&inetsw_lock); |
| @@ -1186,8 +1167,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) | |||
| 1186 | return 0; | 1167 | return 0; |
| 1187 | 1168 | ||
| 1188 | if (sysctl_ip_dynaddr > 1) { | 1169 | if (sysctl_ip_dynaddr > 1) { |
| 1189 | pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", | 1170 | printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n", |
| 1190 | __func__, &old_saddr, &new_saddr); | 1171 | __func__, &old_saddr, &new_saddr); |
| 1191 | } | 1172 | } |
| 1192 | 1173 | ||
| 1193 | inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; | 1174 | inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; |
| @@ -1254,8 +1235,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); | |||
| 1254 | 1235 | ||
| 1255 | static int inet_gso_send_check(struct sk_buff *skb) | 1236 | static int inet_gso_send_check(struct sk_buff *skb) |
| 1256 | { | 1237 | { |
| 1257 | const struct net_offload *ops; | ||
| 1258 | const struct iphdr *iph; | 1238 | const struct iphdr *iph; |
| 1239 | const struct net_protocol *ops; | ||
| 1259 | int proto; | 1240 | int proto; |
| 1260 | int ihl; | 1241 | int ihl; |
| 1261 | int err = -EINVAL; | 1242 | int err = -EINVAL; |
| @@ -1274,25 +1255,24 @@ static int inet_gso_send_check(struct sk_buff *skb) | |||
| 1274 | __skb_pull(skb, ihl); | 1255 | __skb_pull(skb, ihl); |
| 1275 | skb_reset_transport_header(skb); | 1256 | skb_reset_transport_header(skb); |
| 1276 | iph = ip_hdr(skb); | 1257 | iph = ip_hdr(skb); |
| 1277 | proto = iph->protocol; | 1258 | proto = iph->protocol & (MAX_INET_PROTOS - 1); |
| 1278 | err = -EPROTONOSUPPORT; | 1259 | err = -EPROTONOSUPPORT; |
| 1279 | 1260 | ||
| 1280 | rcu_read_lock(); | 1261 | rcu_read_lock(); |
| 1281 | ops = rcu_dereference(inet_offloads[proto]); | 1262 | ops = rcu_dereference(inet_protos[proto]); |
| 1282 | if (likely(ops && ops->callbacks.gso_send_check)) | 1263 | if (likely(ops && ops->gso_send_check)) |
| 1283 | err = ops->callbacks.gso_send_check(skb); | 1264 | err = ops->gso_send_check(skb); |
| 1284 | rcu_read_unlock(); | 1265 | rcu_read_unlock(); |
| 1285 | 1266 | ||
| 1286 | out: | 1267 | out: |
| 1287 | return err; | 1268 | return err; |
| 1288 | } | 1269 | } |
| 1289 | 1270 | ||
| 1290 | static struct sk_buff *inet_gso_segment(struct sk_buff *skb, | 1271 | static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) |
| 1291 | netdev_features_t features) | ||
| 1292 | { | 1272 | { |
| 1293 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 1273 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
| 1294 | const struct net_offload *ops; | ||
| 1295 | struct iphdr *iph; | 1274 | struct iphdr *iph; |
| 1275 | const struct net_protocol *ops; | ||
| 1296 | int proto; | 1276 | int proto; |
| 1297 | int ihl; | 1277 | int ihl; |
| 1298 | int id; | 1278 | int id; |
| @@ -1324,13 +1304,13 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, | |||
| 1324 | skb_reset_transport_header(skb); | 1304 | skb_reset_transport_header(skb); |
| 1325 | iph = ip_hdr(skb); | 1305 | iph = ip_hdr(skb); |
| 1326 | id = ntohs(iph->id); | 1306 | id = ntohs(iph->id); |
| 1327 | proto = iph->protocol; | 1307 | proto = iph->protocol & (MAX_INET_PROTOS - 1); |
| 1328 | segs = ERR_PTR(-EPROTONOSUPPORT); | 1308 | segs = ERR_PTR(-EPROTONOSUPPORT); |
| 1329 | 1309 | ||
| 1330 | rcu_read_lock(); | 1310 | rcu_read_lock(); |
| 1331 | ops = rcu_dereference(inet_offloads[proto]); | 1311 | ops = rcu_dereference(inet_protos[proto]); |
| 1332 | if (likely(ops && ops->callbacks.gso_segment)) | 1312 | if (likely(ops && ops->gso_segment)) |
| 1333 | segs = ops->callbacks.gso_segment(skb, features); | 1313 | segs = ops->gso_segment(skb, features); |
| 1334 | rcu_read_unlock(); | 1314 | rcu_read_unlock(); |
| 1335 | 1315 | ||
| 1336 | if (!segs || IS_ERR(segs)) | 1316 | if (!segs || IS_ERR(segs)) |
| @@ -1359,7 +1339,7 @@ out: | |||
| 1359 | static struct sk_buff **inet_gro_receive(struct sk_buff **head, | 1339 | static struct sk_buff **inet_gro_receive(struct sk_buff **head, |
| 1360 | struct sk_buff *skb) | 1340 | struct sk_buff *skb) |
| 1361 | { | 1341 | { |
| 1362 | const struct net_offload *ops; | 1342 | const struct net_protocol *ops; |
| 1363 | struct sk_buff **pp = NULL; | 1343 | struct sk_buff **pp = NULL; |
| 1364 | struct sk_buff *p; | 1344 | struct sk_buff *p; |
| 1365 | const struct iphdr *iph; | 1345 | const struct iphdr *iph; |
| @@ -1378,17 +1358,17 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, | |||
| 1378 | goto out; | 1358 | goto out; |
| 1379 | } | 1359 | } |
| 1380 | 1360 | ||
| 1381 | proto = iph->protocol; | 1361 | proto = iph->protocol & (MAX_INET_PROTOS - 1); |
| 1382 | 1362 | ||
| 1383 | rcu_read_lock(); | 1363 | rcu_read_lock(); |
| 1384 | ops = rcu_dereference(inet_offloads[proto]); | 1364 | ops = rcu_dereference(inet_protos[proto]); |
| 1385 | if (!ops || !ops->callbacks.gro_receive) | 1365 | if (!ops || !ops->gro_receive) |
| 1386 | goto out_unlock; | 1366 | goto out_unlock; |
| 1387 | 1367 | ||
| 1388 | if (*(u8 *)iph != 0x45) | 1368 | if (*(u8 *)iph != 0x45) |
| 1389 | goto out_unlock; | 1369 | goto out_unlock; |
| 1390 | 1370 | ||
| 1391 | if (unlikely(ip_fast_csum((u8 *)iph, 5))) | 1371 | if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) |
| 1392 | goto out_unlock; | 1372 | goto out_unlock; |
| 1393 | 1373 | ||
| 1394 | id = ntohl(*(__be32 *)&iph->id); | 1374 | id = ntohl(*(__be32 *)&iph->id); |
| @@ -1404,6 +1384,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, | |||
| 1404 | iph2 = ip_hdr(p); | 1384 | iph2 = ip_hdr(p); |
| 1405 | 1385 | ||
| 1406 | if ((iph->protocol ^ iph2->protocol) | | 1386 | if ((iph->protocol ^ iph2->protocol) | |
| 1387 | (iph->tos ^ iph2->tos) | | ||
| 1407 | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | | 1388 | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | |
| 1408 | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { | 1389 | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { |
| 1409 | NAPI_GRO_CB(p)->same_flow = 0; | 1390 | NAPI_GRO_CB(p)->same_flow = 0; |
| @@ -1413,7 +1394,6 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, | |||
| 1413 | /* All fields must match except length and checksum. */ | 1394 | /* All fields must match except length and checksum. */ |
| 1414 | NAPI_GRO_CB(p)->flush |= | 1395 | NAPI_GRO_CB(p)->flush |= |
| 1415 | (iph->ttl ^ iph2->ttl) | | 1396 | (iph->ttl ^ iph2->ttl) | |
| 1416 | (iph->tos ^ iph2->tos) | | ||
| 1417 | ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); | 1397 | ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); |
| 1418 | 1398 | ||
| 1419 | NAPI_GRO_CB(p)->flush |= flush; | 1399 | NAPI_GRO_CB(p)->flush |= flush; |
| @@ -1423,7 +1403,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, | |||
| 1423 | skb_gro_pull(skb, sizeof(*iph)); | 1403 | skb_gro_pull(skb, sizeof(*iph)); |
| 1424 | skb_set_transport_header(skb, skb_gro_offset(skb)); | 1404 | skb_set_transport_header(skb, skb_gro_offset(skb)); |
| 1425 | 1405 | ||
| 1426 | pp = ops->callbacks.gro_receive(head, skb); | 1406 | pp = ops->gro_receive(head, skb); |
| 1427 | 1407 | ||
| 1428 | out_unlock: | 1408 | out_unlock: |
| 1429 | rcu_read_unlock(); | 1409 | rcu_read_unlock(); |
| @@ -1436,21 +1416,21 @@ out: | |||
| 1436 | 1416 | ||
| 1437 | static int inet_gro_complete(struct sk_buff *skb) | 1417 | static int inet_gro_complete(struct sk_buff *skb) |
| 1438 | { | 1418 | { |
| 1439 | __be16 newlen = htons(skb->len - skb_network_offset(skb)); | 1419 | const struct net_protocol *ops; |
| 1440 | struct iphdr *iph = ip_hdr(skb); | 1420 | struct iphdr *iph = ip_hdr(skb); |
| 1441 | const struct net_offload *ops; | 1421 | int proto = iph->protocol & (MAX_INET_PROTOS - 1); |
| 1442 | int proto = iph->protocol; | ||
| 1443 | int err = -ENOSYS; | 1422 | int err = -ENOSYS; |
| 1423 | __be16 newlen = htons(skb->len - skb_network_offset(skb)); | ||
| 1444 | 1424 | ||
| 1445 | csum_replace2(&iph->check, iph->tot_len, newlen); | 1425 | csum_replace2(&iph->check, iph->tot_len, newlen); |
| 1446 | iph->tot_len = newlen; | 1426 | iph->tot_len = newlen; |
| 1447 | 1427 | ||
| 1448 | rcu_read_lock(); | 1428 | rcu_read_lock(); |
| 1449 | ops = rcu_dereference(inet_offloads[proto]); | 1429 | ops = rcu_dereference(inet_protos[proto]); |
| 1450 | if (WARN_ON(!ops || !ops->callbacks.gro_complete)) | 1430 | if (WARN_ON(!ops || !ops->gro_complete)) |
| 1451 | goto out_unlock; | 1431 | goto out_unlock; |
| 1452 | 1432 | ||
| 1453 | err = ops->callbacks.gro_complete(skb); | 1433 | err = ops->gro_complete(skb); |
| 1454 | 1434 | ||
| 1455 | out_unlock: | 1435 | out_unlock: |
| 1456 | rcu_read_unlock(); | 1436 | rcu_read_unlock(); |
| @@ -1558,36 +1538,25 @@ static const struct net_protocol igmp_protocol = { | |||
| 1558 | #endif | 1538 | #endif |
| 1559 | 1539 | ||
| 1560 | static const struct net_protocol tcp_protocol = { | 1540 | static const struct net_protocol tcp_protocol = { |
| 1561 | .early_demux = tcp_v4_early_demux, | 1541 | .handler = tcp_v4_rcv, |
| 1562 | .handler = tcp_v4_rcv, | 1542 | .err_handler = tcp_v4_err, |
| 1563 | .err_handler = tcp_v4_err, | 1543 | .gso_send_check = tcp_v4_gso_send_check, |
| 1564 | .no_policy = 1, | 1544 | .gso_segment = tcp_tso_segment, |
| 1565 | .netns_ok = 1, | 1545 | .gro_receive = tcp4_gro_receive, |
| 1566 | }; | 1546 | .gro_complete = tcp4_gro_complete, |
| 1567 | 1547 | .no_policy = 1, | |
| 1568 | static const struct net_offload tcp_offload = { | 1548 | .netns_ok = 1, |
| 1569 | .callbacks = { | ||
| 1570 | .gso_send_check = tcp_v4_gso_send_check, | ||
| 1571 | .gso_segment = tcp_tso_segment, | ||
| 1572 | .gro_receive = tcp4_gro_receive, | ||
| 1573 | .gro_complete = tcp4_gro_complete, | ||
| 1574 | }, | ||
| 1575 | }; | 1549 | }; |
| 1576 | 1550 | ||
| 1577 | static const struct net_protocol udp_protocol = { | 1551 | static const struct net_protocol udp_protocol = { |
| 1578 | .handler = udp_rcv, | 1552 | .handler = udp_rcv, |
| 1579 | .err_handler = udp_err, | 1553 | .err_handler = udp_err, |
| 1554 | .gso_send_check = udp4_ufo_send_check, | ||
| 1555 | .gso_segment = udp4_ufo_fragment, | ||
| 1580 | .no_policy = 1, | 1556 | .no_policy = 1, |
| 1581 | .netns_ok = 1, | 1557 | .netns_ok = 1, |
| 1582 | }; | 1558 | }; |
| 1583 | 1559 | ||
| 1584 | static const struct net_offload udp_offload = { | ||
| 1585 | .callbacks = { | ||
| 1586 | .gso_send_check = udp4_ufo_send_check, | ||
| 1587 | .gso_segment = udp4_ufo_fragment, | ||
| 1588 | }, | ||
| 1589 | }; | ||
| 1590 | |||
| 1591 | static const struct net_protocol icmp_protocol = { | 1560 | static const struct net_protocol icmp_protocol = { |
| 1592 | .handler = icmp_rcv, | 1561 | .handler = icmp_rcv, |
| 1593 | .err_handler = ping_err, | 1562 | .err_handler = ping_err, |
| @@ -1621,9 +1590,9 @@ static __net_init int ipv4_mib_init_net(struct net *net) | |||
| 1621 | sizeof(struct icmp_mib), | 1590 | sizeof(struct icmp_mib), |
| 1622 | __alignof__(struct icmp_mib)) < 0) | 1591 | __alignof__(struct icmp_mib)) < 0) |
| 1623 | goto err_icmp_mib; | 1592 | goto err_icmp_mib; |
| 1624 | net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), | 1593 | if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics, |
| 1625 | GFP_KERNEL); | 1594 | sizeof(struct icmpmsg_mib), |
| 1626 | if (!net->mib.icmpmsg_statistics) | 1595 | __alignof__(struct icmpmsg_mib)) < 0) |
| 1627 | goto err_icmpmsg_mib; | 1596 | goto err_icmpmsg_mib; |
| 1628 | 1597 | ||
| 1629 | tcp_mib_init(net); | 1598 | tcp_mib_init(net); |
| @@ -1647,7 +1616,7 @@ err_tcp_mib: | |||
| 1647 | 1616 | ||
| 1648 | static __net_exit void ipv4_mib_exit_net(struct net *net) | 1617 | static __net_exit void ipv4_mib_exit_net(struct net *net) |
| 1649 | { | 1618 | { |
| 1650 | kfree(net->mib.icmpmsg_statistics); | 1619 | snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics); |
| 1651 | snmp_mib_free((void __percpu **)net->mib.icmp_statistics); | 1620 | snmp_mib_free((void __percpu **)net->mib.icmp_statistics); |
| 1652 | snmp_mib_free((void __percpu **)net->mib.udplite_statistics); | 1621 | snmp_mib_free((void __percpu **)net->mib.udplite_statistics); |
| 1653 | snmp_mib_free((void __percpu **)net->mib.udp_statistics); | 1622 | snmp_mib_free((void __percpu **)net->mib.udp_statistics); |
| @@ -1672,35 +1641,13 @@ static int ipv4_proc_init(void); | |||
| 1672 | * IP protocol layer initialiser | 1641 | * IP protocol layer initialiser |
| 1673 | */ | 1642 | */ |
| 1674 | 1643 | ||
| 1675 | static struct packet_offload ip_packet_offload __read_mostly = { | ||
| 1676 | .type = cpu_to_be16(ETH_P_IP), | ||
| 1677 | .callbacks = { | ||
| 1678 | .gso_send_check = inet_gso_send_check, | ||
| 1679 | .gso_segment = inet_gso_segment, | ||
| 1680 | .gro_receive = inet_gro_receive, | ||
| 1681 | .gro_complete = inet_gro_complete, | ||
| 1682 | }, | ||
| 1683 | }; | ||
| 1684 | |||
| 1685 | static int __init ipv4_offload_init(void) | ||
| 1686 | { | ||
| 1687 | /* | ||
| 1688 | * Add offloads | ||
| 1689 | */ | ||
| 1690 | if (inet_add_offload(&udp_offload, IPPROTO_UDP) < 0) | ||
| 1691 | pr_crit("%s: Cannot add UDP protocol offload\n", __func__); | ||
| 1692 | if (inet_add_offload(&tcp_offload, IPPROTO_TCP) < 0) | ||
| 1693 | pr_crit("%s: Cannot add TCP protocol offlaod\n", __func__); | ||
| 1694 | |||
| 1695 | dev_add_offload(&ip_packet_offload); | ||
| 1696 | return 0; | ||
| 1697 | } | ||
| 1698 | |||
| 1699 | fs_initcall(ipv4_offload_init); | ||
| 1700 | |||
| 1701 | static struct packet_type ip_packet_type __read_mostly = { | 1644 | static struct packet_type ip_packet_type __read_mostly = { |
| 1702 | .type = cpu_to_be16(ETH_P_IP), | 1645 | .type = cpu_to_be16(ETH_P_IP), |
| 1703 | .func = ip_rcv, | 1646 | .func = ip_rcv, |
| 1647 | .gso_send_check = inet_gso_send_check, | ||
| 1648 | .gso_segment = inet_gso_segment, | ||
| 1649 | .gro_receive = inet_gro_receive, | ||
| 1650 | .gro_complete = inet_gro_complete, | ||
| 1704 | }; | 1651 | }; |
| 1705 | 1652 | ||
| 1706 | static int __init inet_init(void) | 1653 | static int __init inet_init(void) |
| @@ -1742,21 +1689,19 @@ static int __init inet_init(void) | |||
| 1742 | ip_static_sysctl_init(); | 1689 | ip_static_sysctl_init(); |
| 1743 | #endif | 1690 | #endif |
| 1744 | 1691 | ||
| 1745 | tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem; | ||
| 1746 | |||
| 1747 | /* | 1692 | /* |
| 1748 | * Add all the base protocols. | 1693 | * Add all the base protocols. |
| 1749 | */ | 1694 | */ |
| 1750 | 1695 | ||
| 1751 | if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) | 1696 | if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) |
| 1752 | pr_crit("%s: Cannot add ICMP protocol\n", __func__); | 1697 | printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); |
| 1753 | if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) | 1698 | if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) |
| 1754 | pr_crit("%s: Cannot add UDP protocol\n", __func__); | 1699 | printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); |
| 1755 | if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) | 1700 | if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) |
| 1756 | pr_crit("%s: Cannot add TCP protocol\n", __func__); | 1701 | printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); |
| 1757 | #ifdef CONFIG_IP_MULTICAST | 1702 | #ifdef CONFIG_IP_MULTICAST |
| 1758 | if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) | 1703 | if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) |
| 1759 | pr_crit("%s: Cannot add IGMP protocol\n", __func__); | 1704 | printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); |
| 1760 | #endif | 1705 | #endif |
| 1761 | 1706 | ||
| 1762 | /* Register the socket-side information for inet_create. */ | 1707 | /* Register the socket-side information for inet_create. */ |
| @@ -1803,14 +1748,14 @@ static int __init inet_init(void) | |||
| 1803 | */ | 1748 | */ |
| 1804 | #if defined(CONFIG_IP_MROUTE) | 1749 | #if defined(CONFIG_IP_MROUTE) |
| 1805 | if (ip_mr_init()) | 1750 | if (ip_mr_init()) |
| 1806 | pr_crit("%s: Cannot init ipv4 mroute\n", __func__); | 1751 | printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n"); |
| 1807 | #endif | 1752 | #endif |
| 1808 | /* | 1753 | /* |
| 1809 | * Initialise per-cpu ipv4 mibs | 1754 | * Initialise per-cpu ipv4 mibs |
| 1810 | */ | 1755 | */ |
| 1811 | 1756 | ||
| 1812 | if (init_ipv4_mibs()) | 1757 | if (init_ipv4_mibs()) |
| 1813 | pr_crit("%s: Cannot init ipv4 mibs\n", __func__); | 1758 | printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); |
| 1814 | 1759 | ||
| 1815 | ipv4_proc_init(); | 1760 | ipv4_proc_init(); |
| 1816 | 1761 | ||
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a0d8392491c..36d14406261 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c | |||
| @@ -1,5 +1,3 @@ | |||
| 1 | #define pr_fmt(fmt) "IPsec: " fmt | ||
| 2 | |||
| 3 | #include <crypto/hash.h> | 1 | #include <crypto/hash.h> |
| 4 | #include <linux/err.h> | 2 | #include <linux/err.h> |
| 5 | #include <linux/module.h> | 3 | #include <linux/module.h> |
| @@ -77,7 +75,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, | |||
| 77 | 75 | ||
| 78 | static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) | 76 | static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) |
| 79 | { | 77 | { |
| 80 | unsigned char *optptr = (unsigned char *)(iph+1); | 78 | unsigned char * optptr = (unsigned char*)(iph+1); |
| 81 | int l = iph->ihl*4 - sizeof(struct iphdr); | 79 | int l = iph->ihl*4 - sizeof(struct iphdr); |
| 82 | int optlen; | 80 | int optlen; |
| 83 | 81 | ||
| @@ -398,25 +396,16 @@ static void ah4_err(struct sk_buff *skb, u32 info) | |||
| 398 | struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); | 396 | struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); |
| 399 | struct xfrm_state *x; | 397 | struct xfrm_state *x; |
| 400 | 398 | ||
| 401 | switch (icmp_hdr(skb)->type) { | 399 | if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || |
| 402 | case ICMP_DEST_UNREACH: | 400 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) |
| 403 | if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | ||
| 404 | return; | ||
| 405 | case ICMP_REDIRECT: | ||
| 406 | break; | ||
| 407 | default: | ||
| 408 | return; | 401 | return; |
| 409 | } | ||
| 410 | 402 | ||
| 411 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, | 403 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
| 412 | ah->spi, IPPROTO_AH, AF_INET); | 404 | ah->spi, IPPROTO_AH, AF_INET); |
| 413 | if (!x) | 405 | if (!x) |
| 414 | return; | 406 | return; |
| 415 | 407 | printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", | |
| 416 | if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) | 408 | ntohl(ah->spi), ntohl(iph->daddr)); |
| 417 | ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); | ||
| 418 | else | ||
| 419 | ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); | ||
| 420 | xfrm_state_put(x); | 409 | xfrm_state_put(x); |
| 421 | } | 410 | } |
| 422 | 411 | ||
| @@ -456,10 +445,9 @@ static int ah_init_state(struct xfrm_state *x) | |||
| 456 | 445 | ||
| 457 | if (aalg_desc->uinfo.auth.icv_fullbits/8 != | 446 | if (aalg_desc->uinfo.auth.icv_fullbits/8 != |
| 458 | crypto_ahash_digestsize(ahash)) { | 447 | crypto_ahash_digestsize(ahash)) { |
| 459 | pr_info("%s: %s digestsize %u != %hu\n", | 448 | printk(KERN_INFO "AH: %s digestsize %u != %hu\n", |
| 460 | __func__, x->aalg->alg_name, | 449 | x->aalg->alg_name, crypto_ahash_digestsize(ahash), |
| 461 | crypto_ahash_digestsize(ahash), | 450 | aalg_desc->uinfo.auth.icv_fullbits/8); |
| 462 | aalg_desc->uinfo.auth.icv_fullbits / 8); | ||
| 463 | goto error; | 451 | goto error; |
| 464 | } | 452 | } |
| 465 | 453 | ||
| @@ -522,11 +510,11 @@ static const struct net_protocol ah4_protocol = { | |||
| 522 | static int __init ah4_init(void) | 510 | static int __init ah4_init(void) |
| 523 | { | 511 | { |
| 524 | if (xfrm_register_type(&ah_type, AF_INET) < 0) { | 512 | if (xfrm_register_type(&ah_type, AF_INET) < 0) { |
| 525 | pr_info("%s: can't add xfrm type\n", __func__); | 513 | printk(KERN_INFO "ip ah init: can't add xfrm type\n"); |
| 526 | return -EAGAIN; | 514 | return -EAGAIN; |
| 527 | } | 515 | } |
| 528 | if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { | 516 | if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { |
| 529 | pr_info("%s: can't add protocol\n", __func__); | 517 | printk(KERN_INFO "ip ah init: can't add protocol\n"); |
| 530 | xfrm_unregister_type(&ah_type, AF_INET); | 518 | xfrm_unregister_type(&ah_type, AF_INET); |
| 531 | return -EAGAIN; | 519 | return -EAGAIN; |
| 532 | } | 520 | } |
| @@ -536,9 +524,9 @@ static int __init ah4_init(void) | |||
| 536 | static void __exit ah4_fini(void) | 524 | static void __exit ah4_fini(void) |
| 537 | { | 525 | { |
| 538 | if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) | 526 | if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) |
| 539 | pr_info("%s: can't remove protocol\n", __func__); | 527 | printk(KERN_INFO "ip ah close: can't remove protocol\n"); |
| 540 | if (xfrm_unregister_type(&ah_type, AF_INET) < 0) | 528 | if (xfrm_unregister_type(&ah_type, AF_INET) < 0) |
| 541 | pr_info("%s: can't remove xfrm type\n", __func__); | 529 | printk(KERN_INFO "ip ah close: can't remove xfrm type\n"); |
| 542 | } | 530 | } |
| 543 | 531 | ||
| 544 | module_init(ah4_init); | 532 | module_init(ah4_init); |
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9547a273b9e..96a164aa136 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c | |||
| @@ -73,8 +73,6 @@ | |||
| 73 | * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. | 73 | * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. |
| 74 | */ | 74 | */ |
| 75 | 75 | ||
| 76 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 77 | |||
| 78 | #include <linux/module.h> | 76 | #include <linux/module.h> |
| 79 | #include <linux/types.h> | 77 | #include <linux/types.h> |
| 80 | #include <linux/string.h> | 78 | #include <linux/string.h> |
| @@ -91,6 +89,7 @@ | |||
| 91 | #include <linux/etherdevice.h> | 89 | #include <linux/etherdevice.h> |
| 92 | #include <linux/fddidevice.h> | 90 | #include <linux/fddidevice.h> |
| 93 | #include <linux/if_arp.h> | 91 | #include <linux/if_arp.h> |
| 92 | #include <linux/trdevice.h> | ||
| 94 | #include <linux/skbuff.h> | 93 | #include <linux/skbuff.h> |
| 95 | #include <linux/proc_fs.h> | 94 | #include <linux/proc_fs.h> |
| 96 | #include <linux/seq_file.h> | 95 | #include <linux/seq_file.h> |
| @@ -113,7 +112,13 @@ | |||
| 113 | #include <net/arp.h> | 112 | #include <net/arp.h> |
| 114 | #include <net/ax25.h> | 113 | #include <net/ax25.h> |
| 115 | #include <net/netrom.h> | 114 | #include <net/netrom.h> |
| 115 | #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) | ||
| 116 | #include <net/atmclip.h> | ||
| 117 | struct neigh_table *clip_tbl_hook; | ||
| 118 | EXPORT_SYMBOL(clip_tbl_hook); | ||
| 119 | #endif | ||
| 116 | 120 | ||
| 121 | #include <asm/system.h> | ||
| 117 | #include <linux/uaccess.h> | 122 | #include <linux/uaccess.h> |
| 118 | 123 | ||
| 119 | #include <linux/netfilter_arp.h> | 124 | #include <linux/netfilter_arp.h> |
| @@ -121,7 +126,7 @@ | |||
| 121 | /* | 126 | /* |
| 122 | * Interface to generic neighbour cache. | 127 | * Interface to generic neighbour cache. |
| 123 | */ | 128 | */ |
| 124 | static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); | 129 | static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd); |
| 125 | static int arp_constructor(struct neighbour *neigh); | 130 | static int arp_constructor(struct neighbour *neigh); |
| 126 | static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); | 131 | static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); |
| 127 | static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); | 132 | static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); |
| @@ -159,6 +164,7 @@ static const struct neigh_ops arp_broken_ops = { | |||
| 159 | 164 | ||
| 160 | struct neigh_table arp_tbl = { | 165 | struct neigh_table arp_tbl = { |
| 161 | .family = AF_INET, | 166 | .family = AF_INET, |
| 167 | .entry_size = sizeof(struct neighbour) + 4, | ||
| 162 | .key_len = 4, | 168 | .key_len = 4, |
| 163 | .hash = arp_hash, | 169 | .hash = arp_hash, |
| 164 | .constructor = arp_constructor, | 170 | .constructor = arp_constructor, |
| @@ -171,7 +177,7 @@ struct neigh_table arp_tbl = { | |||
| 171 | .gc_staletime = 60 * HZ, | 177 | .gc_staletime = 60 * HZ, |
| 172 | .reachable_time = 30 * HZ, | 178 | .reachable_time = 30 * HZ, |
| 173 | .delay_probe_time = 5 * HZ, | 179 | .delay_probe_time = 5 * HZ, |
| 174 | .queue_len_bytes = 64*1024, | 180 | .queue_len = 3, |
| 175 | .ucast_probes = 3, | 181 | .ucast_probes = 3, |
| 176 | .mcast_probes = 3, | 182 | .mcast_probes = 3, |
| 177 | .anycast_delay = 1 * HZ, | 183 | .anycast_delay = 1 * HZ, |
| @@ -194,6 +200,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) | |||
| 194 | case ARPHRD_IEEE802: | 200 | case ARPHRD_IEEE802: |
| 195 | ip_eth_mc_map(addr, haddr); | 201 | ip_eth_mc_map(addr, haddr); |
| 196 | return 0; | 202 | return 0; |
| 203 | case ARPHRD_IEEE802_TR: | ||
| 204 | ip_tr_mc_map(addr, haddr); | ||
| 205 | return 0; | ||
| 197 | case ARPHRD_INFINIBAND: | 206 | case ARPHRD_INFINIBAND: |
| 198 | ip_ib_mc_map(addr, dev->broadcast, haddr); | 207 | ip_ib_mc_map(addr, dev->broadcast, haddr); |
| 199 | return 0; | 208 | return 0; |
| @@ -212,9 +221,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) | |||
| 212 | 221 | ||
| 213 | static u32 arp_hash(const void *pkey, | 222 | static u32 arp_hash(const void *pkey, |
| 214 | const struct net_device *dev, | 223 | const struct net_device *dev, |
| 215 | __u32 *hash_rnd) | 224 | __u32 hash_rnd) |
| 216 | { | 225 | { |
| 217 | return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd); | 226 | return arp_hashfn(*(u32 *)pkey, dev, hash_rnd); |
| 218 | } | 227 | } |
| 219 | 228 | ||
| 220 | static int arp_constructor(struct neighbour *neigh) | 229 | static int arp_constructor(struct neighbour *neigh) |
| @@ -274,9 +283,9 @@ static int arp_constructor(struct neighbour *neigh) | |||
| 274 | default: | 283 | default: |
| 275 | break; | 284 | break; |
| 276 | case ARPHRD_ROSE: | 285 | case ARPHRD_ROSE: |
| 277 | #if IS_ENABLED(CONFIG_AX25) | 286 | #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) |
| 278 | case ARPHRD_AX25: | 287 | case ARPHRD_AX25: |
| 279 | #if IS_ENABLED(CONFIG_NETROM) | 288 | #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) |
| 280 | case ARPHRD_NETROM: | 289 | case ARPHRD_NETROM: |
| 281 | #endif | 290 | #endif |
| 282 | neigh->ops = &arp_broken_ops; | 291 | neigh->ops = &arp_broken_ops; |
| @@ -321,7 +330,7 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) | |||
| 321 | static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) | 330 | static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) |
| 322 | { | 331 | { |
| 323 | __be32 saddr = 0; | 332 | __be32 saddr = 0; |
| 324 | u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; | 333 | u8 *dst_ha = NULL; |
| 325 | struct net_device *dev = neigh->dev; | 334 | struct net_device *dev = neigh->dev; |
| 326 | __be32 target = *(__be32 *)neigh->primary_key; | 335 | __be32 target = *(__be32 *)neigh->primary_key; |
| 327 | int probes = atomic_read(&neigh->probes); | 336 | int probes = atomic_read(&neigh->probes); |
| @@ -362,9 +371,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) | |||
| 362 | probes -= neigh->parms->ucast_probes; | 371 | probes -= neigh->parms->ucast_probes; |
| 363 | if (probes < 0) { | 372 | if (probes < 0) { |
| 364 | if (!(neigh->nud_state & NUD_VALID)) | 373 | if (!(neigh->nud_state & NUD_VALID)) |
| 365 | pr_debug("trying to ucast probe in NUD_INVALID\n"); | 374 | printk(KERN_DEBUG |
| 366 | neigh_ha_snapshot(dst_ha, neigh, dev); | 375 | "trying to ucast probe in NUD_INVALID\n"); |
| 367 | dst_hw = dst_ha; | 376 | dst_ha = neigh->ha; |
| 377 | read_lock_bh(&neigh->lock); | ||
| 368 | } else { | 378 | } else { |
| 369 | probes -= neigh->parms->app_probes; | 379 | probes -= neigh->parms->app_probes; |
| 370 | if (probes < 0) { | 380 | if (probes < 0) { |
| @@ -376,7 +386,9 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) | |||
| 376 | } | 386 | } |
| 377 | 387 | ||
| 378 | arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, | 388 | arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, |
| 379 | dst_hw, dev->dev_addr, NULL); | 389 | dst_ha, dev->dev_addr, NULL); |
| 390 | if (dst_ha) | ||
| 391 | read_unlock_bh(&neigh->lock); | ||
| 380 | } | 392 | } |
| 381 | 393 | ||
| 382 | static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) | 394 | static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) |
| @@ -447,7 +459,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr, | |||
| 447 | { | 459 | { |
| 448 | switch (addr_hint) { | 460 | switch (addr_hint) { |
| 449 | case RTN_LOCAL: | 461 | case RTN_LOCAL: |
| 450 | pr_debug("arp called for own IP address\n"); | 462 | printk(KERN_DEBUG "ARP: arp called for own IP address\n"); |
| 451 | memcpy(haddr, dev->dev_addr, dev->addr_len); | 463 | memcpy(haddr, dev->dev_addr, dev->addr_len); |
| 452 | return 1; | 464 | return 1; |
| 453 | case RTN_MULTICAST: | 465 | case RTN_MULTICAST: |
| @@ -468,12 +480,13 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) | |||
| 468 | struct neighbour *n; | 480 | struct neighbour *n; |
| 469 | 481 | ||
| 470 | if (!skb_dst(skb)) { | 482 | if (!skb_dst(skb)) { |
| 471 | pr_debug("arp_find is called with dst==NULL\n"); | 483 | printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); |
| 472 | kfree_skb(skb); | 484 | kfree_skb(skb); |
| 473 | return 1; | 485 | return 1; |
| 474 | } | 486 | } |
| 475 | 487 | ||
| 476 | paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr); | 488 | paddr = skb_rtable(skb)->rt_gateway; |
| 489 | |||
| 477 | if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, | 490 | if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, |
| 478 | paddr, dev)) | 491 | paddr, dev)) |
| 479 | return 0; | 492 | return 0; |
| @@ -579,18 +592,16 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, | |||
| 579 | struct sk_buff *skb; | 592 | struct sk_buff *skb; |
| 580 | struct arphdr *arp; | 593 | struct arphdr *arp; |
| 581 | unsigned char *arp_ptr; | 594 | unsigned char *arp_ptr; |
| 582 | int hlen = LL_RESERVED_SPACE(dev); | ||
| 583 | int tlen = dev->needed_tailroom; | ||
| 584 | 595 | ||
| 585 | /* | 596 | /* |
| 586 | * Allocate a buffer | 597 | * Allocate a buffer |
| 587 | */ | 598 | */ |
| 588 | 599 | ||
| 589 | skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); | 600 | skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); |
| 590 | if (skb == NULL) | 601 | if (skb == NULL) |
| 591 | return NULL; | 602 | return NULL; |
| 592 | 603 | ||
| 593 | skb_reserve(skb, hlen); | 604 | skb_reserve(skb, LL_RESERVED_SPACE(dev)); |
| 594 | skb_reset_network_header(skb); | 605 | skb_reset_network_header(skb); |
| 595 | arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); | 606 | arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); |
| 596 | skb->dev = dev; | 607 | skb->dev = dev; |
| @@ -622,13 +633,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, | |||
| 622 | arp->ar_pro = htons(ETH_P_IP); | 633 | arp->ar_pro = htons(ETH_P_IP); |
| 623 | break; | 634 | break; |
| 624 | 635 | ||
| 625 | #if IS_ENABLED(CONFIG_AX25) | 636 | #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) |
| 626 | case ARPHRD_AX25: | 637 | case ARPHRD_AX25: |
| 627 | arp->ar_hrd = htons(ARPHRD_AX25); | 638 | arp->ar_hrd = htons(ARPHRD_AX25); |
| 628 | arp->ar_pro = htons(AX25_P_IP); | 639 | arp->ar_pro = htons(AX25_P_IP); |
| 629 | break; | 640 | break; |
| 630 | 641 | ||
| 631 | #if IS_ENABLED(CONFIG_NETROM) | 642 | #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) |
| 632 | case ARPHRD_NETROM: | 643 | case ARPHRD_NETROM: |
| 633 | arp->ar_hrd = htons(ARPHRD_NETROM); | 644 | arp->ar_hrd = htons(ARPHRD_NETROM); |
| 634 | arp->ar_pro = htons(AX25_P_IP); | 645 | arp->ar_pro = htons(AX25_P_IP); |
| @@ -636,12 +647,18 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, | |||
| 636 | #endif | 647 | #endif |
| 637 | #endif | 648 | #endif |
| 638 | 649 | ||
| 639 | #if IS_ENABLED(CONFIG_FDDI) | 650 | #if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) |
| 640 | case ARPHRD_FDDI: | 651 | case ARPHRD_FDDI: |
| 641 | arp->ar_hrd = htons(ARPHRD_ETHER); | 652 | arp->ar_hrd = htons(ARPHRD_ETHER); |
| 642 | arp->ar_pro = htons(ETH_P_IP); | 653 | arp->ar_pro = htons(ETH_P_IP); |
| 643 | break; | 654 | break; |
| 644 | #endif | 655 | #endif |
| 656 | #if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) | ||
| 657 | case ARPHRD_IEEE802_TR: | ||
| 658 | arp->ar_hrd = htons(ARPHRD_IEEE802); | ||
| 659 | arp->ar_pro = htons(ETH_P_IP); | ||
| 660 | break; | ||
| 661 | #endif | ||
| 645 | } | 662 | } |
| 646 | 663 | ||
| 647 | arp->ar_hln = dev->addr_len; | 664 | arp->ar_hln = dev->addr_len; |
| @@ -739,10 +756,11 @@ static int arp_process(struct sk_buff *skb) | |||
| 739 | goto out; | 756 | goto out; |
| 740 | break; | 757 | break; |
| 741 | case ARPHRD_ETHER: | 758 | case ARPHRD_ETHER: |
| 759 | case ARPHRD_IEEE802_TR: | ||
| 742 | case ARPHRD_FDDI: | 760 | case ARPHRD_FDDI: |
| 743 | case ARPHRD_IEEE802: | 761 | case ARPHRD_IEEE802: |
| 744 | /* | 762 | /* |
| 745 | * ETHERNET, and Fibre Channel (which are IEEE 802 | 763 | * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802 |
| 746 | * devices, according to RFC 2625) devices will accept ARP | 764 | * devices, according to RFC 2625) devices will accept ARP |
| 747 | * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). | 765 | * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). |
| 748 | * This is the case also of FDDI, where the RFC 1390 says that | 766 | * This is the case also of FDDI, where the RFC 1390 says that |
| @@ -787,8 +805,7 @@ static int arp_process(struct sk_buff *skb) | |||
| 787 | * Check for bad requests for 127.x.x.x and requests for multicast | 805 | * Check for bad requests for 127.x.x.x and requests for multicast |
| 788 | * addresses. If this is one such, delete it. | 806 | * addresses. If this is one such, delete it. |
| 789 | */ | 807 | */ |
| 790 | if (ipv4_is_multicast(tip) || | 808 | if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) |
| 791 | (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) | ||
| 792 | goto out; | 809 | goto out; |
| 793 | 810 | ||
| 794 | /* | 811 | /* |
| @@ -850,8 +867,7 @@ static int arp_process(struct sk_buff *skb) | |||
| 850 | if (addr_type == RTN_UNICAST && | 867 | if (addr_type == RTN_UNICAST && |
| 851 | (arp_fwd_proxy(in_dev, dev, rt) || | 868 | (arp_fwd_proxy(in_dev, dev, rt) || |
| 852 | arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || | 869 | arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || |
| 853 | (rt->dst.dev != dev && | 870 | pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { |
| 854 | pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { | ||
| 855 | n = neigh_event_ns(&arp_tbl, sha, &sip, dev); | 871 | n = neigh_event_ns(&arp_tbl, sha, &sip, dev); |
| 856 | if (n) | 872 | if (n) |
| 857 | neigh_release(n); | 873 | neigh_release(n); |
| @@ -876,7 +892,7 @@ static int arp_process(struct sk_buff *skb) | |||
| 876 | 892 | ||
| 877 | n = __neigh_lookup(&arp_tbl, &sip, dev, 0); | 893 | n = __neigh_lookup(&arp_tbl, &sip, dev, 0); |
| 878 | 894 | ||
| 879 | if (IN_DEV_ARP_ACCEPT(in_dev)) { | 895 | if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) { |
| 880 | /* Unsolicited ARP is not accepted by default. | 896 | /* Unsolicited ARP is not accepted by default. |
| 881 | It is possible, that this option should be enabled for some | 897 | It is possible, that this option should be enabled for some |
| 882 | devices (strip is candidate) | 898 | devices (strip is candidate) |
| @@ -1024,7 +1040,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, | |||
| 1024 | return -EINVAL; | 1040 | return -EINVAL; |
| 1025 | } | 1041 | } |
| 1026 | switch (dev->type) { | 1042 | switch (dev->type) { |
| 1027 | #if IS_ENABLED(CONFIG_FDDI) | 1043 | #if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) |
| 1028 | case ARPHRD_FDDI: | 1044 | case ARPHRD_FDDI: |
| 1029 | /* | 1045 | /* |
| 1030 | * According to RFC 1390, FDDI devices should accept ARP | 1046 | * According to RFC 1390, FDDI devices should accept ARP |
| @@ -1047,7 +1063,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, | |||
| 1047 | neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); | 1063 | neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); |
| 1048 | err = PTR_ERR(neigh); | 1064 | err = PTR_ERR(neigh); |
| 1049 | if (!IS_ERR(neigh)) { | 1065 | if (!IS_ERR(neigh)) { |
| 1050 | unsigned int state = NUD_STALE; | 1066 | unsigned state = NUD_STALE; |
| 1051 | if (r->arp_flags & ATF_PERM) | 1067 | if (r->arp_flags & ATF_PERM) |
| 1052 | state = NUD_PERMANENT; | 1068 | state = NUD_PERMANENT; |
| 1053 | err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? | 1069 | err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? |
| @@ -1059,7 +1075,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, | |||
| 1059 | return err; | 1075 | return err; |
| 1060 | } | 1076 | } |
| 1061 | 1077 | ||
| 1062 | static unsigned int arp_state_to_flags(struct neighbour *neigh) | 1078 | static unsigned arp_state_to_flags(struct neighbour *neigh) |
| 1063 | { | 1079 | { |
| 1064 | if (neigh->nud_state&NUD_PERMANENT) | 1080 | if (neigh->nud_state&NUD_PERMANENT) |
| 1065 | return ATF_PERM | ATF_COM; | 1081 | return ATF_PERM | ATF_COM; |
| @@ -1159,7 +1175,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
| 1159 | switch (cmd) { | 1175 | switch (cmd) { |
| 1160 | case SIOCDARP: | 1176 | case SIOCDARP: |
| 1161 | case SIOCSARP: | 1177 | case SIOCSARP: |
| 1162 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 1178 | if (!capable(CAP_NET_ADMIN)) |
| 1163 | return -EPERM; | 1179 | return -EPERM; |
| 1164 | case SIOCGARP: | 1180 | case SIOCGARP: |
| 1165 | err = copy_from_user(&r, arg, sizeof(struct arpreq)); | 1181 | err = copy_from_user(&r, arg, sizeof(struct arpreq)); |
| @@ -1223,7 +1239,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, | |||
| 1223 | switch (event) { | 1239 | switch (event) { |
| 1224 | case NETDEV_CHANGEADDR: | 1240 | case NETDEV_CHANGEADDR: |
| 1225 | neigh_changeaddr(&arp_tbl, dev); | 1241 | neigh_changeaddr(&arp_tbl, dev); |
| 1226 | rt_cache_flush(dev_net(dev)); | 1242 | rt_cache_flush(dev_net(dev), 0); |
| 1227 | break; | 1243 | break; |
| 1228 | default: | 1244 | default: |
| 1229 | break; | 1245 | break; |
| @@ -1270,7 +1286,7 @@ void __init arp_init(void) | |||
| 1270 | } | 1286 | } |
| 1271 | 1287 | ||
| 1272 | #ifdef CONFIG_PROC_FS | 1288 | #ifdef CONFIG_PROC_FS |
| 1273 | #if IS_ENABLED(CONFIG_AX25) | 1289 | #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) |
| 1274 | 1290 | ||
| 1275 | /* ------------------------------------------------------------------------ */ | 1291 | /* ------------------------------------------------------------------------ */ |
| 1276 | /* | 1292 | /* |
| @@ -1318,7 +1334,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, | |||
| 1318 | 1334 | ||
| 1319 | read_lock(&n->lock); | 1335 | read_lock(&n->lock); |
| 1320 | /* Convert hardware address to XX:XX:XX:XX ... form. */ | 1336 | /* Convert hardware address to XX:XX:XX:XX ... form. */ |
| 1321 | #if IS_ENABLED(CONFIG_AX25) | 1337 | #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) |
| 1322 | if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) | 1338 | if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) |
| 1323 | ax2asc2((ax25_address *)n->ha, hbuffer); | 1339 | ax2asc2((ax25_address *)n->ha, hbuffer); |
| 1324 | else { | 1340 | else { |
| @@ -1331,7 +1347,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, | |||
| 1331 | if (k != 0) | 1347 | if (k != 0) |
| 1332 | --k; | 1348 | --k; |
| 1333 | hbuffer[k] = 0; | 1349 | hbuffer[k] = 0; |
| 1334 | #if IS_ENABLED(CONFIG_AX25) | 1350 | #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) |
| 1335 | } | 1351 | } |
| 1336 | #endif | 1352 | #endif |
| 1337 | sprintf(tbuf, "%pI4", n->primary_key); | 1353 | sprintf(tbuf, "%pI4", n->primary_key); |
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 667c1d4ca98..2c2a98e402e 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c | |||
| @@ -476,7 +476,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, | |||
| 476 | doi = doi_def->doi; | 476 | doi = doi_def->doi; |
| 477 | doi_type = doi_def->type; | 477 | doi_type = doi_def->type; |
| 478 | 478 | ||
| 479 | if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) | 479 | if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN) |
| 480 | goto doi_add_return; | 480 | goto doi_add_return; |
| 481 | for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { | 481 | for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { |
| 482 | switch (doi_def->tags[iter]) { | 482 | switch (doi_def->tags[iter]) { |
| @@ -1725,10 +1725,8 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) | |||
| 1725 | case CIPSO_V4_TAG_LOCAL: | 1725 | case CIPSO_V4_TAG_LOCAL: |
| 1726 | /* This is a non-standard tag that we only allow for | 1726 | /* This is a non-standard tag that we only allow for |
| 1727 | * local connections, so if the incoming interface is | 1727 | * local connections, so if the incoming interface is |
| 1728 | * not the loopback device drop the packet. Further, | 1728 | * not the loopback device drop the packet. */ |
| 1729 | * there is no legitimate reason for setting this from | 1729 | if (!(skb->dev->flags & IFF_LOOPBACK)) { |
| 1730 | * userspace so reject it if skb is NULL. */ | ||
| 1731 | if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) { | ||
| 1732 | err_offset = opt_iter; | 1730 | err_offset = opt_iter; |
| 1733 | goto validate_return_locked; | 1731 | goto validate_return_locked; |
| 1734 | } | 1732 | } |
| @@ -1859,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, | |||
| 1859 | return CIPSO_V4_HDR_LEN + ret_val; | 1857 | return CIPSO_V4_HDR_LEN + ret_val; |
| 1860 | } | 1858 | } |
| 1861 | 1859 | ||
| 1860 | static void opt_kfree_rcu(struct rcu_head *head) | ||
| 1861 | { | ||
| 1862 | kfree(container_of(head, struct ip_options_rcu, rcu)); | ||
| 1863 | } | ||
| 1864 | |||
| 1862 | /** | 1865 | /** |
| 1863 | * cipso_v4_sock_setattr - Add a CIPSO option to a socket | 1866 | * cipso_v4_sock_setattr - Add a CIPSO option to a socket |
| 1864 | * @sk: the socket | 1867 | * @sk: the socket |
| @@ -1935,7 +1938,7 @@ int cipso_v4_sock_setattr(struct sock *sk, | |||
| 1935 | } | 1938 | } |
| 1936 | rcu_assign_pointer(sk_inet->inet_opt, opt); | 1939 | rcu_assign_pointer(sk_inet->inet_opt, opt); |
| 1937 | if (old) | 1940 | if (old) |
| 1938 | kfree_rcu(old, rcu); | 1941 | call_rcu(&old->rcu, opt_kfree_rcu); |
| 1939 | 1942 | ||
| 1940 | return 0; | 1943 | return 0; |
| 1941 | 1944 | ||
| @@ -2002,7 +2005,7 @@ int cipso_v4_req_setattr(struct request_sock *req, | |||
| 2002 | req_inet = inet_rsk(req); | 2005 | req_inet = inet_rsk(req); |
| 2003 | opt = xchg(&req_inet->opt, opt); | 2006 | opt = xchg(&req_inet->opt, opt); |
| 2004 | if (opt) | 2007 | if (opt) |
| 2005 | kfree_rcu(opt, rcu); | 2008 | call_rcu(&opt->rcu, opt_kfree_rcu); |
| 2006 | 2009 | ||
| 2007 | return 0; | 2010 | return 0; |
| 2008 | 2011 | ||
| @@ -2072,7 +2075,7 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) | |||
| 2072 | * remove the entire option struct */ | 2075 | * remove the entire option struct */ |
| 2073 | *opt_ptr = NULL; | 2076 | *opt_ptr = NULL; |
| 2074 | hdr_delta = opt->opt.optlen; | 2077 | hdr_delta = opt->opt.optlen; |
| 2075 | kfree_rcu(opt, rcu); | 2078 | call_rcu(&opt->rcu, opt_kfree_rcu); |
| 2076 | } | 2079 | } |
| 2077 | 2080 | ||
| 2078 | return hdr_delta; | 2081 | return hdr_delta; |
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a8e4f2665d5..76db59202f1 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | 27 | ||
| 28 | 28 | ||
| 29 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
| 30 | #include <asm/system.h> | ||
| 30 | #include <linux/bitops.h> | 31 | #include <linux/bitops.h> |
| 31 | #include <linux/capability.h> | 32 | #include <linux/capability.h> |
| 32 | #include <linux/module.h> | 33 | #include <linux/module.h> |
| @@ -55,10 +56,10 @@ | |||
| 55 | #include <linux/sysctl.h> | 56 | #include <linux/sysctl.h> |
| 56 | #endif | 57 | #endif |
| 57 | #include <linux/kmod.h> | 58 | #include <linux/kmod.h> |
| 58 | #include <linux/netconf.h> | ||
| 59 | 59 | ||
| 60 | #include <net/arp.h> | 60 | #include <net/arp.h> |
| 61 | #include <net/ip.h> | 61 | #include <net/ip.h> |
| 62 | #include <net/tcp.h> | ||
| 62 | #include <net/route.h> | 63 | #include <net/route.h> |
| 63 | #include <net/ip_fib.h> | 64 | #include <net/ip_fib.h> |
| 64 | #include <net/rtnetlink.h> | 65 | #include <net/rtnetlink.h> |
| @@ -95,22 +96,25 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { | |||
| 95 | [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, | 96 | [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, |
| 96 | }; | 97 | }; |
| 97 | 98 | ||
| 98 | #define IN4_ADDR_HSIZE_SHIFT 8 | 99 | /* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE |
| 99 | #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) | 100 | * value. So if you change this define, make appropriate changes to |
| 100 | 101 | * inet_addr_hash as well. | |
| 102 | */ | ||
| 103 | #define IN4_ADDR_HSIZE 256 | ||
| 101 | static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; | 104 | static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; |
| 102 | static DEFINE_SPINLOCK(inet_addr_hash_lock); | 105 | static DEFINE_SPINLOCK(inet_addr_hash_lock); |
| 103 | 106 | ||
| 104 | static u32 inet_addr_hash(struct net *net, __be32 addr) | 107 | static inline unsigned int inet_addr_hash(struct net *net, __be32 addr) |
| 105 | { | 108 | { |
| 106 | u32 val = (__force u32) addr ^ net_hash_mix(net); | 109 | u32 val = (__force u32) addr ^ hash_ptr(net, 8); |
| 107 | 110 | ||
| 108 | return hash_32(val, IN4_ADDR_HSIZE_SHIFT); | 111 | return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & |
| 112 | (IN4_ADDR_HSIZE - 1)); | ||
| 109 | } | 113 | } |
| 110 | 114 | ||
| 111 | static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) | 115 | static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) |
| 112 | { | 116 | { |
| 113 | u32 hash = inet_addr_hash(net, ifa->ifa_local); | 117 | unsigned int hash = inet_addr_hash(net, ifa->ifa_local); |
| 114 | 118 | ||
| 115 | spin_lock(&inet_addr_hash_lock); | 119 | spin_lock(&inet_addr_hash_lock); |
| 116 | hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); | 120 | hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); |
| @@ -134,18 +138,18 @@ static void inet_hash_remove(struct in_ifaddr *ifa) | |||
| 134 | */ | 138 | */ |
| 135 | struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) | 139 | struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) |
| 136 | { | 140 | { |
| 137 | u32 hash = inet_addr_hash(net, addr); | 141 | unsigned int hash = inet_addr_hash(net, addr); |
| 138 | struct net_device *result = NULL; | 142 | struct net_device *result = NULL; |
| 139 | struct in_ifaddr *ifa; | 143 | struct in_ifaddr *ifa; |
| 140 | struct hlist_node *node; | 144 | struct hlist_node *node; |
| 141 | 145 | ||
| 142 | rcu_read_lock(); | 146 | rcu_read_lock(); |
| 143 | hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { | 147 | hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { |
| 144 | if (ifa->ifa_local == addr) { | 148 | struct net_device *dev = ifa->ifa_dev->dev; |
| 145 | struct net_device *dev = ifa->ifa_dev->dev; | ||
| 146 | 149 | ||
| 147 | if (!net_eq(dev_net(dev), net)) | 150 | if (!net_eq(dev_net(dev), net)) |
| 148 | continue; | 151 | continue; |
| 152 | if (ifa->ifa_local == addr) { | ||
| 149 | result = dev; | 153 | result = dev; |
| 150 | break; | 154 | break; |
| 151 | } | 155 | } |
| @@ -180,10 +184,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
| 180 | static void devinet_sysctl_register(struct in_device *idev); | 184 | static void devinet_sysctl_register(struct in_device *idev); |
| 181 | static void devinet_sysctl_unregister(struct in_device *idev); | 185 | static void devinet_sysctl_unregister(struct in_device *idev); |
| 182 | #else | 186 | #else |
| 183 | static void devinet_sysctl_register(struct in_device *idev) | 187 | static inline void devinet_sysctl_register(struct in_device *idev) |
| 184 | { | 188 | { |
| 185 | } | 189 | } |
| 186 | static void devinet_sysctl_unregister(struct in_device *idev) | 190 | static inline void devinet_sysctl_unregister(struct in_device *idev) |
| 187 | { | 191 | { |
| 188 | } | 192 | } |
| 189 | #endif | 193 | #endif |
| @@ -203,7 +207,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head) | |||
| 203 | kfree(ifa); | 207 | kfree(ifa); |
| 204 | } | 208 | } |
| 205 | 209 | ||
| 206 | static void inet_free_ifa(struct in_ifaddr *ifa) | 210 | static inline void inet_free_ifa(struct in_ifaddr *ifa) |
| 207 | { | 211 | { |
| 208 | call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); | 212 | call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); |
| 209 | } | 213 | } |
| @@ -215,7 +219,8 @@ void in_dev_finish_destroy(struct in_device *idev) | |||
| 215 | WARN_ON(idev->ifa_list); | 219 | WARN_ON(idev->ifa_list); |
| 216 | WARN_ON(idev->mc_list); | 220 | WARN_ON(idev->mc_list); |
| 217 | #ifdef NET_REFCNT_DEBUG | 221 | #ifdef NET_REFCNT_DEBUG |
| 218 | pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); | 222 | printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", |
| 223 | idev, dev ? dev->name : "NIL"); | ||
| 219 | #endif | 224 | #endif |
| 220 | dev_put(dev); | 225 | dev_put(dev); |
| 221 | if (!idev->dead) | 226 | if (!idev->dead) |
| @@ -287,7 +292,7 @@ static void inetdev_destroy(struct in_device *in_dev) | |||
| 287 | inet_free_ifa(ifa); | 292 | inet_free_ifa(ifa); |
| 288 | } | 293 | } |
| 289 | 294 | ||
| 290 | RCU_INIT_POINTER(dev->ip_ptr, NULL); | 295 | rcu_assign_pointer(dev->ip_ptr, NULL); |
| 291 | 296 | ||
| 292 | devinet_sysctl_unregister(in_dev); | 297 | devinet_sysctl_unregister(in_dev); |
| 293 | neigh_parms_release(&arp_tbl, in_dev->arp_parms); | 298 | neigh_parms_release(&arp_tbl, in_dev->arp_parms); |
| @@ -312,7 +317,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) | |||
| 312 | } | 317 | } |
| 313 | 318 | ||
| 314 | static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | 319 | static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, |
| 315 | int destroy, struct nlmsghdr *nlh, u32 portid) | 320 | int destroy, struct nlmsghdr *nlh, u32 pid) |
| 316 | { | 321 | { |
| 317 | struct in_ifaddr *promote = NULL; | 322 | struct in_ifaddr *promote = NULL; |
| 318 | struct in_ifaddr *ifa, *ifa1 = *ifap; | 323 | struct in_ifaddr *ifa, *ifa1 = *ifap; |
| @@ -346,7 +351,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
| 346 | inet_hash_remove(ifa); | 351 | inet_hash_remove(ifa); |
| 347 | *ifap1 = ifa->ifa_next; | 352 | *ifap1 = ifa->ifa_next; |
| 348 | 353 | ||
| 349 | rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); | 354 | rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); |
| 350 | blocking_notifier_call_chain(&inetaddr_chain, | 355 | blocking_notifier_call_chain(&inetaddr_chain, |
| 351 | NETDEV_DOWN, ifa); | 356 | NETDEV_DOWN, ifa); |
| 352 | inet_free_ifa(ifa); | 357 | inet_free_ifa(ifa); |
| @@ -383,7 +388,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
| 383 | is valid, it will try to restore deleted routes... Grr. | 388 | is valid, it will try to restore deleted routes... Grr. |
| 384 | So that, this order is correct. | 389 | So that, this order is correct. |
| 385 | */ | 390 | */ |
| 386 | rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); | 391 | rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid); |
| 387 | blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); | 392 | blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); |
| 388 | 393 | ||
| 389 | if (promote) { | 394 | if (promote) { |
| @@ -396,7 +401,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
| 396 | } | 401 | } |
| 397 | 402 | ||
| 398 | promote->ifa_flags &= ~IFA_F_SECONDARY; | 403 | promote->ifa_flags &= ~IFA_F_SECONDARY; |
| 399 | rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); | 404 | rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); |
| 400 | blocking_notifier_call_chain(&inetaddr_chain, | 405 | blocking_notifier_call_chain(&inetaddr_chain, |
| 401 | NETDEV_UP, promote); | 406 | NETDEV_UP, promote); |
| 402 | for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { | 407 | for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { |
| @@ -418,7 +423,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, | |||
| 418 | } | 423 | } |
| 419 | 424 | ||
| 420 | static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, | 425 | static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, |
| 421 | u32 portid) | 426 | u32 pid) |
| 422 | { | 427 | { |
| 423 | struct in_device *in_dev = ifa->ifa_dev; | 428 | struct in_device *in_dev = ifa->ifa_dev; |
| 424 | struct in_ifaddr *ifa1, **ifap, **last_primary; | 429 | struct in_ifaddr *ifa1, **ifap, **last_primary; |
| @@ -465,7 +470,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, | |||
| 465 | /* Send message first, then call notifier. | 470 | /* Send message first, then call notifier. |
| 466 | Notifier will trigger FIB update, so that | 471 | Notifier will trigger FIB update, so that |
| 467 | listeners of netlink will know about new ifaddr */ | 472 | listeners of netlink will know about new ifaddr */ |
| 468 | rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); | 473 | rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid); |
| 469 | blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); | 474 | blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); |
| 470 | 475 | ||
| 471 | return 0; | 476 | return 0; |
| @@ -564,7 +569,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg | |||
| 564 | !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) | 569 | !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) |
| 565 | continue; | 570 | continue; |
| 566 | 571 | ||
| 567 | __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); | 572 | __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid); |
| 568 | return 0; | 573 | return 0; |
| 569 | } | 574 | } |
| 570 | 575 | ||
| @@ -650,14 +655,14 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg | |||
| 650 | if (IS_ERR(ifa)) | 655 | if (IS_ERR(ifa)) |
| 651 | return PTR_ERR(ifa); | 656 | return PTR_ERR(ifa); |
| 652 | 657 | ||
| 653 | return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); | 658 | return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid); |
| 654 | } | 659 | } |
| 655 | 660 | ||
| 656 | /* | 661 | /* |
| 657 | * Determine a default network mask, based on the IP address. | 662 | * Determine a default network mask, based on the IP address. |
| 658 | */ | 663 | */ |
| 659 | 664 | ||
| 660 | static int inet_abc_len(__be32 addr) | 665 | static inline int inet_abc_len(__be32 addr) |
| 661 | { | 666 | { |
| 662 | int rc = -1; /* Something else, probably a multicast. */ | 667 | int rc = -1; /* Something else, probably a multicast. */ |
| 663 | 668 | ||
| @@ -723,16 +728,17 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
| 723 | break; | 728 | break; |
| 724 | 729 | ||
| 725 | case SIOCSIFFLAGS: | 730 | case SIOCSIFFLAGS: |
| 726 | ret = -EPERM; | 731 | ret = -EACCES; |
| 727 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 732 | if (!capable(CAP_NET_ADMIN)) |
| 728 | goto out; | 733 | goto out; |
| 729 | break; | 734 | break; |
| 730 | case SIOCSIFADDR: /* Set interface address (and family) */ | 735 | case SIOCSIFADDR: /* Set interface address (and family) */ |
| 731 | case SIOCSIFBRDADDR: /* Set the broadcast address */ | 736 | case SIOCSIFBRDADDR: /* Set the broadcast address */ |
| 732 | case SIOCSIFDSTADDR: /* Set the destination address */ | 737 | case SIOCSIFDSTADDR: /* Set the destination address */ |
| 733 | case SIOCSIFNETMASK: /* Set the netmask for the interface */ | 738 | case SIOCSIFNETMASK: /* Set the netmask for the interface */ |
| 734 | ret = -EPERM; | 739 | case SIOCKILLADDR: /* Nuke all sockets on this address */ |
| 735 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 740 | ret = -EACCES; |
| 741 | if (!capable(CAP_NET_ADMIN)) | ||
| 736 | goto out; | 742 | goto out; |
| 737 | ret = -EINVAL; | 743 | ret = -EINVAL; |
| 738 | if (sin->sin_family != AF_INET) | 744 | if (sin->sin_family != AF_INET) |
| @@ -782,7 +788,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
| 782 | } | 788 | } |
| 783 | 789 | ||
| 784 | ret = -EADDRNOTAVAIL; | 790 | ret = -EADDRNOTAVAIL; |
| 785 | if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) | 791 | if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS |
| 792 | && cmd != SIOCKILLADDR) | ||
| 786 | goto done; | 793 | goto done; |
| 787 | 794 | ||
| 788 | switch (cmd) { | 795 | switch (cmd) { |
| @@ -823,9 +830,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
| 823 | if (!ifa) { | 830 | if (!ifa) { |
| 824 | ret = -ENOBUFS; | 831 | ret = -ENOBUFS; |
| 825 | ifa = inet_alloc_ifa(); | 832 | ifa = inet_alloc_ifa(); |
| 833 | INIT_HLIST_NODE(&ifa->hash); | ||
| 826 | if (!ifa) | 834 | if (!ifa) |
| 827 | break; | 835 | break; |
| 828 | INIT_HLIST_NODE(&ifa->hash); | ||
| 829 | if (colon) | 836 | if (colon) |
| 830 | memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); | 837 | memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); |
| 831 | else | 838 | else |
| @@ -908,6 +915,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
| 908 | inet_insert_ifa(ifa); | 915 | inet_insert_ifa(ifa); |
| 909 | } | 916 | } |
| 910 | break; | 917 | break; |
| 918 | case SIOCKILLADDR: /* Nuke all connections on this address */ | ||
| 919 | ret = tcp_nuke_addr(net, (struct sockaddr *) sin); | ||
| 920 | break; | ||
| 911 | } | 921 | } |
| 912 | done: | 922 | done: |
| 913 | rtnl_unlock(); | 923 | rtnl_unlock(); |
| @@ -1075,7 +1085,6 @@ __be32 inet_confirm_addr(struct in_device *in_dev, | |||
| 1075 | 1085 | ||
| 1076 | return addr; | 1086 | return addr; |
| 1077 | } | 1087 | } |
| 1078 | EXPORT_SYMBOL(inet_confirm_addr); | ||
| 1079 | 1088 | ||
| 1080 | /* | 1089 | /* |
| 1081 | * Device notifier | 1090 | * Device notifier |
| @@ -1122,7 +1131,7 @@ skip: | |||
| 1122 | } | 1131 | } |
| 1123 | } | 1132 | } |
| 1124 | 1133 | ||
| 1125 | static bool inetdev_valid_mtu(unsigned int mtu) | 1134 | static inline bool inetdev_valid_mtu(unsigned mtu) |
| 1126 | { | 1135 | { |
| 1127 | return mtu >= 68; | 1136 | return mtu >= 68; |
| 1128 | } | 1137 | } |
| @@ -1171,8 +1180,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, | |||
| 1171 | 1180 | ||
| 1172 | switch (event) { | 1181 | switch (event) { |
| 1173 | case NETDEV_REGISTER: | 1182 | case NETDEV_REGISTER: |
| 1174 | pr_debug("%s: bug\n", __func__); | 1183 | printk(KERN_DEBUG "inetdev_event: bug\n"); |
| 1175 | RCU_INIT_POINTER(dev->ip_ptr, NULL); | 1184 | rcu_assign_pointer(dev->ip_ptr, NULL); |
| 1176 | break; | 1185 | break; |
| 1177 | case NETDEV_UP: | 1186 | case NETDEV_UP: |
| 1178 | if (!inetdev_valid_mtu(dev->mtu)) | 1187 | if (!inetdev_valid_mtu(dev->mtu)) |
| @@ -1237,7 +1246,7 @@ static struct notifier_block ip_netdev_notifier = { | |||
| 1237 | .notifier_call = inetdev_event, | 1246 | .notifier_call = inetdev_event, |
| 1238 | }; | 1247 | }; |
| 1239 | 1248 | ||
| 1240 | static size_t inet_nlmsg_size(void) | 1249 | static inline size_t inet_nlmsg_size(void) |
| 1241 | { | 1250 | { |
| 1242 | return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) | 1251 | return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) |
| 1243 | + nla_total_size(4) /* IFA_ADDRESS */ | 1252 | + nla_total_size(4) /* IFA_ADDRESS */ |
| @@ -1247,12 +1256,12 @@ static size_t inet_nlmsg_size(void) | |||
| 1247 | } | 1256 | } |
| 1248 | 1257 | ||
| 1249 | static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, | 1258 | static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, |
| 1250 | u32 portid, u32 seq, int event, unsigned int flags) | 1259 | u32 pid, u32 seq, int event, unsigned int flags) |
| 1251 | { | 1260 | { |
| 1252 | struct ifaddrmsg *ifm; | 1261 | struct ifaddrmsg *ifm; |
| 1253 | struct nlmsghdr *nlh; | 1262 | struct nlmsghdr *nlh; |
| 1254 | 1263 | ||
| 1255 | nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); | 1264 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags); |
| 1256 | if (nlh == NULL) | 1265 | if (nlh == NULL) |
| 1257 | return -EMSGSIZE; | 1266 | return -EMSGSIZE; |
| 1258 | 1267 | ||
| @@ -1263,15 +1272,17 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, | |||
| 1263 | ifm->ifa_scope = ifa->ifa_scope; | 1272 | ifm->ifa_scope = ifa->ifa_scope; |
| 1264 | ifm->ifa_index = ifa->ifa_dev->dev->ifindex; | 1273 | ifm->ifa_index = ifa->ifa_dev->dev->ifindex; |
| 1265 | 1274 | ||
| 1266 | if ((ifa->ifa_address && | 1275 | if (ifa->ifa_address) |
| 1267 | nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || | 1276 | NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address); |
| 1268 | (ifa->ifa_local && | 1277 | |
| 1269 | nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) || | 1278 | if (ifa->ifa_local) |
| 1270 | (ifa->ifa_broadcast && | 1279 | NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local); |
| 1271 | nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || | 1280 | |
| 1272 | (ifa->ifa_label[0] && | 1281 | if (ifa->ifa_broadcast) |
| 1273 | nla_put_string(skb, IFA_LABEL, ifa->ifa_label))) | 1282 | NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); |
| 1274 | goto nla_put_failure; | 1283 | |
| 1284 | if (ifa->ifa_label[0]) | ||
| 1285 | NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); | ||
| 1275 | 1286 | ||
| 1276 | return nlmsg_end(skb, nlh); | 1287 | return nlmsg_end(skb, nlh); |
| 1277 | 1288 | ||
| @@ -1314,7 +1325,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) | |||
| 1314 | if (ip_idx < s_ip_idx) | 1325 | if (ip_idx < s_ip_idx) |
| 1315 | continue; | 1326 | continue; |
| 1316 | if (inet_fill_ifaddr(skb, ifa, | 1327 | if (inet_fill_ifaddr(skb, ifa, |
| 1317 | NETLINK_CB(cb->skb).portid, | 1328 | NETLINK_CB(cb->skb).pid, |
| 1318 | cb->nlh->nlmsg_seq, | 1329 | cb->nlh->nlmsg_seq, |
| 1319 | RTM_NEWADDR, NLM_F_MULTI) <= 0) { | 1330 | RTM_NEWADDR, NLM_F_MULTI) <= 0) { |
| 1320 | rcu_read_unlock(); | 1331 | rcu_read_unlock(); |
| @@ -1336,7 +1347,7 @@ done: | |||
| 1336 | } | 1347 | } |
| 1337 | 1348 | ||
| 1338 | static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, | 1349 | static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, |
| 1339 | u32 portid) | 1350 | u32 pid) |
| 1340 | { | 1351 | { |
| 1341 | struct sk_buff *skb; | 1352 | struct sk_buff *skb; |
| 1342 | u32 seq = nlh ? nlh->nlmsg_seq : 0; | 1353 | u32 seq = nlh ? nlh->nlmsg_seq : 0; |
| @@ -1348,14 +1359,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, | |||
| 1348 | if (skb == NULL) | 1359 | if (skb == NULL) |
| 1349 | goto errout; | 1360 | goto errout; |
| 1350 | 1361 | ||
| 1351 | err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0); | 1362 | err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0); |
| 1352 | if (err < 0) { | 1363 | if (err < 0) { |
| 1353 | /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ | 1364 | /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ |
| 1354 | WARN_ON(err == -EMSGSIZE); | 1365 | WARN_ON(err == -EMSGSIZE); |
| 1355 | kfree_skb(skb); | 1366 | kfree_skb(skb); |
| 1356 | goto errout; | 1367 | goto errout; |
| 1357 | } | 1368 | } |
| 1358 | rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); | 1369 | rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); |
| 1359 | return; | 1370 | return; |
| 1360 | errout: | 1371 | errout: |
| 1361 | if (err < 0) | 1372 | if (err < 0) |
| @@ -1443,155 +1454,6 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) | |||
| 1443 | return 0; | 1454 | return 0; |
| 1444 | } | 1455 | } |
| 1445 | 1456 | ||
| 1446 | static int inet_netconf_msgsize_devconf(int type) | ||
| 1447 | { | ||
| 1448 | int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) | ||
| 1449 | + nla_total_size(4); /* NETCONFA_IFINDEX */ | ||
| 1450 | |||
| 1451 | /* type -1 is used for ALL */ | ||
| 1452 | if (type == -1 || type == NETCONFA_FORWARDING) | ||
| 1453 | size += nla_total_size(4); | ||
| 1454 | if (type == -1 || type == NETCONFA_RP_FILTER) | ||
| 1455 | size += nla_total_size(4); | ||
| 1456 | if (type == -1 || type == NETCONFA_MC_FORWARDING) | ||
| 1457 | size += nla_total_size(4); | ||
| 1458 | |||
| 1459 | return size; | ||
| 1460 | } | ||
| 1461 | |||
| 1462 | static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, | ||
| 1463 | struct ipv4_devconf *devconf, u32 portid, | ||
| 1464 | u32 seq, int event, unsigned int flags, | ||
| 1465 | int type) | ||
| 1466 | { | ||
| 1467 | struct nlmsghdr *nlh; | ||
| 1468 | struct netconfmsg *ncm; | ||
| 1469 | |||
| 1470 | nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), | ||
| 1471 | flags); | ||
| 1472 | if (nlh == NULL) | ||
| 1473 | return -EMSGSIZE; | ||
| 1474 | |||
| 1475 | ncm = nlmsg_data(nlh); | ||
| 1476 | ncm->ncm_family = AF_INET; | ||
| 1477 | |||
| 1478 | if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) | ||
| 1479 | goto nla_put_failure; | ||
| 1480 | |||
| 1481 | /* type -1 is used for ALL */ | ||
| 1482 | if ((type == -1 || type == NETCONFA_FORWARDING) && | ||
| 1483 | nla_put_s32(skb, NETCONFA_FORWARDING, | ||
| 1484 | IPV4_DEVCONF(*devconf, FORWARDING)) < 0) | ||
| 1485 | goto nla_put_failure; | ||
| 1486 | if ((type == -1 || type == NETCONFA_RP_FILTER) && | ||
| 1487 | nla_put_s32(skb, NETCONFA_RP_FILTER, | ||
| 1488 | IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) | ||
| 1489 | goto nla_put_failure; | ||
| 1490 | if ((type == -1 || type == NETCONFA_MC_FORWARDING) && | ||
| 1491 | nla_put_s32(skb, NETCONFA_MC_FORWARDING, | ||
| 1492 | IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) | ||
| 1493 | goto nla_put_failure; | ||
| 1494 | |||
| 1495 | return nlmsg_end(skb, nlh); | ||
| 1496 | |||
| 1497 | nla_put_failure: | ||
| 1498 | nlmsg_cancel(skb, nlh); | ||
| 1499 | return -EMSGSIZE; | ||
| 1500 | } | ||
| 1501 | |||
| 1502 | void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, | ||
| 1503 | struct ipv4_devconf *devconf) | ||
| 1504 | { | ||
| 1505 | struct sk_buff *skb; | ||
| 1506 | int err = -ENOBUFS; | ||
| 1507 | |||
| 1508 | skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC); | ||
| 1509 | if (skb == NULL) | ||
| 1510 | goto errout; | ||
| 1511 | |||
| 1512 | err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, | ||
| 1513 | RTM_NEWNETCONF, 0, type); | ||
| 1514 | if (err < 0) { | ||
| 1515 | /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ | ||
| 1516 | WARN_ON(err == -EMSGSIZE); | ||
| 1517 | kfree_skb(skb); | ||
| 1518 | goto errout; | ||
| 1519 | } | ||
| 1520 | rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC); | ||
| 1521 | return; | ||
| 1522 | errout: | ||
| 1523 | if (err < 0) | ||
| 1524 | rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); | ||
| 1525 | } | ||
| 1526 | |||
| 1527 | static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { | ||
| 1528 | [NETCONFA_IFINDEX] = { .len = sizeof(int) }, | ||
| 1529 | [NETCONFA_FORWARDING] = { .len = sizeof(int) }, | ||
| 1530 | [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, | ||
| 1531 | }; | ||
| 1532 | |||
| 1533 | static int inet_netconf_get_devconf(struct sk_buff *in_skb, | ||
| 1534 | struct nlmsghdr *nlh, | ||
| 1535 | void *arg) | ||
| 1536 | { | ||
| 1537 | struct net *net = sock_net(in_skb->sk); | ||
| 1538 | struct nlattr *tb[NETCONFA_MAX+1]; | ||
| 1539 | struct netconfmsg *ncm; | ||
| 1540 | struct sk_buff *skb; | ||
| 1541 | struct ipv4_devconf *devconf; | ||
| 1542 | struct in_device *in_dev; | ||
| 1543 | struct net_device *dev; | ||
| 1544 | int ifindex; | ||
| 1545 | int err; | ||
| 1546 | |||
| 1547 | err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, | ||
| 1548 | devconf_ipv4_policy); | ||
| 1549 | if (err < 0) | ||
| 1550 | goto errout; | ||
| 1551 | |||
| 1552 | err = EINVAL; | ||
| 1553 | if (!tb[NETCONFA_IFINDEX]) | ||
| 1554 | goto errout; | ||
| 1555 | |||
| 1556 | ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); | ||
| 1557 | switch (ifindex) { | ||
| 1558 | case NETCONFA_IFINDEX_ALL: | ||
| 1559 | devconf = net->ipv4.devconf_all; | ||
| 1560 | break; | ||
| 1561 | case NETCONFA_IFINDEX_DEFAULT: | ||
| 1562 | devconf = net->ipv4.devconf_dflt; | ||
| 1563 | break; | ||
| 1564 | default: | ||
| 1565 | dev = __dev_get_by_index(net, ifindex); | ||
| 1566 | if (dev == NULL) | ||
| 1567 | goto errout; | ||
| 1568 | in_dev = __in_dev_get_rtnl(dev); | ||
| 1569 | if (in_dev == NULL) | ||
| 1570 | goto errout; | ||
| 1571 | devconf = &in_dev->cnf; | ||
| 1572 | break; | ||
| 1573 | } | ||
| 1574 | |||
| 1575 | err = -ENOBUFS; | ||
| 1576 | skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); | ||
| 1577 | if (skb == NULL) | ||
| 1578 | goto errout; | ||
| 1579 | |||
| 1580 | err = inet_netconf_fill_devconf(skb, ifindex, devconf, | ||
| 1581 | NETLINK_CB(in_skb).portid, | ||
| 1582 | nlh->nlmsg_seq, RTM_NEWNETCONF, 0, | ||
| 1583 | -1); | ||
| 1584 | if (err < 0) { | ||
| 1585 | /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ | ||
| 1586 | WARN_ON(err == -EMSGSIZE); | ||
| 1587 | kfree_skb(skb); | ||
| 1588 | goto errout; | ||
| 1589 | } | ||
| 1590 | err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); | ||
| 1591 | errout: | ||
| 1592 | return err; | ||
| 1593 | } | ||
| 1594 | |||
| 1595 | #ifdef CONFIG_SYSCTL | 1457 | #ifdef CONFIG_SYSCTL |
| 1596 | 1458 | ||
| 1597 | static void devinet_copy_dflt_conf(struct net *net, int i) | 1459 | static void devinet_copy_dflt_conf(struct net *net, int i) |
| @@ -1617,12 +1479,6 @@ static void inet_forward_change(struct net *net) | |||
| 1617 | 1479 | ||
| 1618 | IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; | 1480 | IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; |
| 1619 | IPV4_DEVCONF_DFLT(net, FORWARDING) = on; | 1481 | IPV4_DEVCONF_DFLT(net, FORWARDING) = on; |
| 1620 | inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, | ||
| 1621 | NETCONFA_IFINDEX_ALL, | ||
| 1622 | net->ipv4.devconf_all); | ||
| 1623 | inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, | ||
| 1624 | NETCONFA_IFINDEX_DEFAULT, | ||
| 1625 | net->ipv4.devconf_dflt); | ||
| 1626 | 1482 | ||
| 1627 | for_each_netdev(net, dev) { | 1483 | for_each_netdev(net, dev) { |
| 1628 | struct in_device *in_dev; | 1484 | struct in_device *in_dev; |
| @@ -1630,11 +1486,8 @@ static void inet_forward_change(struct net *net) | |||
| 1630 | dev_disable_lro(dev); | 1486 | dev_disable_lro(dev); |
| 1631 | rcu_read_lock(); | 1487 | rcu_read_lock(); |
| 1632 | in_dev = __in_dev_get_rcu(dev); | 1488 | in_dev = __in_dev_get_rcu(dev); |
| 1633 | if (in_dev) { | 1489 | if (in_dev) |
| 1634 | IN_DEV_CONF_SET(in_dev, FORWARDING, on); | 1490 | IN_DEV_CONF_SET(in_dev, FORWARDING, on); |
| 1635 | inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, | ||
| 1636 | dev->ifindex, &in_dev->cnf); | ||
| 1637 | } | ||
| 1638 | rcu_read_unlock(); | 1491 | rcu_read_unlock(); |
| 1639 | } | 1492 | } |
| 1640 | } | 1493 | } |
| @@ -1656,27 +1509,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write, | |||
| 1656 | 1509 | ||
| 1657 | if (cnf == net->ipv4.devconf_dflt) | 1510 | if (cnf == net->ipv4.devconf_dflt) |
| 1658 | devinet_copy_dflt_conf(net, i); | 1511 | devinet_copy_dflt_conf(net, i); |
| 1659 | if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || | 1512 | if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) |
| 1660 | i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) | ||
| 1661 | if ((new_value == 0) && (old_value != 0)) | 1513 | if ((new_value == 0) && (old_value != 0)) |
| 1662 | rt_cache_flush(net); | 1514 | rt_cache_flush(net, 0); |
| 1663 | if (i == IPV4_DEVCONF_RP_FILTER - 1 && | ||
| 1664 | new_value != old_value) { | ||
| 1665 | int ifindex; | ||
| 1666 | |||
| 1667 | if (cnf == net->ipv4.devconf_dflt) | ||
| 1668 | ifindex = NETCONFA_IFINDEX_DEFAULT; | ||
| 1669 | else if (cnf == net->ipv4.devconf_all) | ||
| 1670 | ifindex = NETCONFA_IFINDEX_ALL; | ||
| 1671 | else { | ||
| 1672 | struct in_device *idev = | ||
| 1673 | container_of(cnf, struct in_device, | ||
| 1674 | cnf); | ||
| 1675 | ifindex = idev->dev->ifindex; | ||
| 1676 | } | ||
| 1677 | inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, | ||
| 1678 | ifindex, cnf); | ||
| 1679 | } | ||
| 1680 | } | 1515 | } |
| 1681 | 1516 | ||
| 1682 | return ret; | 1517 | return ret; |
| @@ -1703,23 +1538,15 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, | |||
| 1703 | } | 1538 | } |
| 1704 | if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { | 1539 | if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { |
| 1705 | inet_forward_change(net); | 1540 | inet_forward_change(net); |
| 1706 | } else { | 1541 | } else if (*valp) { |
| 1707 | struct ipv4_devconf *cnf = ctl->extra1; | 1542 | struct ipv4_devconf *cnf = ctl->extra1; |
| 1708 | struct in_device *idev = | 1543 | struct in_device *idev = |
| 1709 | container_of(cnf, struct in_device, cnf); | 1544 | container_of(cnf, struct in_device, cnf); |
| 1710 | if (*valp) | 1545 | dev_disable_lro(idev->dev); |
| 1711 | dev_disable_lro(idev->dev); | ||
| 1712 | inet_netconf_notify_devconf(net, | ||
| 1713 | NETCONFA_FORWARDING, | ||
| 1714 | idev->dev->ifindex, | ||
| 1715 | cnf); | ||
| 1716 | } | 1546 | } |
| 1717 | rtnl_unlock(); | 1547 | rtnl_unlock(); |
| 1718 | rt_cache_flush(net); | 1548 | rt_cache_flush(net, 0); |
| 1719 | } else | 1549 | } |
| 1720 | inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, | ||
| 1721 | NETCONFA_IFINDEX_DEFAULT, | ||
| 1722 | net->ipv4.devconf_dflt); | ||
| 1723 | } | 1550 | } |
| 1724 | 1551 | ||
| 1725 | return ret; | 1552 | return ret; |
| @@ -1735,7 +1562,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write, | |||
| 1735 | struct net *net = ctl->extra2; | 1562 | struct net *net = ctl->extra2; |
| 1736 | 1563 | ||
| 1737 | if (write && *valp != val) | 1564 | if (write && *valp != val) |
| 1738 | rt_cache_flush(net); | 1565 | rt_cache_flush(net, 0); |
| 1739 | 1566 | ||
| 1740 | return ret; | 1567 | return ret; |
| 1741 | } | 1568 | } |
| @@ -1766,6 +1593,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write, | |||
| 1766 | static struct devinet_sysctl_table { | 1593 | static struct devinet_sysctl_table { |
| 1767 | struct ctl_table_header *sysctl_header; | 1594 | struct ctl_table_header *sysctl_header; |
| 1768 | struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; | 1595 | struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; |
| 1596 | char *dev_name; | ||
| 1769 | } devinet_sysctl = { | 1597 | } devinet_sysctl = { |
| 1770 | .devinet_vars = { | 1598 | .devinet_vars = { |
| 1771 | DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", | 1599 | DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", |
| @@ -1799,8 +1627,6 @@ static struct devinet_sysctl_table { | |||
| 1799 | "force_igmp_version"), | 1627 | "force_igmp_version"), |
| 1800 | DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, | 1628 | DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, |
| 1801 | "promote_secondaries"), | 1629 | "promote_secondaries"), |
| 1802 | DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, | ||
| 1803 | "route_localnet"), | ||
| 1804 | }, | 1630 | }, |
| 1805 | }; | 1631 | }; |
| 1806 | 1632 | ||
| @@ -1809,7 +1635,16 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, | |||
| 1809 | { | 1635 | { |
| 1810 | int i; | 1636 | int i; |
| 1811 | struct devinet_sysctl_table *t; | 1637 | struct devinet_sysctl_table *t; |
| 1812 | char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; | 1638 | |
| 1639 | #define DEVINET_CTL_PATH_DEV 3 | ||
| 1640 | |||
| 1641 | struct ctl_path devinet_ctl_path[] = { | ||
| 1642 | { .procname = "net", }, | ||
| 1643 | { .procname = "ipv4", }, | ||
| 1644 | { .procname = "conf", }, | ||
| 1645 | { /* to be set */ }, | ||
| 1646 | { }, | ||
| 1647 | }; | ||
| 1813 | 1648 | ||
| 1814 | t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); | 1649 | t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); |
| 1815 | if (!t) | 1650 | if (!t) |
| @@ -1821,15 +1656,27 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, | |||
| 1821 | t->devinet_vars[i].extra2 = net; | 1656 | t->devinet_vars[i].extra2 = net; |
| 1822 | } | 1657 | } |
| 1823 | 1658 | ||
| 1824 | snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); | 1659 | /* |
| 1660 | * Make a copy of dev_name, because '.procname' is regarded as const | ||
| 1661 | * by sysctl and we wouldn't want anyone to change it under our feet | ||
| 1662 | * (see SIOCSIFNAME). | ||
| 1663 | */ | ||
| 1664 | t->dev_name = kstrdup(dev_name, GFP_KERNEL); | ||
| 1665 | if (!t->dev_name) | ||
| 1666 | goto free; | ||
| 1667 | |||
| 1668 | devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; | ||
| 1825 | 1669 | ||
| 1826 | t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); | 1670 | t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, |
| 1671 | t->devinet_vars); | ||
| 1827 | if (!t->sysctl_header) | 1672 | if (!t->sysctl_header) |
| 1828 | goto free; | 1673 | goto free_procname; |
| 1829 | 1674 | ||
| 1830 | p->sysctl = t; | 1675 | p->sysctl = t; |
| 1831 | return 0; | 1676 | return 0; |
| 1832 | 1677 | ||
| 1678 | free_procname: | ||
| 1679 | kfree(t->dev_name); | ||
| 1833 | free: | 1680 | free: |
| 1834 | kfree(t); | 1681 | kfree(t); |
| 1835 | out: | 1682 | out: |
| @@ -1845,6 +1692,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) | |||
| 1845 | 1692 | ||
| 1846 | cnf->sysctl = NULL; | 1693 | cnf->sysctl = NULL; |
| 1847 | unregister_net_sysctl_table(t->sysctl_header); | 1694 | unregister_net_sysctl_table(t->sysctl_header); |
| 1695 | kfree(t->dev_name); | ||
| 1848 | kfree(t); | 1696 | kfree(t); |
| 1849 | } | 1697 | } |
| 1850 | 1698 | ||
| @@ -1874,6 +1722,12 @@ static struct ctl_table ctl_forward_entry[] = { | |||
| 1874 | }, | 1722 | }, |
| 1875 | { }, | 1723 | { }, |
| 1876 | }; | 1724 | }; |
| 1725 | |||
| 1726 | static __net_initdata struct ctl_path net_ipv4_path[] = { | ||
| 1727 | { .procname = "net", }, | ||
| 1728 | { .procname = "ipv4", }, | ||
| 1729 | { }, | ||
| 1730 | }; | ||
| 1877 | #endif | 1731 | #endif |
| 1878 | 1732 | ||
| 1879 | static __net_init int devinet_init_net(struct net *net) | 1733 | static __net_init int devinet_init_net(struct net *net) |
| @@ -1919,7 +1773,7 @@ static __net_init int devinet_init_net(struct net *net) | |||
| 1919 | goto err_reg_dflt; | 1773 | goto err_reg_dflt; |
| 1920 | 1774 | ||
| 1921 | err = -ENOMEM; | 1775 | err = -ENOMEM; |
| 1922 | forw_hdr = register_net_sysctl(net, "net/ipv4", tbl); | 1776 | forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl); |
| 1923 | if (forw_hdr == NULL) | 1777 | if (forw_hdr == NULL) |
| 1924 | goto err_reg_ctl; | 1778 | goto err_reg_ctl; |
| 1925 | net->ipv4.forw_hdr = forw_hdr; | 1779 | net->ipv4.forw_hdr = forw_hdr; |
| @@ -1993,7 +1847,5 @@ void __init devinet_init(void) | |||
| 1993 | rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); | 1847 | rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); |
| 1994 | rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); | 1848 | rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); |
| 1995 | rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); | 1849 | rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); |
| 1996 | rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, | ||
| 1997 | NULL, NULL); | ||
| 1998 | } | 1850 | } |
| 1999 | 1851 | ||
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b61e9deb7c7..a5b413416da 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c | |||
| @@ -1,5 +1,3 @@ | |||
| 1 | #define pr_fmt(fmt) "IPsec: " fmt | ||
| 2 | |||
| 3 | #include <crypto/aead.h> | 1 | #include <crypto/aead.h> |
| 4 | #include <crypto/authenc.h> | 2 | #include <crypto/authenc.h> |
| 5 | #include <linux/err.h> | 3 | #include <linux/err.h> |
| @@ -459,22 +457,28 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) | |||
| 459 | struct esp_data *esp = x->data; | 457 | struct esp_data *esp = x->data; |
| 460 | u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); | 458 | u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); |
| 461 | u32 align = max_t(u32, blksize, esp->padlen); | 459 | u32 align = max_t(u32, blksize, esp->padlen); |
| 462 | unsigned int net_adj; | 460 | u32 rem; |
| 461 | |||
| 462 | mtu -= x->props.header_len + crypto_aead_authsize(esp->aead); | ||
| 463 | rem = mtu & (align - 1); | ||
| 464 | mtu &= ~(align - 1); | ||
| 463 | 465 | ||
| 464 | switch (x->props.mode) { | 466 | switch (x->props.mode) { |
| 465 | case XFRM_MODE_TRANSPORT: | ||
| 466 | case XFRM_MODE_BEET: | ||
| 467 | net_adj = sizeof(struct iphdr); | ||
| 468 | break; | ||
| 469 | case XFRM_MODE_TUNNEL: | 467 | case XFRM_MODE_TUNNEL: |
| 470 | net_adj = 0; | ||
| 471 | break; | 468 | break; |
| 472 | default: | 469 | default: |
| 473 | BUG(); | 470 | case XFRM_MODE_TRANSPORT: |
| 471 | /* The worst case */ | ||
| 472 | mtu -= blksize - 4; | ||
| 473 | mtu += min_t(u32, blksize - 4, rem); | ||
| 474 | break; | ||
| 475 | case XFRM_MODE_BEET: | ||
| 476 | /* The worst case. */ | ||
| 477 | mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem); | ||
| 478 | break; | ||
| 474 | } | 479 | } |
| 475 | 480 | ||
| 476 | return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - | 481 | return mtu - 2; |
| 477 | net_adj) & ~(align - 1)) + (net_adj - 2); | ||
| 478 | } | 482 | } |
| 479 | 483 | ||
| 480 | static void esp4_err(struct sk_buff *skb, u32 info) | 484 | static void esp4_err(struct sk_buff *skb, u32 info) |
| @@ -484,25 +488,16 @@ static void esp4_err(struct sk_buff *skb, u32 info) | |||
| 484 | struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); | 488 | struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); |
| 485 | struct xfrm_state *x; | 489 | struct xfrm_state *x; |
| 486 | 490 | ||
| 487 | switch (icmp_hdr(skb)->type) { | 491 | if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || |
| 488 | case ICMP_DEST_UNREACH: | 492 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) |
| 489 | if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | ||
| 490 | return; | ||
| 491 | case ICMP_REDIRECT: | ||
| 492 | break; | ||
| 493 | default: | ||
| 494 | return; | 493 | return; |
| 495 | } | ||
| 496 | 494 | ||
| 497 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, | 495 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
| 498 | esph->spi, IPPROTO_ESP, AF_INET); | 496 | esph->spi, IPPROTO_ESP, AF_INET); |
| 499 | if (!x) | 497 | if (!x) |
| 500 | return; | 498 | return; |
| 501 | 499 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", | |
| 502 | if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) | 500 | ntohl(esph->spi), ntohl(iph->daddr)); |
| 503 | ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); | ||
| 504 | else | ||
| 505 | ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); | ||
| 506 | xfrm_state_put(x); | 501 | xfrm_state_put(x); |
| 507 | } | 502 | } |
| 508 | 503 | ||
| @@ -711,11 +706,11 @@ static const struct net_protocol esp4_protocol = { | |||
| 711 | static int __init esp4_init(void) | 706 | static int __init esp4_init(void) |
| 712 | { | 707 | { |
| 713 | if (xfrm_register_type(&esp_type, AF_INET) < 0) { | 708 | if (xfrm_register_type(&esp_type, AF_INET) < 0) { |
| 714 | pr_info("%s: can't add xfrm type\n", __func__); | 709 | printk(KERN_INFO "ip esp init: can't add xfrm type\n"); |
| 715 | return -EAGAIN; | 710 | return -EAGAIN; |
| 716 | } | 711 | } |
| 717 | if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { | 712 | if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { |
| 718 | pr_info("%s: can't add protocol\n", __func__); | 713 | printk(KERN_INFO "ip esp init: can't add protocol\n"); |
| 719 | xfrm_unregister_type(&esp_type, AF_INET); | 714 | xfrm_unregister_type(&esp_type, AF_INET); |
| 720 | return -EAGAIN; | 715 | return -EAGAIN; |
| 721 | } | 716 | } |
| @@ -725,9 +720,9 @@ static int __init esp4_init(void) | |||
| 725 | static void __exit esp4_fini(void) | 720 | static void __exit esp4_fini(void) |
| 726 | { | 721 | { |
| 727 | if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) | 722 | if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) |
| 728 | pr_info("%s: can't remove protocol\n", __func__); | 723 | printk(KERN_INFO "ip esp close: can't remove protocol\n"); |
| 729 | if (xfrm_unregister_type(&esp_type, AF_INET) < 0) | 724 | if (xfrm_unregister_type(&esp_type, AF_INET) < 0) |
| 730 | pr_info("%s: can't remove xfrm type\n", __func__); | 725 | printk(KERN_INFO "ip esp close: can't remove xfrm type\n"); |
| 731 | } | 726 | } |
| 732 | 727 | ||
| 733 | module_init(esp4_init); | 728 | module_init(esp4_init); |
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 5cd75e2dab2..92fc5f69f5d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | 15 | ||
| 16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
| 17 | #include <asm/uaccess.h> | 17 | #include <asm/uaccess.h> |
| 18 | #include <asm/system.h> | ||
| 18 | #include <linux/bitops.h> | 19 | #include <linux/bitops.h> |
| 19 | #include <linux/capability.h> | 20 | #include <linux/capability.h> |
| 20 | #include <linux/types.h> | 21 | #include <linux/types.h> |
| @@ -31,7 +32,6 @@ | |||
| 31 | #include <linux/if_addr.h> | 32 | #include <linux/if_addr.h> |
| 32 | #include <linux/if_arp.h> | 33 | #include <linux/if_arp.h> |
| 33 | #include <linux/skbuff.h> | 34 | #include <linux/skbuff.h> |
| 34 | #include <linux/cache.h> | ||
| 35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
| 36 | #include <linux/list.h> | 36 | #include <linux/list.h> |
| 37 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
| @@ -86,24 +86,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id) | |||
| 86 | tb = fib_trie_table(id); | 86 | tb = fib_trie_table(id); |
| 87 | if (!tb) | 87 | if (!tb) |
| 88 | return NULL; | 88 | return NULL; |
| 89 | |||
| 90 | switch (id) { | ||
| 91 | case RT_TABLE_LOCAL: | ||
| 92 | net->ipv4.fib_local = tb; | ||
| 93 | break; | ||
| 94 | |||
| 95 | case RT_TABLE_MAIN: | ||
| 96 | net->ipv4.fib_main = tb; | ||
| 97 | break; | ||
| 98 | |||
| 99 | case RT_TABLE_DEFAULT: | ||
| 100 | net->ipv4.fib_default = tb; | ||
| 101 | break; | ||
| 102 | |||
| 103 | default: | ||
| 104 | break; | ||
| 105 | } | ||
| 106 | |||
| 107 | h = id & (FIB_TABLE_HASHSZ - 1); | 89 | h = id & (FIB_TABLE_HASHSZ - 1); |
| 108 | hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); | 90 | hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); |
| 109 | return tb; | 91 | return tb; |
| @@ -148,20 +130,20 @@ static void fib_flush(struct net *net) | |||
| 148 | } | 130 | } |
| 149 | 131 | ||
| 150 | if (flushed) | 132 | if (flushed) |
| 151 | rt_cache_flush(net); | 133 | rt_cache_flush(net, -1); |
| 152 | } | 134 | } |
| 153 | 135 | ||
| 154 | /* | 136 | /* |
| 155 | * Find address type as if only "dev" was present in the system. If | 137 | * Find address type as if only "dev" was present in the system. If |
| 156 | * on_dev is NULL then all interfaces are taken into consideration. | 138 | * on_dev is NULL then all interfaces are taken into consideration. |
| 157 | */ | 139 | */ |
| 158 | static inline unsigned int __inet_dev_addr_type(struct net *net, | 140 | static inline unsigned __inet_dev_addr_type(struct net *net, |
| 159 | const struct net_device *dev, | 141 | const struct net_device *dev, |
| 160 | __be32 addr) | 142 | __be32 addr) |
| 161 | { | 143 | { |
| 162 | struct flowi4 fl4 = { .daddr = addr }; | 144 | struct flowi4 fl4 = { .daddr = addr }; |
| 163 | struct fib_result res; | 145 | struct fib_result res; |
| 164 | unsigned int ret = RTN_BROADCAST; | 146 | unsigned ret = RTN_BROADCAST; |
| 165 | struct fib_table *local_table; | 147 | struct fib_table *local_table; |
| 166 | 148 | ||
| 167 | if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) | 149 | if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) |
| @@ -169,6 +151,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, | |||
| 169 | if (ipv4_is_multicast(addr)) | 151 | if (ipv4_is_multicast(addr)) |
| 170 | return RTN_MULTICAST; | 152 | return RTN_MULTICAST; |
| 171 | 153 | ||
| 154 | #ifdef CONFIG_IP_MULTIPLE_TABLES | ||
| 155 | res.r = NULL; | ||
| 156 | #endif | ||
| 157 | |||
| 172 | local_table = fib_get_table(net, RT_TABLE_LOCAL); | 158 | local_table = fib_get_table(net, RT_TABLE_LOCAL); |
| 173 | if (local_table) { | 159 | if (local_table) { |
| 174 | ret = RTN_UNICAST; | 160 | ret = RTN_UNICAST; |
| @@ -195,44 +181,6 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, | |||
| 195 | } | 181 | } |
| 196 | EXPORT_SYMBOL(inet_dev_addr_type); | 182 | EXPORT_SYMBOL(inet_dev_addr_type); |
| 197 | 183 | ||
| 198 | __be32 fib_compute_spec_dst(struct sk_buff *skb) | ||
| 199 | { | ||
| 200 | struct net_device *dev = skb->dev; | ||
| 201 | struct in_device *in_dev; | ||
| 202 | struct fib_result res; | ||
| 203 | struct rtable *rt; | ||
| 204 | struct flowi4 fl4; | ||
| 205 | struct net *net; | ||
| 206 | int scope; | ||
| 207 | |||
| 208 | rt = skb_rtable(skb); | ||
| 209 | if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == | ||
| 210 | RTCF_LOCAL) | ||
| 211 | return ip_hdr(skb)->daddr; | ||
| 212 | |||
| 213 | in_dev = __in_dev_get_rcu(dev); | ||
| 214 | BUG_ON(!in_dev); | ||
| 215 | |||
| 216 | net = dev_net(dev); | ||
| 217 | |||
| 218 | scope = RT_SCOPE_UNIVERSE; | ||
| 219 | if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { | ||
| 220 | fl4.flowi4_oif = 0; | ||
| 221 | fl4.flowi4_iif = LOOPBACK_IFINDEX; | ||
| 222 | fl4.daddr = ip_hdr(skb)->saddr; | ||
| 223 | fl4.saddr = 0; | ||
| 224 | fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); | ||
| 225 | fl4.flowi4_scope = scope; | ||
| 226 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; | ||
| 227 | if (!fib_lookup(net, &fl4, &res)) | ||
| 228 | return FIB_RES_PREFSRC(net, res); | ||
| 229 | } else { | ||
| 230 | scope = RT_SCOPE_LINK; | ||
| 231 | } | ||
| 232 | |||
| 233 | return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); | ||
| 234 | } | ||
| 235 | |||
| 236 | /* Given (packet source, input interface) and optional (dst, oif, tos): | 184 | /* Given (packet source, input interface) and optional (dst, oif, tos): |
| 237 | * - (main) check, that source is valid i.e. not broadcast or our local | 185 | * - (main) check, that source is valid i.e. not broadcast or our local |
| 238 | * address. | 186 | * address. |
| @@ -241,15 +189,17 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) | |||
| 241 | * - check, that packet arrived from expected physical interface. | 189 | * - check, that packet arrived from expected physical interface. |
| 242 | * called with rcu_read_lock() | 190 | * called with rcu_read_lock() |
| 243 | */ | 191 | */ |
| 244 | static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | 192 | int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, |
| 245 | u8 tos, int oif, struct net_device *dev, | 193 | int oif, struct net_device *dev, __be32 *spec_dst, |
| 246 | int rpf, struct in_device *idev, u32 *itag) | 194 | u32 *itag) |
| 247 | { | 195 | { |
| 248 | int ret, no_addr, accept_local; | 196 | struct in_device *in_dev; |
| 249 | struct fib_result res; | ||
| 250 | struct flowi4 fl4; | 197 | struct flowi4 fl4; |
| 251 | struct net *net; | 198 | struct fib_result res; |
| 199 | int no_addr, rpf, accept_local; | ||
| 252 | bool dev_match; | 200 | bool dev_match; |
| 201 | int ret; | ||
| 202 | struct net *net; | ||
| 253 | 203 | ||
| 254 | fl4.flowi4_oif = 0; | 204 | fl4.flowi4_oif = 0; |
| 255 | fl4.flowi4_iif = oif; | 205 | fl4.flowi4_iif = oif; |
| @@ -258,10 +208,20 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | |||
| 258 | fl4.flowi4_tos = tos; | 208 | fl4.flowi4_tos = tos; |
| 259 | fl4.flowi4_scope = RT_SCOPE_UNIVERSE; | 209 | fl4.flowi4_scope = RT_SCOPE_UNIVERSE; |
| 260 | 210 | ||
| 261 | no_addr = idev->ifa_list == NULL; | 211 | no_addr = rpf = accept_local = 0; |
| 212 | in_dev = __in_dev_get_rcu(dev); | ||
| 213 | if (in_dev) { | ||
| 214 | no_addr = in_dev->ifa_list == NULL; | ||
| 215 | |||
| 216 | /* Ignore rp_filter for packets protected by IPsec. */ | ||
| 217 | rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); | ||
| 218 | |||
| 219 | accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); | ||
| 220 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; | ||
| 221 | } | ||
| 262 | 222 | ||
| 263 | accept_local = IN_DEV_ACCEPT_LOCAL(idev); | 223 | if (in_dev == NULL) |
| 264 | fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; | 224 | goto e_inval; |
| 265 | 225 | ||
| 266 | net = dev_net(dev); | 226 | net = dev_net(dev); |
| 267 | if (fib_lookup(net, &fl4, &res)) | 227 | if (fib_lookup(net, &fl4, &res)) |
| @@ -270,6 +230,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | |||
| 270 | if (res.type != RTN_LOCAL || !accept_local) | 230 | if (res.type != RTN_LOCAL || !accept_local) |
| 271 | goto e_inval; | 231 | goto e_inval; |
| 272 | } | 232 | } |
| 233 | *spec_dst = FIB_RES_PREFSRC(net, res); | ||
| 273 | fib_combine_itag(itag, &res); | 234 | fib_combine_itag(itag, &res); |
| 274 | dev_match = false; | 235 | dev_match = false; |
| 275 | 236 | ||
| @@ -298,14 +259,17 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | |||
| 298 | 259 | ||
| 299 | ret = 0; | 260 | ret = 0; |
| 300 | if (fib_lookup(net, &fl4, &res) == 0) { | 261 | if (fib_lookup(net, &fl4, &res) == 0) { |
| 301 | if (res.type == RTN_UNICAST) | 262 | if (res.type == RTN_UNICAST) { |
| 263 | *spec_dst = FIB_RES_PREFSRC(net, res); | ||
| 302 | ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; | 264 | ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; |
| 265 | } | ||
| 303 | } | 266 | } |
| 304 | return ret; | 267 | return ret; |
| 305 | 268 | ||
| 306 | last_resort: | 269 | last_resort: |
| 307 | if (rpf) | 270 | if (rpf) |
| 308 | goto e_rpf; | 271 | goto e_rpf; |
| 272 | *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); | ||
| 309 | *itag = 0; | 273 | *itag = 0; |
| 310 | return 0; | 274 | return 0; |
| 311 | 275 | ||
| @@ -315,21 +279,6 @@ e_rpf: | |||
| 315 | return -EXDEV; | 279 | return -EXDEV; |
| 316 | } | 280 | } |
| 317 | 281 | ||
| 318 | /* Ignore rp_filter for packets protected by IPsec. */ | ||
| 319 | int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, | ||
| 320 | u8 tos, int oif, struct net_device *dev, | ||
| 321 | struct in_device *idev, u32 *itag) | ||
| 322 | { | ||
| 323 | int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); | ||
| 324 | |||
| 325 | if (!r && !fib_num_tclassid_users(dev_net(dev)) && | ||
| 326 | (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { | ||
| 327 | *itag = 0; | ||
| 328 | return 0; | ||
| 329 | } | ||
| 330 | return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); | ||
| 331 | } | ||
| 332 | |||
| 333 | static inline __be32 sk_extract_addr(struct sockaddr *addr) | 282 | static inline __be32 sk_extract_addr(struct sockaddr *addr) |
| 334 | { | 283 | { |
| 335 | return ((struct sockaddr_in *) addr)->sin_addr.s_addr; | 284 | return ((struct sockaddr_in *) addr)->sin_addr.s_addr; |
| @@ -488,7 +437,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) | |||
| 488 | switch (cmd) { | 437 | switch (cmd) { |
| 489 | case SIOCADDRT: /* Add a route */ | 438 | case SIOCADDRT: /* Add a route */ |
| 490 | case SIOCDELRT: /* Delete a route */ | 439 | case SIOCDELRT: /* Delete a route */ |
| 491 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 440 | if (!capable(CAP_NET_ADMIN)) |
| 492 | return -EPERM; | 441 | return -EPERM; |
| 493 | 442 | ||
| 494 | if (copy_from_user(&rt, arg, sizeof(rt))) | 443 | if (copy_from_user(&rt, arg, sizeof(rt))) |
| @@ -558,7 +507,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, | |||
| 558 | cfg->fc_flags = rtm->rtm_flags; | 507 | cfg->fc_flags = rtm->rtm_flags; |
| 559 | cfg->fc_nlflags = nlh->nlmsg_flags; | 508 | cfg->fc_nlflags = nlh->nlmsg_flags; |
| 560 | 509 | ||
| 561 | cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; | 510 | cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; |
| 562 | cfg->fc_nlinfo.nlh = nlh; | 511 | cfg->fc_nlinfo.nlh = nlh; |
| 563 | cfg->fc_nlinfo.nl_net = net; | 512 | cfg->fc_nlinfo.nl_net = net; |
| 564 | 513 | ||
| @@ -746,7 +695,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) | |||
| 746 | if (ifa->ifa_flags & IFA_F_SECONDARY) { | 695 | if (ifa->ifa_flags & IFA_F_SECONDARY) { |
| 747 | prim = inet_ifa_byprefix(in_dev, prefix, mask); | 696 | prim = inet_ifa_byprefix(in_dev, prefix, mask); |
| 748 | if (prim == NULL) { | 697 | if (prim == NULL) { |
| 749 | pr_warn("%s: bug: prim == NULL\n", __func__); | 698 | printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); |
| 750 | return; | 699 | return; |
| 751 | } | 700 | } |
| 752 | } | 701 | } |
| @@ -792,7 +741,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) | |||
| 792 | #define BRD_OK 2 | 741 | #define BRD_OK 2 |
| 793 | #define BRD0_OK 4 | 742 | #define BRD0_OK 4 |
| 794 | #define BRD1_OK 8 | 743 | #define BRD1_OK 8 |
| 795 | unsigned int ok = 0; | 744 | unsigned ok = 0; |
| 796 | int subnet = 0; /* Primary network */ | 745 | int subnet = 0; /* Primary network */ |
| 797 | int gone = 1; /* Address is missing */ | 746 | int gone = 1; /* Address is missing */ |
| 798 | int same_prefsrc = 0; /* Another primary with same IP */ | 747 | int same_prefsrc = 0; /* Another primary with same IP */ |
| @@ -800,11 +749,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) | |||
| 800 | if (ifa->ifa_flags & IFA_F_SECONDARY) { | 749 | if (ifa->ifa_flags & IFA_F_SECONDARY) { |
| 801 | prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); | 750 | prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); |
| 802 | if (prim == NULL) { | 751 | if (prim == NULL) { |
| 803 | pr_warn("%s: bug: prim == NULL\n", __func__); | 752 | printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); |
| 804 | return; | 753 | return; |
| 805 | } | 754 | } |
| 806 | if (iprim && iprim != prim) { | 755 | if (iprim && iprim != prim) { |
| 807 | pr_warn("%s: bug: iprim != prim\n", __func__); | 756 | printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n"); |
| 808 | return; | 757 | return; |
| 809 | } | 758 | } |
| 810 | } else if (!ipv4_is_zeronet(any) && | 759 | } else if (!ipv4_is_zeronet(any) && |
| @@ -931,6 +880,10 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) | |||
| 931 | .flowi4_scope = frn->fl_scope, | 880 | .flowi4_scope = frn->fl_scope, |
| 932 | }; | 881 | }; |
| 933 | 882 | ||
| 883 | #ifdef CONFIG_IP_MULTIPLE_TABLES | ||
| 884 | res.r = NULL; | ||
| 885 | #endif | ||
| 886 | |||
| 934 | frn->err = -ENOENT; | 887 | frn->err = -ENOENT; |
| 935 | if (tb) { | 888 | if (tb) { |
| 936 | local_bh_disable(); | 889 | local_bh_disable(); |
| @@ -956,7 +909,7 @@ static void nl_fib_input(struct sk_buff *skb) | |||
| 956 | struct fib_result_nl *frn; | 909 | struct fib_result_nl *frn; |
| 957 | struct nlmsghdr *nlh; | 910 | struct nlmsghdr *nlh; |
| 958 | struct fib_table *tb; | 911 | struct fib_table *tb; |
| 959 | u32 portid; | 912 | u32 pid; |
| 960 | 913 | ||
| 961 | net = sock_net(skb->sk); | 914 | net = sock_net(skb->sk); |
| 962 | nlh = nlmsg_hdr(skb); | 915 | nlh = nlmsg_hdr(skb); |
| @@ -974,20 +927,17 @@ static void nl_fib_input(struct sk_buff *skb) | |||
| 974 | 927 | ||
| 975 | nl_fib_lookup(frn, tb); | 928 | nl_fib_lookup(frn, tb); |
| 976 | 929 | ||
| 977 | portid = NETLINK_CB(skb).portid; /* pid of sending process */ | 930 | pid = NETLINK_CB(skb).pid; /* pid of sending process */ |
| 978 | NETLINK_CB(skb).portid = 0; /* from kernel */ | 931 | NETLINK_CB(skb).pid = 0; /* from kernel */ |
| 979 | NETLINK_CB(skb).dst_group = 0; /* unicast */ | 932 | NETLINK_CB(skb).dst_group = 0; /* unicast */ |
| 980 | netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT); | 933 | netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); |
| 981 | } | 934 | } |
| 982 | 935 | ||
| 983 | static int __net_init nl_fib_lookup_init(struct net *net) | 936 | static int __net_init nl_fib_lookup_init(struct net *net) |
| 984 | { | 937 | { |
| 985 | struct sock *sk; | 938 | struct sock *sk; |
| 986 | struct netlink_kernel_cfg cfg = { | 939 | sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, |
| 987 | .input = nl_fib_input, | 940 | nl_fib_input, NULL, THIS_MODULE); |
| 988 | }; | ||
| 989 | |||
| 990 | sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); | ||
| 991 | if (sk == NULL) | 941 | if (sk == NULL) |
| 992 | return -EAFNOSUPPORT; | 942 | return -EAFNOSUPPORT; |
| 993 | net->ipv4.fibnl = sk; | 943 | net->ipv4.fibnl = sk; |
| @@ -1000,11 +950,11 @@ static void nl_fib_lookup_exit(struct net *net) | |||
| 1000 | net->ipv4.fibnl = NULL; | 950 | net->ipv4.fibnl = NULL; |
| 1001 | } | 951 | } |
| 1002 | 952 | ||
| 1003 | static void fib_disable_ip(struct net_device *dev, int force) | 953 | static void fib_disable_ip(struct net_device *dev, int force, int delay) |
| 1004 | { | 954 | { |
| 1005 | if (fib_sync_down_dev(dev, force)) | 955 | if (fib_sync_down_dev(dev, force)) |
| 1006 | fib_flush(dev_net(dev)); | 956 | fib_flush(dev_net(dev)); |
| 1007 | rt_cache_flush(dev_net(dev)); | 957 | rt_cache_flush(dev_net(dev), delay); |
| 1008 | arp_ifdown(dev); | 958 | arp_ifdown(dev); |
| 1009 | } | 959 | } |
| 1010 | 960 | ||
| @@ -1021,7 +971,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, | |||
| 1021 | fib_sync_up(dev); | 971 | fib_sync_up(dev); |
| 1022 | #endif | 972 | #endif |
| 1023 | atomic_inc(&net->ipv4.dev_addr_genid); | 973 | atomic_inc(&net->ipv4.dev_addr_genid); |
| 1024 | rt_cache_flush(dev_net(dev)); | 974 | rt_cache_flush(dev_net(dev), -1); |
| 1025 | break; | 975 | break; |
| 1026 | case NETDEV_DOWN: | 976 | case NETDEV_DOWN: |
| 1027 | fib_del_ifaddr(ifa, NULL); | 977 | fib_del_ifaddr(ifa, NULL); |
| @@ -1030,9 +980,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, | |||
| 1030 | /* Last address was deleted from this interface. | 980 | /* Last address was deleted from this interface. |
| 1031 | * Disable IP. | 981 | * Disable IP. |
| 1032 | */ | 982 | */ |
| 1033 | fib_disable_ip(dev, 1); | 983 | fib_disable_ip(dev, 1, 0); |
| 1034 | } else { | 984 | } else { |
| 1035 | rt_cache_flush(dev_net(dev)); | 985 | rt_cache_flush(dev_net(dev), -1); |
| 1036 | } | 986 | } |
| 1037 | break; | 987 | break; |
| 1038 | } | 988 | } |
| @@ -1042,16 +992,16 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, | |||
| 1042 | static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) | 992 | static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) |
| 1043 | { | 993 | { |
| 1044 | struct net_device *dev = ptr; | 994 | struct net_device *dev = ptr; |
| 1045 | struct in_device *in_dev; | 995 | struct in_device *in_dev = __in_dev_get_rtnl(dev); |
| 1046 | struct net *net = dev_net(dev); | 996 | struct net *net = dev_net(dev); |
| 1047 | 997 | ||
| 1048 | if (event == NETDEV_UNREGISTER) { | 998 | if (event == NETDEV_UNREGISTER) { |
| 1049 | fib_disable_ip(dev, 2); | 999 | fib_disable_ip(dev, 2, -1); |
| 1050 | rt_flush_dev(dev); | ||
| 1051 | return NOTIFY_DONE; | 1000 | return NOTIFY_DONE; |
| 1052 | } | 1001 | } |
| 1053 | 1002 | ||
| 1054 | in_dev = __in_dev_get_rtnl(dev); | 1003 | if (!in_dev) |
| 1004 | return NOTIFY_DONE; | ||
| 1055 | 1005 | ||
| 1056 | switch (event) { | 1006 | switch (event) { |
| 1057 | case NETDEV_UP: | 1007 | case NETDEV_UP: |
| @@ -1062,14 +1012,21 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo | |||
| 1062 | fib_sync_up(dev); | 1012 | fib_sync_up(dev); |
| 1063 | #endif | 1013 | #endif |
| 1064 | atomic_inc(&net->ipv4.dev_addr_genid); | 1014 | atomic_inc(&net->ipv4.dev_addr_genid); |
| 1065 | rt_cache_flush(net); | 1015 | rt_cache_flush(dev_net(dev), -1); |
| 1066 | break; | 1016 | break; |
| 1067 | case NETDEV_DOWN: | 1017 | case NETDEV_DOWN: |
| 1068 | fib_disable_ip(dev, 0); | 1018 | fib_disable_ip(dev, 0, 0); |
| 1069 | break; | 1019 | break; |
| 1070 | case NETDEV_CHANGEMTU: | 1020 | case NETDEV_CHANGEMTU: |
| 1071 | case NETDEV_CHANGE: | 1021 | case NETDEV_CHANGE: |
| 1072 | rt_cache_flush(net); | 1022 | rt_cache_flush(dev_net(dev), 0); |
| 1023 | break; | ||
| 1024 | case NETDEV_UNREGISTER_BATCH: | ||
| 1025 | /* The batch unregister is only called on the first | ||
| 1026 | * device in the list of devices being unregistered. | ||
| 1027 | * Therefore we should not pass dev_net(dev) in here. | ||
| 1028 | */ | ||
| 1029 | rt_cache_flush_batch(NULL); | ||
| 1073 | break; | 1030 | break; |
| 1074 | } | 1031 | } |
| 1075 | return NOTIFY_DONE; | 1032 | return NOTIFY_DONE; |
| @@ -1134,9 +1091,6 @@ static int __net_init fib_net_init(struct net *net) | |||
| 1134 | { | 1091 | { |
| 1135 | int error; | 1092 | int error; |
| 1136 | 1093 | ||
| 1137 | #ifdef CONFIG_IP_ROUTE_CLASSID | ||
| 1138 | net->ipv4.fib_num_tclassid_users = 0; | ||
| 1139 | #endif | ||
| 1140 | error = ip_fib_net_init(net); | 1094 | error = ip_fib_net_init(net); |
| 1141 | if (error < 0) | 1095 | if (error < 0) |
| 1142 | goto out; | 1096 | goto out; |
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 26aa65d1fce..a53bb1b5b11 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c | |||
| @@ -26,7 +26,6 @@ | |||
| 26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
| 27 | #include <linux/list.h> | 27 | #include <linux/list.h> |
| 28 | #include <linux/rcupdate.h> | 28 | #include <linux/rcupdate.h> |
| 29 | #include <linux/export.h> | ||
| 30 | #include <net/ip.h> | 29 | #include <net/ip.h> |
| 31 | #include <net/route.h> | 30 | #include <net/route.h> |
| 32 | #include <net/tcp.h> | 31 | #include <net/tcp.h> |
| @@ -47,7 +46,14 @@ struct fib4_rule { | |||
| 47 | #endif | 46 | #endif |
| 48 | }; | 47 | }; |
| 49 | 48 | ||
| 50 | int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) | 49 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 50 | u32 fib_rules_tclass(const struct fib_result *res) | ||
| 51 | { | ||
| 52 | return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; | ||
| 53 | } | ||
| 54 | #endif | ||
| 55 | |||
| 56 | int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) | ||
| 51 | { | 57 | { |
| 52 | struct fib_lookup_arg arg = { | 58 | struct fib_lookup_arg arg = { |
| 53 | .result = res, | 59 | .result = res, |
| @@ -56,15 +62,10 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) | |||
| 56 | int err; | 62 | int err; |
| 57 | 63 | ||
| 58 | err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); | 64 | err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); |
| 59 | #ifdef CONFIG_IP_ROUTE_CLASSID | 65 | res->r = arg.rule; |
| 60 | if (arg.rule) | 66 | |
| 61 | res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; | ||
| 62 | else | ||
| 63 | res->tclassid = 0; | ||
| 64 | #endif | ||
| 65 | return err; | 67 | return err; |
| 66 | } | 68 | } |
| 67 | EXPORT_SYMBOL_GPL(__fib_lookup); | ||
| 68 | 69 | ||
| 69 | static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, | 70 | static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, |
| 70 | int flags, struct fib_lookup_arg *arg) | 71 | int flags, struct fib_lookup_arg *arg) |
| @@ -166,11 +167,8 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, | |||
| 166 | rule4->dst = nla_get_be32(tb[FRA_DST]); | 167 | rule4->dst = nla_get_be32(tb[FRA_DST]); |
| 167 | 168 | ||
| 168 | #ifdef CONFIG_IP_ROUTE_CLASSID | 169 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 169 | if (tb[FRA_FLOW]) { | 170 | if (tb[FRA_FLOW]) |
| 170 | rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); | 171 | rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); |
| 171 | if (rule4->tclassid) | ||
| 172 | net->ipv4.fib_num_tclassid_users++; | ||
| 173 | } | ||
| 174 | #endif | 172 | #endif |
| 175 | 173 | ||
| 176 | rule4->src_len = frh->src_len; | 174 | rule4->src_len = frh->src_len; |
| @@ -179,24 +177,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, | |||
| 179 | rule4->dstmask = inet_make_mask(rule4->dst_len); | 177 | rule4->dstmask = inet_make_mask(rule4->dst_len); |
| 180 | rule4->tos = frh->tos; | 178 | rule4->tos = frh->tos; |
| 181 | 179 | ||
| 182 | net->ipv4.fib_has_custom_rules = true; | ||
| 183 | err = 0; | 180 | err = 0; |
| 184 | errout: | 181 | errout: |
| 185 | return err; | 182 | return err; |
| 186 | } | 183 | } |
| 187 | 184 | ||
| 188 | static void fib4_rule_delete(struct fib_rule *rule) | ||
| 189 | { | ||
| 190 | struct net *net = rule->fr_net; | ||
| 191 | #ifdef CONFIG_IP_ROUTE_CLASSID | ||
| 192 | struct fib4_rule *rule4 = (struct fib4_rule *) rule; | ||
| 193 | |||
| 194 | if (rule4->tclassid) | ||
| 195 | net->ipv4.fib_num_tclassid_users--; | ||
| 196 | #endif | ||
| 197 | net->ipv4.fib_has_custom_rules = true; | ||
| 198 | } | ||
| 199 | |||
| 200 | static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, | 185 | static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, |
| 201 | struct nlattr **tb) | 186 | struct nlattr **tb) |
| 202 | { | 187 | { |
| @@ -234,15 +219,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, | |||
| 234 | frh->src_len = rule4->src_len; | 219 | frh->src_len = rule4->src_len; |
| 235 | frh->tos = rule4->tos; | 220 | frh->tos = rule4->tos; |
| 236 | 221 | ||
| 237 | if ((rule4->dst_len && | 222 | if (rule4->dst_len) |
| 238 | nla_put_be32(skb, FRA_DST, rule4->dst)) || | 223 | NLA_PUT_BE32(skb, FRA_DST, rule4->dst); |
| 239 | (rule4->src_len && | 224 | |
| 240 | nla_put_be32(skb, FRA_SRC, rule4->src))) | 225 | if (rule4->src_len) |
| 241 | goto nla_put_failure; | 226 | NLA_PUT_BE32(skb, FRA_SRC, rule4->src); |
| 227 | |||
| 242 | #ifdef CONFIG_IP_ROUTE_CLASSID | 228 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 243 | if (rule4->tclassid && | 229 | if (rule4->tclassid) |
| 244 | nla_put_u32(skb, FRA_FLOW, rule4->tclassid)) | 230 | NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); |
| 245 | goto nla_put_failure; | ||
| 246 | #endif | 231 | #endif |
| 247 | return 0; | 232 | return 0; |
| 248 | 233 | ||
| @@ -259,17 +244,16 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) | |||
| 259 | 244 | ||
| 260 | static void fib4_rule_flush_cache(struct fib_rules_ops *ops) | 245 | static void fib4_rule_flush_cache(struct fib_rules_ops *ops) |
| 261 | { | 246 | { |
| 262 | rt_cache_flush(ops->fro_net); | 247 | rt_cache_flush(ops->fro_net, -1); |
| 263 | } | 248 | } |
| 264 | 249 | ||
| 265 | static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { | 250 | static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { |
| 266 | .family = AF_INET, | 251 | .family = AF_INET, |
| 267 | .rule_size = sizeof(struct fib4_rule), | 252 | .rule_size = sizeof(struct fib4_rule), |
| 268 | .addr_size = sizeof(u32), | 253 | .addr_size = sizeof(u32), |
| 269 | .action = fib4_rule_action, | 254 | .action = fib4_rule_action, |
| 270 | .match = fib4_rule_match, | 255 | .match = fib4_rule_match, |
| 271 | .configure = fib4_rule_configure, | 256 | .configure = fib4_rule_configure, |
| 272 | .delete = fib4_rule_delete, | ||
| 273 | .compare = fib4_rule_compare, | 257 | .compare = fib4_rule_compare, |
| 274 | .fill = fib4_rule_fill, | 258 | .fill = fib4_rule_fill, |
| 275 | .default_pref = fib_default_rule_pref, | 259 | .default_pref = fib_default_rule_pref, |
| @@ -309,7 +293,6 @@ int __net_init fib4_rules_init(struct net *net) | |||
| 309 | if (err < 0) | 293 | if (err < 0) |
| 310 | goto fail; | 294 | goto fail; |
| 311 | net->ipv4.rules_ops = ops; | 295 | net->ipv4.rules_ops = ops; |
| 312 | net->ipv4.fib_has_custom_rules = false; | ||
| 313 | return 0; | 296 | return 0; |
| 314 | 297 | ||
| 315 | fail: | 298 | fail: |
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4797a800faf..80106d89d54 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | */ | 14 | */ |
| 15 | 15 | ||
| 16 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
| 17 | #include <asm/system.h> | ||
| 17 | #include <linux/bitops.h> | 18 | #include <linux/bitops.h> |
| 18 | #include <linux/types.h> | 19 | #include <linux/types.h> |
| 19 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
| @@ -140,77 +141,11 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { | |||
| 140 | }, | 141 | }, |
| 141 | }; | 142 | }; |
| 142 | 143 | ||
| 143 | static void rt_fibinfo_free(struct rtable __rcu **rtp) | ||
| 144 | { | ||
| 145 | struct rtable *rt = rcu_dereference_protected(*rtp, 1); | ||
| 146 | |||
| 147 | if (!rt) | ||
| 148 | return; | ||
| 149 | |||
| 150 | /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); | ||
| 151 | * because we waited an RCU grace period before calling | ||
| 152 | * free_fib_info_rcu() | ||
| 153 | */ | ||
| 154 | |||
| 155 | dst_free(&rt->dst); | ||
| 156 | } | ||
| 157 | |||
| 158 | static void free_nh_exceptions(struct fib_nh *nh) | ||
| 159 | { | ||
| 160 | struct fnhe_hash_bucket *hash = nh->nh_exceptions; | ||
| 161 | int i; | ||
| 162 | |||
| 163 | for (i = 0; i < FNHE_HASH_SIZE; i++) { | ||
| 164 | struct fib_nh_exception *fnhe; | ||
| 165 | |||
| 166 | fnhe = rcu_dereference_protected(hash[i].chain, 1); | ||
| 167 | while (fnhe) { | ||
| 168 | struct fib_nh_exception *next; | ||
| 169 | |||
| 170 | next = rcu_dereference_protected(fnhe->fnhe_next, 1); | ||
| 171 | |||
| 172 | rt_fibinfo_free(&fnhe->fnhe_rth); | ||
| 173 | |||
| 174 | kfree(fnhe); | ||
| 175 | |||
| 176 | fnhe = next; | ||
| 177 | } | ||
| 178 | } | ||
| 179 | kfree(hash); | ||
| 180 | } | ||
| 181 | |||
| 182 | static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) | ||
| 183 | { | ||
| 184 | int cpu; | ||
| 185 | |||
| 186 | if (!rtp) | ||
| 187 | return; | ||
| 188 | |||
| 189 | for_each_possible_cpu(cpu) { | ||
| 190 | struct rtable *rt; | ||
| 191 | |||
| 192 | rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); | ||
| 193 | if (rt) | ||
| 194 | dst_free(&rt->dst); | ||
| 195 | } | ||
| 196 | free_percpu(rtp); | ||
| 197 | } | ||
| 198 | |||
| 199 | /* Release a nexthop info record */ | 144 | /* Release a nexthop info record */ |
| 200 | static void free_fib_info_rcu(struct rcu_head *head) | 145 | static void free_fib_info_rcu(struct rcu_head *head) |
| 201 | { | 146 | { |
| 202 | struct fib_info *fi = container_of(head, struct fib_info, rcu); | 147 | struct fib_info *fi = container_of(head, struct fib_info, rcu); |
| 203 | 148 | ||
| 204 | change_nexthops(fi) { | ||
| 205 | if (nexthop_nh->nh_dev) | ||
| 206 | dev_put(nexthop_nh->nh_dev); | ||
| 207 | if (nexthop_nh->nh_exceptions) | ||
| 208 | free_nh_exceptions(nexthop_nh); | ||
| 209 | rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); | ||
| 210 | rt_fibinfo_free(&nexthop_nh->nh_rth_input); | ||
| 211 | } endfor_nexthops(fi); | ||
| 212 | |||
| 213 | release_net(fi->fib_net); | ||
| 214 | if (fi->fib_metrics != (u32 *) dst_default_metrics) | 149 | if (fi->fib_metrics != (u32 *) dst_default_metrics) |
| 215 | kfree(fi->fib_metrics); | 150 | kfree(fi->fib_metrics); |
| 216 | kfree(fi); | 151 | kfree(fi); |
| @@ -219,16 +154,16 @@ static void free_fib_info_rcu(struct rcu_head *head) | |||
| 219 | void free_fib_info(struct fib_info *fi) | 154 | void free_fib_info(struct fib_info *fi) |
| 220 | { | 155 | { |
| 221 | if (fi->fib_dead == 0) { | 156 | if (fi->fib_dead == 0) { |
| 222 | pr_warn("Freeing alive fib_info %p\n", fi); | 157 | pr_warning("Freeing alive fib_info %p\n", fi); |
| 223 | return; | 158 | return; |
| 224 | } | 159 | } |
| 225 | fib_info_cnt--; | ||
| 226 | #ifdef CONFIG_IP_ROUTE_CLASSID | ||
| 227 | change_nexthops(fi) { | 160 | change_nexthops(fi) { |
| 228 | if (nexthop_nh->nh_tclassid) | 161 | if (nexthop_nh->nh_dev) |
| 229 | fi->fib_net->ipv4.fib_num_tclassid_users--; | 162 | dev_put(nexthop_nh->nh_dev); |
| 163 | nexthop_nh->nh_dev = NULL; | ||
| 230 | } endfor_nexthops(fi); | 164 | } endfor_nexthops(fi); |
| 231 | #endif | 165 | fib_info_cnt--; |
| 166 | release_net(fi->fib_net); | ||
| 232 | call_rcu(&fi->rcu, free_fib_info_rcu); | 167 | call_rcu(&fi->rcu, free_fib_info_rcu); |
| 233 | } | 168 | } |
| 234 | 169 | ||
| @@ -314,7 +249,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) | |||
| 314 | nfi->fib_scope == fi->fib_scope && | 249 | nfi->fib_scope == fi->fib_scope && |
| 315 | nfi->fib_prefsrc == fi->fib_prefsrc && | 250 | nfi->fib_prefsrc == fi->fib_prefsrc && |
| 316 | nfi->fib_priority == fi->fib_priority && | 251 | nfi->fib_priority == fi->fib_priority && |
| 317 | nfi->fib_type == fi->fib_type && | ||
| 318 | memcmp(nfi->fib_metrics, fi->fib_metrics, | 252 | memcmp(nfi->fib_metrics, fi->fib_metrics, |
| 319 | sizeof(u32) * RTAX_MAX) == 0 && | 253 | sizeof(u32) * RTAX_MAX) == 0 && |
| 320 | ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && | 254 | ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && |
| @@ -392,7 +326,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, | |||
| 392 | if (skb == NULL) | 326 | if (skb == NULL) |
| 393 | goto errout; | 327 | goto errout; |
| 394 | 328 | ||
| 395 | err = fib_dump_info(skb, info->portid, seq, event, tb_id, | 329 | err = fib_dump_info(skb, info->pid, seq, event, tb_id, |
| 396 | fa->fa_type, key, dst_len, | 330 | fa->fa_type, key, dst_len, |
| 397 | fa->fa_tos, fa->fa_info, nlm_flags); | 331 | fa->fa_tos, fa->fa_info, nlm_flags); |
| 398 | if (err < 0) { | 332 | if (err < 0) { |
| @@ -401,7 +335,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, | |||
| 401 | kfree_skb(skb); | 335 | kfree_skb(skb); |
| 402 | goto errout; | 336 | goto errout; |
| 403 | } | 337 | } |
| 404 | rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, | 338 | rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, |
| 405 | info->nlh, GFP_KERNEL); | 339 | info->nlh, GFP_KERNEL); |
| 406 | return; | 340 | return; |
| 407 | errout: | 341 | errout: |
| @@ -488,8 +422,6 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, | |||
| 488 | #ifdef CONFIG_IP_ROUTE_CLASSID | 422 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 489 | nla = nla_find(attrs, attrlen, RTA_FLOW); | 423 | nla = nla_find(attrs, attrlen, RTA_FLOW); |
| 490 | nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; | 424 | nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; |
| 491 | if (nexthop_nh->nh_tclassid) | ||
| 492 | fi->fib_net->ipv4.fib_num_tclassid_users++; | ||
| 493 | #endif | 425 | #endif |
| 494 | } | 426 | } |
| 495 | 427 | ||
| @@ -803,7 +735,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
| 803 | unsigned int bytes; | 735 | unsigned int bytes; |
| 804 | 736 | ||
| 805 | if (!new_size) | 737 | if (!new_size) |
| 806 | new_size = 16; | 738 | new_size = 1; |
| 807 | bytes = new_size * sizeof(struct hlist_head *); | 739 | bytes = new_size * sizeof(struct hlist_head *); |
| 808 | new_info_hash = fib_info_hash_alloc(bytes); | 740 | new_info_hash = fib_info_hash_alloc(bytes); |
| 809 | new_laddrhash = fib_info_hash_alloc(bytes); | 741 | new_laddrhash = fib_info_hash_alloc(bytes); |
| @@ -834,14 +766,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
| 834 | fi->fib_flags = cfg->fc_flags; | 766 | fi->fib_flags = cfg->fc_flags; |
| 835 | fi->fib_priority = cfg->fc_priority; | 767 | fi->fib_priority = cfg->fc_priority; |
| 836 | fi->fib_prefsrc = cfg->fc_prefsrc; | 768 | fi->fib_prefsrc = cfg->fc_prefsrc; |
| 837 | fi->fib_type = cfg->fc_type; | ||
| 838 | 769 | ||
| 839 | fi->fib_nhs = nhs; | 770 | fi->fib_nhs = nhs; |
| 840 | change_nexthops(fi) { | 771 | change_nexthops(fi) { |
| 841 | nexthop_nh->nh_parent = fi; | 772 | nexthop_nh->nh_parent = fi; |
| 842 | nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); | ||
| 843 | if (!nexthop_nh->nh_pcpu_rth_output) | ||
| 844 | goto failure; | ||
| 845 | } endfor_nexthops(fi) | 773 | } endfor_nexthops(fi) |
| 846 | 774 | ||
| 847 | if (cfg->fc_mx) { | 775 | if (cfg->fc_mx) { |
| @@ -852,16 +780,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
| 852 | int type = nla_type(nla); | 780 | int type = nla_type(nla); |
| 853 | 781 | ||
| 854 | if (type) { | 782 | if (type) { |
| 855 | u32 val; | ||
| 856 | |||
| 857 | if (type > RTAX_MAX) | 783 | if (type > RTAX_MAX) |
| 858 | goto err_inval; | 784 | goto err_inval; |
| 859 | val = nla_get_u32(nla); | 785 | fi->fib_metrics[type - 1] = nla_get_u32(nla); |
| 860 | if (type == RTAX_ADVMSS && val > 65535 - 40) | ||
| 861 | val = 65535 - 40; | ||
| 862 | if (type == RTAX_MTU && val > 65535 - 15) | ||
| 863 | val = 65535 - 15; | ||
| 864 | fi->fib_metrics[type - 1] = val; | ||
| 865 | } | 786 | } |
| 866 | } | 787 | } |
| 867 | } | 788 | } |
| @@ -890,8 +811,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
| 890 | nh->nh_flags = cfg->fc_flags; | 811 | nh->nh_flags = cfg->fc_flags; |
| 891 | #ifdef CONFIG_IP_ROUTE_CLASSID | 812 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 892 | nh->nh_tclassid = cfg->fc_flow; | 813 | nh->nh_tclassid = cfg->fc_flow; |
| 893 | if (nh->nh_tclassid) | ||
| 894 | fi->fib_net->ipv4.fib_num_tclassid_users++; | ||
| 895 | #endif | 814 | #endif |
| 896 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 815 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
| 897 | nh->nh_weight = 1; | 816 | nh->nh_weight = 1; |
| @@ -993,14 +912,14 @@ failure: | |||
| 993 | return ERR_PTR(err); | 912 | return ERR_PTR(err); |
| 994 | } | 913 | } |
| 995 | 914 | ||
| 996 | int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, | 915 | int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, |
| 997 | u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, | 916 | u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, |
| 998 | struct fib_info *fi, unsigned int flags) | 917 | struct fib_info *fi, unsigned int flags) |
| 999 | { | 918 | { |
| 1000 | struct nlmsghdr *nlh; | 919 | struct nlmsghdr *nlh; |
| 1001 | struct rtmsg *rtm; | 920 | struct rtmsg *rtm; |
| 1002 | 921 | ||
| 1003 | nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); | 922 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags); |
| 1004 | if (nlh == NULL) | 923 | if (nlh == NULL) |
| 1005 | return -EMSGSIZE; | 924 | return -EMSGSIZE; |
| 1006 | 925 | ||
| @@ -1013,36 +932,33 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, | |||
| 1013 | rtm->rtm_table = tb_id; | 932 | rtm->rtm_table = tb_id; |
| 1014 | else | 933 | else |
| 1015 | rtm->rtm_table = RT_TABLE_COMPAT; | 934 | rtm->rtm_table = RT_TABLE_COMPAT; |
| 1016 | if (nla_put_u32(skb, RTA_TABLE, tb_id)) | 935 | NLA_PUT_U32(skb, RTA_TABLE, tb_id); |
| 1017 | goto nla_put_failure; | ||
| 1018 | rtm->rtm_type = type; | 936 | rtm->rtm_type = type; |
| 1019 | rtm->rtm_flags = fi->fib_flags; | 937 | rtm->rtm_flags = fi->fib_flags; |
| 1020 | rtm->rtm_scope = fi->fib_scope; | 938 | rtm->rtm_scope = fi->fib_scope; |
| 1021 | rtm->rtm_protocol = fi->fib_protocol; | 939 | rtm->rtm_protocol = fi->fib_protocol; |
| 1022 | 940 | ||
| 1023 | if (rtm->rtm_dst_len && | 941 | if (rtm->rtm_dst_len) |
| 1024 | nla_put_be32(skb, RTA_DST, dst)) | 942 | NLA_PUT_BE32(skb, RTA_DST, dst); |
| 1025 | goto nla_put_failure; | 943 | |
| 1026 | if (fi->fib_priority && | 944 | if (fi->fib_priority) |
| 1027 | nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) | 945 | NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority); |
| 1028 | goto nla_put_failure; | 946 | |
| 1029 | if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) | 947 | if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) |
| 1030 | goto nla_put_failure; | 948 | goto nla_put_failure; |
| 1031 | 949 | ||
| 1032 | if (fi->fib_prefsrc && | 950 | if (fi->fib_prefsrc) |
| 1033 | nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc)) | 951 | NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc); |
| 1034 | goto nla_put_failure; | 952 | |
| 1035 | if (fi->fib_nhs == 1) { | 953 | if (fi->fib_nhs == 1) { |
| 1036 | if (fi->fib_nh->nh_gw && | 954 | if (fi->fib_nh->nh_gw) |
| 1037 | nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) | 955 | NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw); |
| 1038 | goto nla_put_failure; | 956 | |
| 1039 | if (fi->fib_nh->nh_oif && | 957 | if (fi->fib_nh->nh_oif) |
| 1040 | nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) | 958 | NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); |
| 1041 | goto nla_put_failure; | ||
| 1042 | #ifdef CONFIG_IP_ROUTE_CLASSID | 959 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 1043 | if (fi->fib_nh[0].nh_tclassid && | 960 | if (fi->fib_nh[0].nh_tclassid) |
| 1044 | nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) | 961 | NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); |
| 1045 | goto nla_put_failure; | ||
| 1046 | #endif | 962 | #endif |
| 1047 | } | 963 | } |
| 1048 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 964 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
| @@ -1063,13 +979,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, | |||
| 1063 | rtnh->rtnh_hops = nh->nh_weight - 1; | 979 | rtnh->rtnh_hops = nh->nh_weight - 1; |
| 1064 | rtnh->rtnh_ifindex = nh->nh_oif; | 980 | rtnh->rtnh_ifindex = nh->nh_oif; |
| 1065 | 981 | ||
| 1066 | if (nh->nh_gw && | 982 | if (nh->nh_gw) |
| 1067 | nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw)) | 983 | NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); |
| 1068 | goto nla_put_failure; | ||
| 1069 | #ifdef CONFIG_IP_ROUTE_CLASSID | 984 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 1070 | if (nh->nh_tclassid && | 985 | if (nh->nh_tclassid) |
| 1071 | nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) | 986 | NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); |
| 1072 | goto nla_put_failure; | ||
| 1073 | #endif | 987 | #endif |
| 1074 | /* length of rtnetlink header + attributes */ | 988 | /* length of rtnetlink header + attributes */ |
| 1075 | rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; | 989 | rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; |
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 31d771ca9a7..de9e2978476 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #define VERSION "0.409" | 51 | #define VERSION "0.409" |
| 52 | 52 | ||
| 53 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
| 54 | #include <asm/system.h> | ||
| 54 | #include <linux/bitops.h> | 55 | #include <linux/bitops.h> |
| 55 | #include <linux/types.h> | 56 | #include <linux/types.h> |
| 56 | #include <linux/kernel.h> | 57 | #include <linux/kernel.h> |
| @@ -72,7 +73,6 @@ | |||
| 72 | #include <linux/list.h> | 73 | #include <linux/list.h> |
| 73 | #include <linux/slab.h> | 74 | #include <linux/slab.h> |
| 74 | #include <linux/prefetch.h> | 75 | #include <linux/prefetch.h> |
| 75 | #include <linux/export.h> | ||
| 76 | #include <net/net_namespace.h> | 76 | #include <net/net_namespace.h> |
| 77 | #include <net/ip.h> | 77 | #include <net/ip.h> |
| 78 | #include <net/protocol.h> | 78 | #include <net/protocol.h> |
| @@ -159,6 +159,7 @@ struct trie { | |||
| 159 | #endif | 159 | #endif |
| 160 | }; | 160 | }; |
| 161 | 161 | ||
| 162 | static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); | ||
| 162 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, | 163 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, |
| 163 | int wasfull); | 164 | int wasfull); |
| 164 | static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); | 165 | static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); |
| @@ -367,7 +368,7 @@ static void __leaf_free_rcu(struct rcu_head *head) | |||
| 367 | 368 | ||
| 368 | static inline void free_leaf(struct leaf *l) | 369 | static inline void free_leaf(struct leaf *l) |
| 369 | { | 370 | { |
| 370 | call_rcu(&l->rcu, __leaf_free_rcu); | 371 | call_rcu_bh(&l->rcu, __leaf_free_rcu); |
| 371 | } | 372 | } |
| 372 | 373 | ||
| 373 | static inline void free_leaf_info(struct leaf_info *leaf) | 374 | static inline void free_leaf_info(struct leaf_info *leaf) |
| @@ -472,7 +473,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) | |||
| 472 | } | 473 | } |
| 473 | 474 | ||
| 474 | pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), | 475 | pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), |
| 475 | sizeof(struct rt_trie_node *) << bits); | 476 | sizeof(struct rt_trie_node) << bits); |
| 476 | return tn; | 477 | return tn; |
| 477 | } | 478 | } |
| 478 | 479 | ||
| @@ -489,7 +490,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node * | |||
| 489 | return ((struct tnode *) n)->pos == tn->pos + tn->bits; | 490 | return ((struct tnode *) n)->pos == tn->pos + tn->bits; |
| 490 | } | 491 | } |
| 491 | 492 | ||
| 492 | static inline void put_child(struct tnode *tn, int i, | 493 | static inline void put_child(struct trie *t, struct tnode *tn, int i, |
| 493 | struct rt_trie_node *n) | 494 | struct rt_trie_node *n) |
| 494 | { | 495 | { |
| 495 | tnode_put_child_reorg(tn, i, n, -1); | 496 | tnode_put_child_reorg(tn, i, n, -1); |
| @@ -753,8 +754,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) | |||
| 753 | goto nomem; | 754 | goto nomem; |
| 754 | } | 755 | } |
| 755 | 756 | ||
| 756 | put_child(tn, 2*i, (struct rt_trie_node *) left); | 757 | put_child(t, tn, 2*i, (struct rt_trie_node *) left); |
| 757 | put_child(tn, 2*i+1, (struct rt_trie_node *) right); | 758 | put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); |
| 758 | } | 759 | } |
| 759 | } | 760 | } |
| 760 | 761 | ||
| @@ -775,9 +776,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) | |||
| 775 | if (tkey_extract_bits(node->key, | 776 | if (tkey_extract_bits(node->key, |
| 776 | oldtnode->pos + oldtnode->bits, | 777 | oldtnode->pos + oldtnode->bits, |
| 777 | 1) == 0) | 778 | 1) == 0) |
| 778 | put_child(tn, 2*i, node); | 779 | put_child(t, tn, 2*i, node); |
| 779 | else | 780 | else |
| 780 | put_child(tn, 2*i+1, node); | 781 | put_child(t, tn, 2*i+1, node); |
| 781 | continue; | 782 | continue; |
| 782 | } | 783 | } |
| 783 | 784 | ||
| @@ -785,8 +786,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) | |||
| 785 | inode = (struct tnode *) node; | 786 | inode = (struct tnode *) node; |
| 786 | 787 | ||
| 787 | if (inode->bits == 1) { | 788 | if (inode->bits == 1) { |
| 788 | put_child(tn, 2*i, rtnl_dereference(inode->child[0])); | 789 | put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); |
| 789 | put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); | 790 | put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); |
| 790 | 791 | ||
| 791 | tnode_free_safe(inode); | 792 | tnode_free_safe(inode); |
| 792 | continue; | 793 | continue; |
| @@ -816,22 +817,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) | |||
| 816 | */ | 817 | */ |
| 817 | 818 | ||
| 818 | left = (struct tnode *) tnode_get_child(tn, 2*i); | 819 | left = (struct tnode *) tnode_get_child(tn, 2*i); |
| 819 | put_child(tn, 2*i, NULL); | 820 | put_child(t, tn, 2*i, NULL); |
| 820 | 821 | ||
| 821 | BUG_ON(!left); | 822 | BUG_ON(!left); |
| 822 | 823 | ||
| 823 | right = (struct tnode *) tnode_get_child(tn, 2*i+1); | 824 | right = (struct tnode *) tnode_get_child(tn, 2*i+1); |
| 824 | put_child(tn, 2*i+1, NULL); | 825 | put_child(t, tn, 2*i+1, NULL); |
| 825 | 826 | ||
| 826 | BUG_ON(!right); | 827 | BUG_ON(!right); |
| 827 | 828 | ||
| 828 | size = tnode_child_length(left); | 829 | size = tnode_child_length(left); |
| 829 | for (j = 0; j < size; j++) { | 830 | for (j = 0; j < size; j++) { |
| 830 | put_child(left, j, rtnl_dereference(inode->child[j])); | 831 | put_child(t, left, j, rtnl_dereference(inode->child[j])); |
| 831 | put_child(right, j, rtnl_dereference(inode->child[j + size])); | 832 | put_child(t, right, j, rtnl_dereference(inode->child[j + size])); |
| 832 | } | 833 | } |
| 833 | put_child(tn, 2*i, resize(t, left)); | 834 | put_child(t, tn, 2*i, resize(t, left)); |
| 834 | put_child(tn, 2*i+1, resize(t, right)); | 835 | put_child(t, tn, 2*i+1, resize(t, right)); |
| 835 | 836 | ||
| 836 | tnode_free_safe(inode); | 837 | tnode_free_safe(inode); |
| 837 | } | 838 | } |
| @@ -876,7 +877,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) | |||
| 876 | if (!newn) | 877 | if (!newn) |
| 877 | goto nomem; | 878 | goto nomem; |
| 878 | 879 | ||
| 879 | put_child(tn, i/2, (struct rt_trie_node *)newn); | 880 | put_child(t, tn, i/2, (struct rt_trie_node *)newn); |
| 880 | } | 881 | } |
| 881 | 882 | ||
| 882 | } | 883 | } |
| @@ -891,21 +892,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) | |||
| 891 | if (left == NULL) { | 892 | if (left == NULL) { |
| 892 | if (right == NULL) /* Both are empty */ | 893 | if (right == NULL) /* Both are empty */ |
| 893 | continue; | 894 | continue; |
| 894 | put_child(tn, i/2, right); | 895 | put_child(t, tn, i/2, right); |
| 895 | continue; | 896 | continue; |
| 896 | } | 897 | } |
| 897 | 898 | ||
| 898 | if (right == NULL) { | 899 | if (right == NULL) { |
| 899 | put_child(tn, i/2, left); | 900 | put_child(t, tn, i/2, left); |
| 900 | continue; | 901 | continue; |
| 901 | } | 902 | } |
| 902 | 903 | ||
| 903 | /* Two nonempty children */ | 904 | /* Two nonempty children */ |
| 904 | newBinNode = (struct tnode *) tnode_get_child(tn, i/2); | 905 | newBinNode = (struct tnode *) tnode_get_child(tn, i/2); |
| 905 | put_child(tn, i/2, NULL); | 906 | put_child(t, tn, i/2, NULL); |
| 906 | put_child(newBinNode, 0, left); | 907 | put_child(t, newBinNode, 0, left); |
| 907 | put_child(newBinNode, 1, right); | 908 | put_child(t, newBinNode, 1, right); |
| 908 | put_child(tn, i/2, resize(t, newBinNode)); | 909 | put_child(t, tn, i/2, resize(t, newBinNode)); |
| 909 | } | 910 | } |
| 910 | tnode_free_safe(oldtnode); | 911 | tnode_free_safe(oldtnode); |
| 911 | return tn; | 912 | return tn; |
| @@ -1006,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) | |||
| 1006 | while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { | 1007 | while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { |
| 1007 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1008 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
| 1008 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); | 1009 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); |
| 1009 | tn = (struct tnode *)resize(t, tn); | 1010 | tn = (struct tnode *) resize(t, (struct tnode *)tn); |
| 1010 | 1011 | ||
| 1011 | tnode_put_child_reorg(tp, cindex, | 1012 | tnode_put_child_reorg((struct tnode *)tp, cindex, |
| 1012 | (struct rt_trie_node *)tn, wasfull); | 1013 | (struct rt_trie_node *)tn, wasfull); |
| 1013 | 1014 | ||
| 1014 | tp = node_parent((struct rt_trie_node *) tn); | 1015 | tp = node_parent((struct rt_trie_node *) tn); |
| @@ -1023,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) | |||
| 1023 | 1024 | ||
| 1024 | /* Handle last (top) tnode */ | 1025 | /* Handle last (top) tnode */ |
| 1025 | if (IS_TNODE(tn)) | 1026 | if (IS_TNODE(tn)) |
| 1026 | tn = (struct tnode *)resize(t, tn); | 1027 | tn = (struct tnode *)resize(t, (struct tnode *)tn); |
| 1027 | 1028 | ||
| 1028 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); | 1029 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); |
| 1029 | tnode_free_flush(); | 1030 | tnode_free_flush(); |
| @@ -1124,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
| 1124 | node_set_parent((struct rt_trie_node *)l, tp); | 1125 | node_set_parent((struct rt_trie_node *)l, tp); |
| 1125 | 1126 | ||
| 1126 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1127 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
| 1127 | put_child(tp, cindex, (struct rt_trie_node *)l); | 1128 | put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); |
| 1128 | } else { | 1129 | } else { |
| 1129 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ | 1130 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ |
| 1130 | /* | 1131 | /* |
| @@ -1154,12 +1155,13 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
| 1154 | node_set_parent((struct rt_trie_node *)tn, tp); | 1155 | node_set_parent((struct rt_trie_node *)tn, tp); |
| 1155 | 1156 | ||
| 1156 | missbit = tkey_extract_bits(key, newpos, 1); | 1157 | missbit = tkey_extract_bits(key, newpos, 1); |
| 1157 | put_child(tn, missbit, (struct rt_trie_node *)l); | 1158 | put_child(t, tn, missbit, (struct rt_trie_node *)l); |
| 1158 | put_child(tn, 1-missbit, n); | 1159 | put_child(t, tn, 1-missbit, n); |
| 1159 | 1160 | ||
| 1160 | if (tp) { | 1161 | if (tp) { |
| 1161 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1162 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); |
| 1162 | put_child(tp, cindex, (struct rt_trie_node *)tn); | 1163 | put_child(t, (struct tnode *)tp, cindex, |
| 1164 | (struct rt_trie_node *)tn); | ||
| 1163 | } else { | 1165 | } else { |
| 1164 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); | 1166 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); |
| 1165 | tp = tn; | 1167 | tp = tn; |
| @@ -1167,8 +1169,9 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | |||
| 1167 | } | 1169 | } |
| 1168 | 1170 | ||
| 1169 | if (tp && tp->pos + tp->bits > 32) | 1171 | if (tp && tp->pos + tp->bits > 32) |
| 1170 | pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", | 1172 | pr_warning("fib_trie" |
| 1171 | tp, tp->pos, tp->bits, key, plen); | 1173 | " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", |
| 1174 | tp, tp->pos, tp->bits, key, plen); | ||
| 1172 | 1175 | ||
| 1173 | /* Rebalance the trie */ | 1176 | /* Rebalance the trie */ |
| 1174 | 1177 | ||
| @@ -1286,7 +1289,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) | |||
| 1286 | 1289 | ||
| 1287 | fib_release_info(fi_drop); | 1290 | fib_release_info(fi_drop); |
| 1288 | if (state & FA_S_ACCESSED) | 1291 | if (state & FA_S_ACCESSED) |
| 1289 | rt_cache_flush(cfg->fc_nlinfo.nl_net); | 1292 | rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); |
| 1290 | rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, | 1293 | rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, |
| 1291 | tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); | 1294 | tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); |
| 1292 | 1295 | ||
| @@ -1333,7 +1336,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) | |||
| 1333 | list_add_tail_rcu(&new_fa->fa_list, | 1336 | list_add_tail_rcu(&new_fa->fa_list, |
| 1334 | (fa ? &fa->fa_list : fa_head)); | 1337 | (fa ? &fa->fa_list : fa_head)); |
| 1335 | 1338 | ||
| 1336 | rt_cache_flush(cfg->fc_nlinfo.nl_net); | 1339 | rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); |
| 1337 | rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, | 1340 | rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, |
| 1338 | &cfg->fc_nlinfo, 0); | 1341 | &cfg->fc_nlinfo, 0); |
| 1339 | succeeded: | 1342 | succeeded: |
| @@ -1368,8 +1371,6 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, | |||
| 1368 | 1371 | ||
| 1369 | if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) | 1372 | if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) |
| 1370 | continue; | 1373 | continue; |
| 1371 | if (fi->fib_dead) | ||
| 1372 | continue; | ||
| 1373 | if (fa->fa_info->fib_scope < flp->flowi4_scope) | 1374 | if (fa->fa_info->fib_scope < flp->flowi4_scope) |
| 1374 | continue; | 1375 | continue; |
| 1375 | fib_alias_accessed(fa); | 1376 | fib_alias_accessed(fa); |
| @@ -1550,8 +1551,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, | |||
| 1550 | * state.directly. | 1551 | * state.directly. |
| 1551 | */ | 1552 | */ |
| 1552 | if (pref_mismatch) { | 1553 | if (pref_mismatch) { |
| 1553 | /* fls(x) = __fls(x) + 1 */ | 1554 | int mp = KEYLENGTH - fls(pref_mismatch); |
| 1554 | int mp = KEYLENGTH - __fls(pref_mismatch) - 1; | ||
| 1555 | 1555 | ||
| 1556 | if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) | 1556 | if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) |
| 1557 | goto backtrace; | 1557 | goto backtrace; |
| @@ -1606,7 +1606,6 @@ found: | |||
| 1606 | rcu_read_unlock(); | 1606 | rcu_read_unlock(); |
| 1607 | return ret; | 1607 | return ret; |
| 1608 | } | 1608 | } |
| 1609 | EXPORT_SYMBOL_GPL(fib_table_lookup); | ||
| 1610 | 1609 | ||
| 1611 | /* | 1610 | /* |
| 1612 | * Remove the leaf and return parent. | 1611 | * Remove the leaf and return parent. |
| @@ -1619,10 +1618,10 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) | |||
| 1619 | 1618 | ||
| 1620 | if (tp) { | 1619 | if (tp) { |
| 1621 | t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); | 1620 | t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); |
| 1622 | put_child(tp, cindex, NULL); | 1621 | put_child(t, (struct tnode *)tp, cindex, NULL); |
| 1623 | trie_rebalance(t, tp); | 1622 | trie_rebalance(t, tp); |
| 1624 | } else | 1623 | } else |
| 1625 | RCU_INIT_POINTER(t->trie, NULL); | 1624 | rcu_assign_pointer(t->trie, NULL); |
| 1626 | 1625 | ||
| 1627 | free_leaf(l); | 1626 | free_leaf(l); |
| 1628 | } | 1627 | } |
| @@ -1656,12 +1655,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) | |||
| 1656 | if (!l) | 1655 | if (!l) |
| 1657 | return -ESRCH; | 1656 | return -ESRCH; |
| 1658 | 1657 | ||
| 1659 | li = find_leaf_info(l, plen); | 1658 | fa_head = get_fa_head(l, plen); |
| 1660 | |||
| 1661 | if (!li) | ||
| 1662 | return -ESRCH; | ||
| 1663 | |||
| 1664 | fa_head = &li->falh; | ||
| 1665 | fa = fib_find_alias(fa_head, tos, 0); | 1659 | fa = fib_find_alias(fa_head, tos, 0); |
| 1666 | 1660 | ||
| 1667 | if (!fa) | 1661 | if (!fa) |
| @@ -1697,6 +1691,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) | |||
| 1697 | rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, | 1691 | rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, |
| 1698 | &cfg->fc_nlinfo, 0); | 1692 | &cfg->fc_nlinfo, 0); |
| 1699 | 1693 | ||
| 1694 | l = fib_find_node(t, key); | ||
| 1695 | li = find_leaf_info(l, plen); | ||
| 1696 | |||
| 1700 | list_del_rcu(&fa->fa_list); | 1697 | list_del_rcu(&fa->fa_list); |
| 1701 | 1698 | ||
| 1702 | if (!plen) | 1699 | if (!plen) |
| @@ -1711,7 +1708,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) | |||
| 1711 | trie_leaf_remove(t, l); | 1708 | trie_leaf_remove(t, l); |
| 1712 | 1709 | ||
| 1713 | if (fa->fa_state & FA_S_ACCESSED) | 1710 | if (fa->fa_state & FA_S_ACCESSED) |
| 1714 | rt_cache_flush(cfg->fc_nlinfo.nl_net); | 1711 | rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); |
| 1715 | 1712 | ||
| 1716 | fib_release_info(fa->fa_info); | 1713 | fib_release_info(fa->fa_info); |
| 1717 | alias_free_mem_rcu(fa); | 1714 | alias_free_mem_rcu(fa); |
| @@ -1873,7 +1870,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, | |||
| 1873 | continue; | 1870 | continue; |
| 1874 | } | 1871 | } |
| 1875 | 1872 | ||
| 1876 | if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, | 1873 | if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, |
| 1877 | cb->nlh->nlmsg_seq, | 1874 | cb->nlh->nlmsg_seq, |
| 1878 | RTM_NEWROUTE, | 1875 | RTM_NEWROUTE, |
| 1879 | tb->tb_id, | 1876 | tb->tb_id, |
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index 42a491055c7..dbfc21de347 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c | |||
| @@ -10,8 +10,6 @@ | |||
| 10 | * | 10 | * |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 14 | |||
| 15 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 16 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
| 17 | #include <linux/kmod.h> | 15 | #include <linux/kmod.h> |
| @@ -36,7 +34,7 @@ int gre_add_protocol(const struct gre_protocol *proto, u8 version) | |||
| 36 | if (gre_proto[version]) | 34 | if (gre_proto[version]) |
| 37 | goto err_out_unlock; | 35 | goto err_out_unlock; |
| 38 | 36 | ||
| 39 | RCU_INIT_POINTER(gre_proto[version], proto); | 37 | rcu_assign_pointer(gre_proto[version], proto); |
| 40 | spin_unlock(&gre_proto_lock); | 38 | spin_unlock(&gre_proto_lock); |
| 41 | return 0; | 39 | return 0; |
| 42 | 40 | ||
| @@ -56,7 +54,7 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) | |||
| 56 | if (rcu_dereference_protected(gre_proto[version], | 54 | if (rcu_dereference_protected(gre_proto[version], |
| 57 | lockdep_is_held(&gre_proto_lock)) != proto) | 55 | lockdep_is_held(&gre_proto_lock)) != proto) |
| 58 | goto err_out_unlock; | 56 | goto err_out_unlock; |
| 59 | RCU_INIT_POINTER(gre_proto[version], NULL); | 57 | rcu_assign_pointer(gre_proto[version], NULL); |
| 60 | spin_unlock(&gre_proto_lock); | 58 | spin_unlock(&gre_proto_lock); |
| 61 | synchronize_rcu(); | 59 | synchronize_rcu(); |
| 62 | return 0; | 60 | return 0; |
| @@ -120,10 +118,10 @@ static const struct net_protocol net_gre_protocol = { | |||
| 120 | 118 | ||
| 121 | static int __init gre_init(void) | 119 | static int __init gre_init(void) |
| 122 | { | 120 | { |
| 123 | pr_info("GRE over IPv4 demultiplexor driver\n"); | 121 | pr_info("GRE over IPv4 demultiplexor driver"); |
| 124 | 122 | ||
| 125 | if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { | 123 | if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { |
| 126 | pr_err("can't add protocol\n"); | 124 | pr_err("gre: can't add protocol\n"); |
| 127 | return -EAGAIN; | 125 | return -EAGAIN; |
| 128 | } | 126 | } |
| 129 | 127 | ||
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 17ff9fd7cdd..23ef31baa1a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
| @@ -62,8 +62,6 @@ | |||
| 62 | * | 62 | * |
| 63 | */ | 63 | */ |
| 64 | 64 | ||
| 65 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 66 | |||
| 67 | #include <linux/module.h> | 65 | #include <linux/module.h> |
| 68 | #include <linux/types.h> | 66 | #include <linux/types.h> |
| 69 | #include <linux/jiffies.h> | 67 | #include <linux/jiffies.h> |
| @@ -91,11 +89,11 @@ | |||
| 91 | #include <linux/errno.h> | 89 | #include <linux/errno.h> |
| 92 | #include <linux/timer.h> | 90 | #include <linux/timer.h> |
| 93 | #include <linux/init.h> | 91 | #include <linux/init.h> |
| 92 | #include <asm/system.h> | ||
| 94 | #include <asm/uaccess.h> | 93 | #include <asm/uaccess.h> |
| 95 | #include <net/checksum.h> | 94 | #include <net/checksum.h> |
| 96 | #include <net/xfrm.h> | 95 | #include <net/xfrm.h> |
| 97 | #include <net/inet_common.h> | 96 | #include <net/inet_common.h> |
| 98 | #include <net/ip_fib.h> | ||
| 99 | 97 | ||
| 100 | /* | 98 | /* |
| 101 | * Build xmit assembly blocks | 99 | * Build xmit assembly blocks |
| @@ -254,11 +252,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, | |||
| 254 | 252 | ||
| 255 | /* Limit if icmp type is enabled in ratemask. */ | 253 | /* Limit if icmp type is enabled in ratemask. */ |
| 256 | if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { | 254 | if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { |
| 257 | struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); | 255 | if (!rt->peer) |
| 258 | rc = inet_peer_xrlim_allow(peer, | 256 | rt_bind_peer(rt, fl4->daddr, 1); |
| 257 | rc = inet_peer_xrlim_allow(rt->peer, | ||
| 259 | net->ipv4.sysctl_icmp_ratelimit); | 258 | net->ipv4.sysctl_icmp_ratelimit); |
| 260 | if (peer) | ||
| 261 | inet_putpeer(peer); | ||
| 262 | } | 259 | } |
| 263 | out: | 260 | out: |
| 264 | return rc; | 261 | return rc; |
| @@ -336,7 +333,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
| 336 | struct flowi4 fl4; | 333 | struct flowi4 fl4; |
| 337 | struct sock *sk; | 334 | struct sock *sk; |
| 338 | struct inet_sock *inet; | 335 | struct inet_sock *inet; |
| 339 | __be32 daddr, saddr; | 336 | __be32 daddr; |
| 340 | 337 | ||
| 341 | if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) | 338 | if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) |
| 342 | return; | 339 | return; |
| @@ -350,7 +347,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
| 350 | 347 | ||
| 351 | inet->tos = ip_hdr(skb)->tos; | 348 | inet->tos = ip_hdr(skb)->tos; |
| 352 | daddr = ipc.addr = ip_hdr(skb)->saddr; | 349 | daddr = ipc.addr = ip_hdr(skb)->saddr; |
| 353 | saddr = fib_compute_spec_dst(skb); | ||
| 354 | ipc.opt = NULL; | 350 | ipc.opt = NULL; |
| 355 | ipc.tx_flags = 0; | 351 | ipc.tx_flags = 0; |
| 356 | if (icmp_param->replyopts.opt.opt.optlen) { | 352 | if (icmp_param->replyopts.opt.opt.optlen) { |
| @@ -360,7 +356,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) | |||
| 360 | } | 356 | } |
| 361 | memset(&fl4, 0, sizeof(fl4)); | 357 | memset(&fl4, 0, sizeof(fl4)); |
| 362 | fl4.daddr = daddr; | 358 | fl4.daddr = daddr; |
| 363 | fl4.saddr = saddr; | 359 | fl4.saddr = rt->rt_spec_dst; |
| 364 | fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); | 360 | fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); |
| 365 | fl4.flowi4_proto = IPPROTO_ICMP; | 361 | fl4.flowi4_proto = IPPROTO_ICMP; |
| 366 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); | 362 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); |
| @@ -572,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) | |||
| 572 | rcu_read_lock(); | 568 | rcu_read_lock(); |
| 573 | if (rt_is_input_route(rt) && | 569 | if (rt_is_input_route(rt) && |
| 574 | net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) | 570 | net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) |
| 575 | dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); | 571 | dev = dev_get_by_index_rcu(net, rt->rt_iif); |
| 576 | 572 | ||
| 577 | if (dev) | 573 | if (dev) |
| 578 | saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); | 574 | saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); |
| @@ -635,27 +631,6 @@ out:; | |||
| 635 | EXPORT_SYMBOL(icmp_send); | 631 | EXPORT_SYMBOL(icmp_send); |
| 636 | 632 | ||
| 637 | 633 | ||
| 638 | static void icmp_socket_deliver(struct sk_buff *skb, u32 info) | ||
| 639 | { | ||
| 640 | const struct iphdr *iph = (const struct iphdr *) skb->data; | ||
| 641 | const struct net_protocol *ipprot; | ||
| 642 | int protocol = iph->protocol; | ||
| 643 | |||
| 644 | /* Checkin full IP header plus 8 bytes of protocol to | ||
| 645 | * avoid additional coding at protocol handlers. | ||
| 646 | */ | ||
| 647 | if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) | ||
| 648 | return; | ||
| 649 | |||
| 650 | raw_icmp_error(skb, protocol, info); | ||
| 651 | |||
| 652 | rcu_read_lock(); | ||
| 653 | ipprot = rcu_dereference(inet_protos[protocol]); | ||
| 654 | if (ipprot && ipprot->err_handler) | ||
| 655 | ipprot->err_handler(skb, info); | ||
| 656 | rcu_read_unlock(); | ||
| 657 | } | ||
| 658 | |||
| 659 | /* | 634 | /* |
| 660 | * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. | 635 | * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. |
| 661 | */ | 636 | */ |
| @@ -664,8 +639,10 @@ static void icmp_unreach(struct sk_buff *skb) | |||
| 664 | { | 639 | { |
| 665 | const struct iphdr *iph; | 640 | const struct iphdr *iph; |
| 666 | struct icmphdr *icmph; | 641 | struct icmphdr *icmph; |
| 667 | struct net *net; | 642 | int hash, protocol; |
| 643 | const struct net_protocol *ipprot; | ||
| 668 | u32 info = 0; | 644 | u32 info = 0; |
| 645 | struct net *net; | ||
| 669 | 646 | ||
| 670 | net = dev_net(skb_dst(skb)->dev); | 647 | net = dev_net(skb_dst(skb)->dev); |
| 671 | 648 | ||
| @@ -693,16 +670,18 @@ static void icmp_unreach(struct sk_buff *skb) | |||
| 693 | break; | 670 | break; |
| 694 | case ICMP_FRAG_NEEDED: | 671 | case ICMP_FRAG_NEEDED: |
| 695 | if (ipv4_config.no_pmtu_disc) { | 672 | if (ipv4_config.no_pmtu_disc) { |
| 696 | LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), | 673 | LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n", |
| 697 | &iph->daddr); | 674 | &iph->daddr); |
| 698 | } else { | 675 | } else { |
| 699 | info = ntohs(icmph->un.frag.mtu); | 676 | info = ip_rt_frag_needed(net, iph, |
| 677 | ntohs(icmph->un.frag.mtu), | ||
| 678 | skb->dev); | ||
| 700 | if (!info) | 679 | if (!info) |
| 701 | goto out; | 680 | goto out; |
| 702 | } | 681 | } |
| 703 | break; | 682 | break; |
| 704 | case ICMP_SR_FAILED: | 683 | case ICMP_SR_FAILED: |
| 705 | LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"), | 684 | LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n", |
| 706 | &iph->daddr); | 685 | &iph->daddr); |
| 707 | break; | 686 | break; |
| 708 | default: | 687 | default: |
| @@ -733,14 +712,37 @@ static void icmp_unreach(struct sk_buff *skb) | |||
| 733 | 712 | ||
| 734 | if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && | 713 | if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && |
| 735 | inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { | 714 | inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { |
| 736 | net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", | 715 | if (net_ratelimit()) |
| 737 | &ip_hdr(skb)->saddr, | 716 | printk(KERN_WARNING "%pI4 sent an invalid ICMP " |
| 738 | icmph->type, icmph->code, | 717 | "type %u, code %u " |
| 739 | &iph->daddr, skb->dev->name); | 718 | "error to a broadcast: %pI4 on %s\n", |
| 719 | &ip_hdr(skb)->saddr, | ||
| 720 | icmph->type, icmph->code, | ||
| 721 | &iph->daddr, | ||
| 722 | skb->dev->name); | ||
| 740 | goto out; | 723 | goto out; |
| 741 | } | 724 | } |
| 742 | 725 | ||
| 743 | icmp_socket_deliver(skb, info); | 726 | /* Checkin full IP header plus 8 bytes of protocol to |
| 727 | * avoid additional coding at protocol handlers. | ||
| 728 | */ | ||
| 729 | if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) | ||
| 730 | goto out; | ||
| 731 | |||
| 732 | iph = (const struct iphdr *)skb->data; | ||
| 733 | protocol = iph->protocol; | ||
| 734 | |||
| 735 | /* | ||
| 736 | * Deliver ICMP message to raw sockets. Pretty useless feature? | ||
| 737 | */ | ||
| 738 | raw_icmp_error(skb, protocol, info); | ||
| 739 | |||
| 740 | hash = protocol & (MAX_INET_PROTOS - 1); | ||
| 741 | rcu_read_lock(); | ||
| 742 | ipprot = rcu_dereference(inet_protos[hash]); | ||
| 743 | if (ipprot && ipprot->err_handler) | ||
| 744 | ipprot->err_handler(skb, info); | ||
| 745 | rcu_read_unlock(); | ||
| 744 | 746 | ||
| 745 | out: | 747 | out: |
| 746 | return; | 748 | return; |
| @@ -756,15 +758,46 @@ out_err: | |||
| 756 | 758 | ||
| 757 | static void icmp_redirect(struct sk_buff *skb) | 759 | static void icmp_redirect(struct sk_buff *skb) |
| 758 | { | 760 | { |
| 759 | if (skb->len < sizeof(struct iphdr)) { | 761 | const struct iphdr *iph; |
| 760 | ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); | 762 | |
| 761 | return; | 763 | if (skb->len < sizeof(struct iphdr)) |
| 762 | } | 764 | goto out_err; |
| 763 | 765 | ||
| 766 | /* | ||
| 767 | * Get the copied header of the packet that caused the redirect | ||
| 768 | */ | ||
| 764 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | 769 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) |
| 765 | return; | 770 | goto out; |
| 771 | |||
| 772 | iph = (const struct iphdr *)skb->data; | ||
| 766 | 773 | ||
| 767 | icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); | 774 | switch (icmp_hdr(skb)->code & 7) { |
| 775 | case ICMP_REDIR_NET: | ||
| 776 | case ICMP_REDIR_NETTOS: | ||
| 777 | /* | ||
| 778 | * As per RFC recommendations now handle it as a host redirect. | ||
| 779 | */ | ||
| 780 | case ICMP_REDIR_HOST: | ||
| 781 | case ICMP_REDIR_HOSTTOS: | ||
| 782 | ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, | ||
| 783 | icmp_hdr(skb)->un.gateway, | ||
| 784 | iph->saddr, skb->dev); | ||
| 785 | break; | ||
| 786 | } | ||
| 787 | |||
| 788 | /* Ping wants to see redirects. | ||
| 789 | * Let's pretend they are errors of sorts... */ | ||
| 790 | if (iph->protocol == IPPROTO_ICMP && | ||
| 791 | iph->ihl >= 5 && | ||
| 792 | pskb_may_pull(skb, (iph->ihl<<2)+8)) { | ||
| 793 | ping_err(skb, icmp_hdr(skb)->un.gateway); | ||
| 794 | } | ||
| 795 | |||
| 796 | out: | ||
| 797 | return; | ||
| 798 | out_err: | ||
| 799 | ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); | ||
| 800 | goto out; | ||
| 768 | } | 801 | } |
| 769 | 802 | ||
| 770 | /* | 803 | /* |
| @@ -838,6 +871,87 @@ out_err: | |||
| 838 | goto out; | 871 | goto out; |
| 839 | } | 872 | } |
| 840 | 873 | ||
| 874 | |||
| 875 | /* | ||
| 876 | * Handle ICMP_ADDRESS_MASK requests. (RFC950) | ||
| 877 | * | ||
| 878 | * RFC1122 (3.2.2.9). A host MUST only send replies to | ||
| 879 | * ADDRESS_MASK requests if it's been configured as an address mask | ||
| 880 | * agent. Receiving a request doesn't constitute implicit permission to | ||
| 881 | * act as one. Of course, implementing this correctly requires (SHOULD) | ||
| 882 | * a way to turn the functionality on and off. Another one for sysctl(), | ||
| 883 | * I guess. -- MS | ||
| 884 | * | ||
| 885 | * RFC1812 (4.3.3.9). A router MUST implement it. | ||
| 886 | * A router SHOULD have switch turning it on/off. | ||
| 887 | * This switch MUST be ON by default. | ||
| 888 | * | ||
| 889 | * Gratuitous replies, zero-source replies are not implemented, | ||
| 890 | * that complies with RFC. DO NOT implement them!!! All the idea | ||
| 891 | * of broadcast addrmask replies as specified in RFC950 is broken. | ||
| 892 | * The problem is that it is not uncommon to have several prefixes | ||
| 893 | * on one physical interface. Moreover, addrmask agent can even be | ||
| 894 | * not aware of existing another prefixes. | ||
| 895 | * If source is zero, addrmask agent cannot choose correct prefix. | ||
| 896 | * Gratuitous mask announcements suffer from the same problem. | ||
| 897 | * RFC1812 explains it, but still allows to use ADDRMASK, | ||
| 898 | * that is pretty silly. --ANK | ||
| 899 | * | ||
| 900 | * All these rules are so bizarre, that I removed kernel addrmask | ||
| 901 | * support at all. It is wrong, it is obsolete, nobody uses it in | ||
| 902 | * any case. --ANK | ||
| 903 | * | ||
| 904 | * Furthermore you can do it with a usermode address agent program | ||
| 905 | * anyway... | ||
| 906 | */ | ||
| 907 | |||
| 908 | static void icmp_address(struct sk_buff *skb) | ||
| 909 | { | ||
| 910 | #if 0 | ||
| 911 | if (net_ratelimit()) | ||
| 912 | printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); | ||
| 913 | #endif | ||
| 914 | } | ||
| 915 | |||
| 916 | /* | ||
| 917 | * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain | ||
| 918 | * loudly if an inconsistency is found. | ||
| 919 | * called with rcu_read_lock() | ||
| 920 | */ | ||
| 921 | |||
| 922 | static void icmp_address_reply(struct sk_buff *skb) | ||
| 923 | { | ||
| 924 | struct rtable *rt = skb_rtable(skb); | ||
| 925 | struct net_device *dev = skb->dev; | ||
| 926 | struct in_device *in_dev; | ||
| 927 | struct in_ifaddr *ifa; | ||
| 928 | |||
| 929 | if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) | ||
| 930 | return; | ||
| 931 | |||
| 932 | in_dev = __in_dev_get_rcu(dev); | ||
| 933 | if (!in_dev) | ||
| 934 | return; | ||
| 935 | |||
| 936 | if (in_dev->ifa_list && | ||
| 937 | IN_DEV_LOG_MARTIANS(in_dev) && | ||
| 938 | IN_DEV_FORWARD(in_dev)) { | ||
| 939 | __be32 _mask, *mp; | ||
| 940 | |||
| 941 | mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); | ||
| 942 | BUG_ON(mp == NULL); | ||
| 943 | for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { | ||
| 944 | if (*mp == ifa->ifa_mask && | ||
| 945 | inet_ifa_match(ip_hdr(skb)->saddr, ifa)) | ||
| 946 | break; | ||
| 947 | } | ||
| 948 | if (!ifa && net_ratelimit()) { | ||
| 949 | printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", | ||
| 950 | mp, dev->name, &ip_hdr(skb)->saddr); | ||
| 951 | } | ||
| 952 | } | ||
| 953 | } | ||
| 954 | |||
| 841 | static void icmp_discard(struct sk_buff *skb) | 955 | static void icmp_discard(struct sk_buff *skb) |
| 842 | { | 956 | { |
| 843 | } | 957 | } |
| @@ -1001,10 +1115,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { | |||
| 1001 | .handler = icmp_discard, | 1115 | .handler = icmp_discard, |
| 1002 | }, | 1116 | }, |
| 1003 | [ICMP_ADDRESS] = { | 1117 | [ICMP_ADDRESS] = { |
| 1004 | .handler = icmp_discard, | 1118 | .handler = icmp_address, |
| 1005 | }, | 1119 | }, |
| 1006 | [ICMP_ADDRESSREPLY] = { | 1120 | [ICMP_ADDRESSREPLY] = { |
| 1007 | .handler = icmp_discard, | 1121 | .handler = icmp_address_reply, |
| 1008 | }, | 1122 | }, |
| 1009 | }; | 1123 | }; |
| 1010 | 1124 | ||
| @@ -1038,9 +1152,10 @@ static int __net_init icmp_sk_init(struct net *net) | |||
| 1038 | net->ipv4.icmp_sk[i] = sk; | 1152 | net->ipv4.icmp_sk[i] = sk; |
| 1039 | 1153 | ||
| 1040 | /* Enough space for 2 64K ICMP packets, including | 1154 | /* Enough space for 2 64K ICMP packets, including |
| 1041 | * sk_buff/skb_shared_info struct overhead. | 1155 | * sk_buff struct overhead. |
| 1042 | */ | 1156 | */ |
| 1043 | sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); | 1157 | sk->sk_sndbuf = |
| 1158 | (2 * ((64 * 1024) + sizeof(struct sk_buff))); | ||
| 1044 | 1159 | ||
| 1045 | /* | 1160 | /* |
| 1046 | * Speedup sock_wfree() | 1161 | * Speedup sock_wfree() |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 736ab70fd17..e0d42dbb33f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
| @@ -73,6 +73,7 @@ | |||
| 73 | #include <linux/module.h> | 73 | #include <linux/module.h> |
| 74 | #include <linux/slab.h> | 74 | #include <linux/slab.h> |
| 75 | #include <asm/uaccess.h> | 75 | #include <asm/uaccess.h> |
| 76 | #include <asm/system.h> | ||
| 76 | #include <linux/types.h> | 77 | #include <linux/types.h> |
| 77 | #include <linux/kernel.h> | 78 | #include <linux/kernel.h> |
| 78 | #include <linux/jiffies.h> | 79 | #include <linux/jiffies.h> |
| @@ -303,11 +304,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | |||
| 303 | struct igmpv3_report *pig; | 304 | struct igmpv3_report *pig; |
| 304 | struct net *net = dev_net(dev); | 305 | struct net *net = dev_net(dev); |
| 305 | struct flowi4 fl4; | 306 | struct flowi4 fl4; |
| 306 | int hlen = LL_RESERVED_SPACE(dev); | ||
| 307 | int tlen = dev->needed_tailroom; | ||
| 308 | 307 | ||
| 309 | while (1) { | 308 | while (1) { |
| 310 | skb = alloc_skb(size + hlen + tlen, | 309 | skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), |
| 311 | GFP_ATOMIC | __GFP_NOWARN); | 310 | GFP_ATOMIC | __GFP_NOWARN); |
| 312 | if (skb) | 311 | if (skb) |
| 313 | break; | 312 | break; |
| @@ -328,7 +327,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | |||
| 328 | skb_dst_set(skb, &rt->dst); | 327 | skb_dst_set(skb, &rt->dst); |
| 329 | skb->dev = dev; | 328 | skb->dev = dev; |
| 330 | 329 | ||
| 331 | skb_reserve(skb, hlen); | 330 | skb_reserve(skb, LL_RESERVED_SPACE(dev)); |
| 332 | 331 | ||
| 333 | skb_reset_network_header(skb); | 332 | skb_reset_network_header(skb); |
| 334 | pip = ip_hdr(skb); | 333 | pip = ip_hdr(skb); |
| @@ -344,10 +343,10 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | |||
| 344 | pip->protocol = IPPROTO_IGMP; | 343 | pip->protocol = IPPROTO_IGMP; |
| 345 | pip->tot_len = 0; /* filled in later */ | 344 | pip->tot_len = 0; /* filled in later */ |
| 346 | ip_select_ident(pip, &rt->dst, NULL); | 345 | ip_select_ident(pip, &rt->dst, NULL); |
| 347 | ((u8 *)&pip[1])[0] = IPOPT_RA; | 346 | ((u8*)&pip[1])[0] = IPOPT_RA; |
| 348 | ((u8 *)&pip[1])[1] = 4; | 347 | ((u8*)&pip[1])[1] = 4; |
| 349 | ((u8 *)&pip[1])[2] = 0; | 348 | ((u8*)&pip[1])[2] = 0; |
| 350 | ((u8 *)&pip[1])[3] = 0; | 349 | ((u8*)&pip[1])[3] = 0; |
| 351 | 350 | ||
| 352 | skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; | 351 | skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; |
| 353 | skb_put(skb, sizeof(*pig)); | 352 | skb_put(skb, sizeof(*pig)); |
| @@ -648,7 +647,6 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
| 648 | __be32 group = pmc ? pmc->multiaddr : 0; | 647 | __be32 group = pmc ? pmc->multiaddr : 0; |
| 649 | struct flowi4 fl4; | 648 | struct flowi4 fl4; |
| 650 | __be32 dst; | 649 | __be32 dst; |
| 651 | int hlen, tlen; | ||
| 652 | 650 | ||
| 653 | if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) | 651 | if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) |
| 654 | return igmpv3_send_report(in_dev, pmc); | 652 | return igmpv3_send_report(in_dev, pmc); |
| @@ -663,9 +661,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
| 663 | if (IS_ERR(rt)) | 661 | if (IS_ERR(rt)) |
| 664 | return -1; | 662 | return -1; |
| 665 | 663 | ||
| 666 | hlen = LL_RESERVED_SPACE(dev); | 664 | skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); |
| 667 | tlen = dev->needed_tailroom; | ||
| 668 | skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); | ||
| 669 | if (skb == NULL) { | 665 | if (skb == NULL) { |
| 670 | ip_rt_put(rt); | 666 | ip_rt_put(rt); |
| 671 | return -1; | 667 | return -1; |
| @@ -673,7 +669,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
| 673 | 669 | ||
| 674 | skb_dst_set(skb, &rt->dst); | 670 | skb_dst_set(skb, &rt->dst); |
| 675 | 671 | ||
| 676 | skb_reserve(skb, hlen); | 672 | skb_reserve(skb, LL_RESERVED_SPACE(dev)); |
| 677 | 673 | ||
| 678 | skb_reset_network_header(skb); | 674 | skb_reset_network_header(skb); |
| 679 | iph = ip_hdr(skb); | 675 | iph = ip_hdr(skb); |
| @@ -688,10 +684,10 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
| 688 | iph->saddr = fl4.saddr; | 684 | iph->saddr = fl4.saddr; |
| 689 | iph->protocol = IPPROTO_IGMP; | 685 | iph->protocol = IPPROTO_IGMP; |
| 690 | ip_select_ident(iph, &rt->dst, NULL); | 686 | ip_select_ident(iph, &rt->dst, NULL); |
| 691 | ((u8 *)&iph[1])[0] = IPOPT_RA; | 687 | ((u8*)&iph[1])[0] = IPOPT_RA; |
| 692 | ((u8 *)&iph[1])[1] = 4; | 688 | ((u8*)&iph[1])[1] = 4; |
| 693 | ((u8 *)&iph[1])[2] = 0; | 689 | ((u8*)&iph[1])[2] = 0; |
| 694 | ((u8 *)&iph[1])[3] = 0; | 690 | ((u8*)&iph[1])[3] = 0; |
| 695 | 691 | ||
| 696 | ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); | 692 | ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); |
| 697 | ih->type = type; | 693 | ih->type = type; |
| @@ -774,7 +770,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) | |||
| 774 | if (psf->sf_count[MCAST_INCLUDE] || | 770 | if (psf->sf_count[MCAST_INCLUDE] || |
| 775 | pmc->sfcount[MCAST_EXCLUDE] != | 771 | pmc->sfcount[MCAST_EXCLUDE] != |
| 776 | psf->sf_count[MCAST_EXCLUDE]) | 772 | psf->sf_count[MCAST_EXCLUDE]) |
| 777 | break; | 773 | continue; |
| 778 | if (srcs[i] == psf->sf_inaddr) { | 774 | if (srcs[i] == psf->sf_inaddr) { |
| 779 | scount++; | 775 | scount++; |
| 780 | break; | 776 | break; |
| @@ -815,15 +811,14 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) | |||
| 815 | return 1; | 811 | return 1; |
| 816 | } | 812 | } |
| 817 | 813 | ||
| 818 | /* return true if packet was dropped */ | 814 | static void igmp_heard_report(struct in_device *in_dev, __be32 group) |
| 819 | static bool igmp_heard_report(struct in_device *in_dev, __be32 group) | ||
| 820 | { | 815 | { |
| 821 | struct ip_mc_list *im; | 816 | struct ip_mc_list *im; |
| 822 | 817 | ||
| 823 | /* Timers are only set for non-local groups */ | 818 | /* Timers are only set for non-local groups */ |
| 824 | 819 | ||
| 825 | if (group == IGMP_ALL_HOSTS) | 820 | if (group == IGMP_ALL_HOSTS) |
| 826 | return false; | 821 | return; |
| 827 | 822 | ||
| 828 | rcu_read_lock(); | 823 | rcu_read_lock(); |
| 829 | for_each_pmc_rcu(in_dev, im) { | 824 | for_each_pmc_rcu(in_dev, im) { |
| @@ -833,11 +828,9 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) | |||
| 833 | } | 828 | } |
| 834 | } | 829 | } |
| 835 | rcu_read_unlock(); | 830 | rcu_read_unlock(); |
| 836 | return false; | ||
| 837 | } | 831 | } |
| 838 | 832 | ||
| 839 | /* return true if packet was dropped */ | 833 | static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, |
| 840 | static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | ||
| 841 | int len) | 834 | int len) |
| 842 | { | 835 | { |
| 843 | struct igmphdr *ih = igmp_hdr(skb); | 836 | struct igmphdr *ih = igmp_hdr(skb); |
| @@ -869,7 +862,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | |||
| 869 | /* clear deleted report items */ | 862 | /* clear deleted report items */ |
| 870 | igmpv3_clear_delrec(in_dev); | 863 | igmpv3_clear_delrec(in_dev); |
| 871 | } else if (len < 12) { | 864 | } else if (len < 12) { |
| 872 | return true; /* ignore bogus packet; freed by caller */ | 865 | return; /* ignore bogus packet; freed by caller */ |
| 873 | } else if (IGMP_V1_SEEN(in_dev)) { | 866 | } else if (IGMP_V1_SEEN(in_dev)) { |
| 874 | /* This is a v3 query with v1 queriers present */ | 867 | /* This is a v3 query with v1 queriers present */ |
| 875 | max_delay = IGMP_Query_Response_Interval; | 868 | max_delay = IGMP_Query_Response_Interval; |
| @@ -886,13 +879,13 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | |||
| 886 | max_delay = 1; /* can't mod w/ 0 */ | 879 | max_delay = 1; /* can't mod w/ 0 */ |
| 887 | } else { /* v3 */ | 880 | } else { /* v3 */ |
| 888 | if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) | 881 | if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) |
| 889 | return true; | 882 | return; |
| 890 | 883 | ||
| 891 | ih3 = igmpv3_query_hdr(skb); | 884 | ih3 = igmpv3_query_hdr(skb); |
| 892 | if (ih3->nsrcs) { | 885 | if (ih3->nsrcs) { |
| 893 | if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) | 886 | if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) |
| 894 | + ntohs(ih3->nsrcs)*sizeof(__be32))) | 887 | + ntohs(ih3->nsrcs)*sizeof(__be32))) |
| 895 | return true; | 888 | return; |
| 896 | ih3 = igmpv3_query_hdr(skb); | 889 | ih3 = igmpv3_query_hdr(skb); |
| 897 | } | 890 | } |
| 898 | 891 | ||
| @@ -904,9 +897,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | |||
| 904 | in_dev->mr_qrv = ih3->qrv; | 897 | in_dev->mr_qrv = ih3->qrv; |
| 905 | if (!group) { /* general query */ | 898 | if (!group) { /* general query */ |
| 906 | if (ih3->nsrcs) | 899 | if (ih3->nsrcs) |
| 907 | return false; /* no sources allowed */ | 900 | return; /* no sources allowed */ |
| 908 | igmp_gq_start_timer(in_dev); | 901 | igmp_gq_start_timer(in_dev); |
| 909 | return false; | 902 | return; |
| 910 | } | 903 | } |
| 911 | /* mark sources to include, if group & source-specific */ | 904 | /* mark sources to include, if group & source-specific */ |
| 912 | mark = ih3->nsrcs != 0; | 905 | mark = ih3->nsrcs != 0; |
| @@ -942,7 +935,6 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, | |||
| 942 | igmp_mod_timer(im, max_delay); | 935 | igmp_mod_timer(im, max_delay); |
| 943 | } | 936 | } |
| 944 | rcu_read_unlock(); | 937 | rcu_read_unlock(); |
| 945 | return false; | ||
| 946 | } | 938 | } |
| 947 | 939 | ||
| 948 | /* called in rcu_read_lock() section */ | 940 | /* called in rcu_read_lock() section */ |
| @@ -952,7 +944,6 @@ int igmp_rcv(struct sk_buff *skb) | |||
| 952 | struct igmphdr *ih; | 944 | struct igmphdr *ih; |
| 953 | struct in_device *in_dev = __in_dev_get_rcu(skb->dev); | 945 | struct in_device *in_dev = __in_dev_get_rcu(skb->dev); |
| 954 | int len = skb->len; | 946 | int len = skb->len; |
| 955 | bool dropped = true; | ||
| 956 | 947 | ||
| 957 | if (in_dev == NULL) | 948 | if (in_dev == NULL) |
| 958 | goto drop; | 949 | goto drop; |
| @@ -974,7 +965,7 @@ int igmp_rcv(struct sk_buff *skb) | |||
| 974 | ih = igmp_hdr(skb); | 965 | ih = igmp_hdr(skb); |
| 975 | switch (ih->type) { | 966 | switch (ih->type) { |
| 976 | case IGMP_HOST_MEMBERSHIP_QUERY: | 967 | case IGMP_HOST_MEMBERSHIP_QUERY: |
| 977 | dropped = igmp_heard_query(in_dev, skb, len); | 968 | igmp_heard_query(in_dev, skb, len); |
| 978 | break; | 969 | break; |
| 979 | case IGMP_HOST_MEMBERSHIP_REPORT: | 970 | case IGMP_HOST_MEMBERSHIP_REPORT: |
| 980 | case IGMPV2_HOST_MEMBERSHIP_REPORT: | 971 | case IGMPV2_HOST_MEMBERSHIP_REPORT: |
| @@ -984,7 +975,7 @@ int igmp_rcv(struct sk_buff *skb) | |||
| 984 | /* don't rely on MC router hearing unicast reports */ | 975 | /* don't rely on MC router hearing unicast reports */ |
| 985 | if (skb->pkt_type == PACKET_MULTICAST || | 976 | if (skb->pkt_type == PACKET_MULTICAST || |
| 986 | skb->pkt_type == PACKET_BROADCAST) | 977 | skb->pkt_type == PACKET_BROADCAST) |
| 987 | dropped = igmp_heard_report(in_dev, ih->group); | 978 | igmp_heard_report(in_dev, ih->group); |
| 988 | break; | 979 | break; |
| 989 | case IGMP_PIM: | 980 | case IGMP_PIM: |
| 990 | #ifdef CONFIG_IP_PIMSM_V1 | 981 | #ifdef CONFIG_IP_PIMSM_V1 |
| @@ -1002,10 +993,7 @@ int igmp_rcv(struct sk_buff *skb) | |||
| 1002 | } | 993 | } |
| 1003 | 994 | ||
| 1004 | drop: | 995 | drop: |
| 1005 | if (dropped) | 996 | kfree_skb(skb); |
| 1006 | kfree_skb(skb); | ||
| 1007 | else | ||
| 1008 | consume_skb(skb); | ||
| 1009 | return 0; | 997 | return 0; |
| 1010 | } | 998 | } |
| 1011 | 999 | ||
| @@ -1023,7 +1011,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) | |||
| 1023 | 1011 | ||
| 1024 | /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. | 1012 | /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. |
| 1025 | We will get multicast token leakage, when IFF_MULTICAST | 1013 | We will get multicast token leakage, when IFF_MULTICAST |
| 1026 | is changed. This check should be done in ndo_set_rx_mode | 1014 | is changed. This check should be done in dev->set_multicast_list |
| 1027 | routine. Something sort of: | 1015 | routine. Something sort of: |
| 1028 | if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } | 1016 | if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } |
| 1029 | --ANK | 1017 | --ANK |
| @@ -1588,7 +1576,7 @@ out_unlock: | |||
| 1588 | * Add multicast single-source filter to the interface list | 1576 | * Add multicast single-source filter to the interface list |
| 1589 | */ | 1577 | */ |
| 1590 | static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, | 1578 | static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, |
| 1591 | __be32 *psfsrc) | 1579 | __be32 *psfsrc, int delta) |
| 1592 | { | 1580 | { |
| 1593 | struct ip_sf_list *psf, *psf_prev; | 1581 | struct ip_sf_list *psf, *psf_prev; |
| 1594 | 1582 | ||
| @@ -1723,15 +1711,14 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, | |||
| 1723 | pmc->sfcount[sfmode]++; | 1711 | pmc->sfcount[sfmode]++; |
| 1724 | err = 0; | 1712 | err = 0; |
| 1725 | for (i=0; i<sfcount; i++) { | 1713 | for (i=0; i<sfcount; i++) { |
| 1726 | err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); | 1714 | err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); |
| 1727 | if (err) | 1715 | if (err) |
| 1728 | break; | 1716 | break; |
| 1729 | } | 1717 | } |
| 1730 | if (err) { | 1718 | if (err) { |
| 1731 | int j; | 1719 | int j; |
| 1732 | 1720 | ||
| 1733 | if (!delta) | 1721 | pmc->sfcount[sfmode]--; |
| 1734 | pmc->sfcount[sfmode]--; | ||
| 1735 | for (j=0; j<i; j++) | 1722 | for (j=0; j<i; j++) |
| 1736 | (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); | 1723 | (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); |
| 1737 | } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { | 1724 | } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { |
| @@ -1850,7 +1837,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, | |||
| 1850 | } | 1837 | } |
| 1851 | err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, | 1838 | err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, |
| 1852 | iml->sfmode, psf->sl_count, psf->sl_addr, 0); | 1839 | iml->sfmode, psf->sl_count, psf->sl_addr, 0); |
| 1853 | RCU_INIT_POINTER(iml->sflist, NULL); | 1840 | rcu_assign_pointer(iml->sflist, NULL); |
| 1854 | /* decrease mem now to avoid the memleak warning */ | 1841 | /* decrease mem now to avoid the memleak warning */ |
| 1855 | atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); | 1842 | atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); |
| 1856 | kfree_rcu(psf, rcu); | 1843 | kfree_rcu(psf, rcu); |
| @@ -1904,7 +1891,6 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) | |||
| 1904 | rtnl_unlock(); | 1891 | rtnl_unlock(); |
| 1905 | return ret; | 1892 | return ret; |
| 1906 | } | 1893 | } |
| 1907 | EXPORT_SYMBOL(ip_mc_leave_group); | ||
| 1908 | 1894 | ||
| 1909 | int ip_mc_source(int add, int omode, struct sock *sk, struct | 1895 | int ip_mc_source(int add, int omode, struct sock *sk, struct |
| 1910 | ip_mreq_source *mreqs, int ifindex) | 1896 | ip_mreq_source *mreqs, int ifindex) |
| @@ -2444,8 +2430,6 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) | |||
| 2444 | struct ip_mc_list *im = (struct ip_mc_list *)v; | 2430 | struct ip_mc_list *im = (struct ip_mc_list *)v; |
| 2445 | struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); | 2431 | struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); |
| 2446 | char *querier; | 2432 | char *querier; |
| 2447 | long delta; | ||
| 2448 | |||
| 2449 | #ifdef CONFIG_IP_MULTICAST | 2433 | #ifdef CONFIG_IP_MULTICAST |
| 2450 | querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : | 2434 | querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : |
| 2451 | IGMP_V2_SEEN(state->in_dev) ? "V2" : | 2435 | IGMP_V2_SEEN(state->in_dev) ? "V2" : |
| @@ -2459,12 +2443,11 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) | |||
| 2459 | state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); | 2443 | state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); |
| 2460 | } | 2444 | } |
| 2461 | 2445 | ||
| 2462 | delta = im->timer.expires - jiffies; | ||
| 2463 | seq_printf(seq, | 2446 | seq_printf(seq, |
| 2464 | "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", | 2447 | "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", |
| 2465 | im->multiaddr, im->users, | 2448 | im->multiaddr, im->users, |
| 2466 | im->tm_running, | 2449 | im->tm_running, im->tm_running ? |
| 2467 | im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, | 2450 | jiffies_to_clock_t(im->timer.expires-jiffies) : 0, |
| 2468 | im->reporter); | 2451 | im->reporter); |
| 2469 | } | 2452 | } |
| 2470 | return 0; | 2453 | return 0; |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d0670f00d52..c14d88ad348 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
| @@ -42,8 +42,7 @@ EXPORT_SYMBOL(sysctl_local_reserved_ports); | |||
| 42 | 42 | ||
| 43 | void inet_get_local_port_range(int *low, int *high) | 43 | void inet_get_local_port_range(int *low, int *high) |
| 44 | { | 44 | { |
| 45 | unsigned int seq; | 45 | unsigned seq; |
| 46 | |||
| 47 | do { | 46 | do { |
| 48 | seq = read_seqbegin(&sysctl_local_ports.lock); | 47 | seq = read_seqbegin(&sysctl_local_ports.lock); |
| 49 | 48 | ||
| @@ -54,7 +53,7 @@ void inet_get_local_port_range(int *low, int *high) | |||
| 54 | EXPORT_SYMBOL(inet_get_local_port_range); | 53 | EXPORT_SYMBOL(inet_get_local_port_range); |
| 55 | 54 | ||
| 56 | int inet_csk_bind_conflict(const struct sock *sk, | 55 | int inet_csk_bind_conflict(const struct sock *sk, |
| 57 | const struct inet_bind_bucket *tb, bool relax) | 56 | const struct inet_bind_bucket *tb) |
| 58 | { | 57 | { |
| 59 | struct sock *sk2; | 58 | struct sock *sk2; |
| 60 | struct hlist_node *node; | 59 | struct hlist_node *node; |
| @@ -80,14 +79,6 @@ int inet_csk_bind_conflict(const struct sock *sk, | |||
| 80 | sk2_rcv_saddr == sk_rcv_saddr(sk)) | 79 | sk2_rcv_saddr == sk_rcv_saddr(sk)) |
| 81 | break; | 80 | break; |
| 82 | } | 81 | } |
| 83 | if (!relax && reuse && sk2->sk_reuse && | ||
| 84 | sk2->sk_state != TCP_LISTEN) { | ||
| 85 | const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); | ||
| 86 | |||
| 87 | if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || | ||
| 88 | sk2_rcv_saddr == sk_rcv_saddr(sk)) | ||
| 89 | break; | ||
| 90 | } | ||
| 91 | } | 82 | } |
| 92 | } | 83 | } |
| 93 | return node != NULL; | 84 | return node != NULL; |
| @@ -131,16 +122,12 @@ again: | |||
| 131 | (tb->num_owners < smallest_size || smallest_size == -1)) { | 122 | (tb->num_owners < smallest_size || smallest_size == -1)) { |
| 132 | smallest_size = tb->num_owners; | 123 | smallest_size = tb->num_owners; |
| 133 | smallest_rover = rover; | 124 | smallest_rover = rover; |
| 134 | if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && | 125 | if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { |
| 135 | !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { | 126 | spin_unlock(&head->lock); |
| 136 | snum = smallest_rover; | 127 | snum = smallest_rover; |
| 137 | goto tb_found; | 128 | goto have_snum; |
| 138 | } | 129 | } |
| 139 | } | 130 | } |
| 140 | if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { | ||
| 141 | snum = rover; | ||
| 142 | goto tb_found; | ||
| 143 | } | ||
| 144 | goto next; | 131 | goto next; |
| 145 | } | 132 | } |
| 146 | break; | 133 | break; |
| @@ -182,22 +169,18 @@ have_snum: | |||
| 182 | goto tb_not_found; | 169 | goto tb_not_found; |
| 183 | tb_found: | 170 | tb_found: |
| 184 | if (!hlist_empty(&tb->owners)) { | 171 | if (!hlist_empty(&tb->owners)) { |
| 185 | if (sk->sk_reuse == SK_FORCE_REUSE) | ||
| 186 | goto success; | ||
| 187 | |||
| 188 | if (tb->fastreuse > 0 && | 172 | if (tb->fastreuse > 0 && |
| 189 | sk->sk_reuse && sk->sk_state != TCP_LISTEN && | 173 | sk->sk_reuse && sk->sk_state != TCP_LISTEN && |
| 190 | smallest_size == -1) { | 174 | smallest_size == -1) { |
| 191 | goto success; | 175 | goto success; |
| 192 | } else { | 176 | } else { |
| 193 | ret = 1; | 177 | ret = 1; |
| 194 | if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { | 178 | if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { |
| 195 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && | 179 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && |
| 196 | smallest_size != -1 && --attempts >= 0) { | 180 | smallest_size != -1 && --attempts >= 0) { |
| 197 | spin_unlock(&head->lock); | 181 | spin_unlock(&head->lock); |
| 198 | goto again; | 182 | goto again; |
| 199 | } | 183 | } |
| 200 | |||
| 201 | goto fail_unlock; | 184 | goto fail_unlock; |
| 202 | } | 185 | } |
| 203 | } | 186 | } |
| @@ -283,9 +266,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) | |||
| 283 | struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | 266 | struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) |
| 284 | { | 267 | { |
| 285 | struct inet_connection_sock *icsk = inet_csk(sk); | 268 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 286 | struct request_sock_queue *queue = &icsk->icsk_accept_queue; | ||
| 287 | struct sock *newsk; | 269 | struct sock *newsk; |
| 288 | struct request_sock *req; | ||
| 289 | int error; | 270 | int error; |
| 290 | 271 | ||
| 291 | lock_sock(sk); | 272 | lock_sock(sk); |
| @@ -298,7 +279,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | |||
| 298 | goto out_err; | 279 | goto out_err; |
| 299 | 280 | ||
| 300 | /* Find already established connection */ | 281 | /* Find already established connection */ |
| 301 | if (reqsk_queue_empty(queue)) { | 282 | if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { |
| 302 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); | 283 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); |
| 303 | 284 | ||
| 304 | /* If this is a non blocking socket don't sleep */ | 285 | /* If this is a non blocking socket don't sleep */ |
| @@ -310,32 +291,14 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | |||
| 310 | if (error) | 291 | if (error) |
| 311 | goto out_err; | 292 | goto out_err; |
| 312 | } | 293 | } |
| 313 | req = reqsk_queue_remove(queue); | 294 | |
| 314 | newsk = req->sk; | 295 | newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); |
| 315 | 296 | WARN_ON(newsk->sk_state == TCP_SYN_RECV); | |
| 316 | sk_acceptq_removed(sk); | ||
| 317 | if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { | ||
| 318 | spin_lock_bh(&queue->fastopenq->lock); | ||
| 319 | if (tcp_rsk(req)->listener) { | ||
| 320 | /* We are still waiting for the final ACK from 3WHS | ||
| 321 | * so can't free req now. Instead, we set req->sk to | ||
| 322 | * NULL to signify that the child socket is taken | ||
| 323 | * so reqsk_fastopen_remove() will free the req | ||
| 324 | * when 3WHS finishes (or is aborted). | ||
| 325 | */ | ||
| 326 | req->sk = NULL; | ||
| 327 | req = NULL; | ||
| 328 | } | ||
| 329 | spin_unlock_bh(&queue->fastopenq->lock); | ||
| 330 | } | ||
| 331 | out: | 297 | out: |
| 332 | release_sock(sk); | 298 | release_sock(sk); |
| 333 | if (req) | ||
| 334 | __reqsk_free(req); | ||
| 335 | return newsk; | 299 | return newsk; |
| 336 | out_err: | 300 | out_err: |
| 337 | newsk = NULL; | 301 | newsk = NULL; |
| 338 | req = NULL; | ||
| 339 | *err = error; | 302 | *err = error; |
| 340 | goto out; | 303 | goto out; |
| 341 | } | 304 | } |
| @@ -394,19 +357,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, | |||
| 394 | const struct inet_request_sock *ireq = inet_rsk(req); | 357 | const struct inet_request_sock *ireq = inet_rsk(req); |
| 395 | struct ip_options_rcu *opt = inet_rsk(req)->opt; | 358 | struct ip_options_rcu *opt = inet_rsk(req)->opt; |
| 396 | struct net *net = sock_net(sk); | 359 | struct net *net = sock_net(sk); |
| 397 | int flags = inet_sk_flowi_flags(sk); | ||
| 398 | 360 | ||
| 399 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, | 361 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, |
| 400 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, | 362 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, |
| 401 | sk->sk_protocol, | 363 | sk->sk_protocol, inet_sk_flowi_flags(sk), |
| 402 | flags, | ||
| 403 | (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, | 364 | (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, |
| 404 | ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); | 365 | ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); |
| 405 | security_req_classify_flow(req, flowi4_to_flowi(fl4)); | 366 | security_req_classify_flow(req, flowi4_to_flowi(fl4)); |
| 406 | rt = ip_route_output_flow(net, fl4, sk); | 367 | rt = ip_route_output_flow(net, fl4, sk); |
| 407 | if (IS_ERR(rt)) | 368 | if (IS_ERR(rt)) |
| 408 | goto no_route; | 369 | goto no_route; |
| 409 | if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) | 370 | if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) |
| 410 | goto route_err; | 371 | goto route_err; |
| 411 | return &rt->dst; | 372 | return &rt->dst; |
| 412 | 373 | ||
| @@ -424,15 +385,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, | |||
| 424 | { | 385 | { |
| 425 | const struct inet_request_sock *ireq = inet_rsk(req); | 386 | const struct inet_request_sock *ireq = inet_rsk(req); |
| 426 | struct inet_sock *newinet = inet_sk(newsk); | 387 | struct inet_sock *newinet = inet_sk(newsk); |
| 427 | struct ip_options_rcu *opt; | 388 | struct ip_options_rcu *opt = ireq->opt; |
| 428 | struct net *net = sock_net(sk); | 389 | struct net *net = sock_net(sk); |
| 429 | struct flowi4 *fl4; | 390 | struct flowi4 *fl4; |
| 430 | struct rtable *rt; | 391 | struct rtable *rt; |
| 431 | 392 | ||
| 432 | fl4 = &newinet->cork.fl.u.ip4; | 393 | fl4 = &newinet->cork.fl.u.ip4; |
| 433 | |||
| 434 | rcu_read_lock(); | ||
| 435 | opt = rcu_dereference(newinet->inet_opt); | ||
| 436 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, | 394 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, |
| 437 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, | 395 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, |
| 438 | sk->sk_protocol, inet_sk_flowi_flags(sk), | 396 | sk->sk_protocol, inet_sk_flowi_flags(sk), |
| @@ -442,15 +400,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, | |||
| 442 | rt = ip_route_output_flow(net, fl4, sk); | 400 | rt = ip_route_output_flow(net, fl4, sk); |
| 443 | if (IS_ERR(rt)) | 401 | if (IS_ERR(rt)) |
| 444 | goto no_route; | 402 | goto no_route; |
| 445 | if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) | 403 | if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) |
| 446 | goto route_err; | 404 | goto route_err; |
| 447 | rcu_read_unlock(); | ||
| 448 | return &rt->dst; | 405 | return &rt->dst; |
| 449 | 406 | ||
| 450 | route_err: | 407 | route_err: |
| 451 | ip_rt_put(rt); | 408 | ip_rt_put(rt); |
| 452 | no_route: | 409 | no_route: |
| 453 | rcu_read_unlock(); | ||
| 454 | IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); | 410 | IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); |
| 455 | return NULL; | 411 | return NULL; |
| 456 | } | 412 | } |
| @@ -462,7 +418,7 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, | |||
| 462 | return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); | 418 | return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); |
| 463 | } | 419 | } |
| 464 | 420 | ||
| 465 | #if IS_ENABLED(CONFIG_IPV6) | 421 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 466 | #define AF_INET_FAMILY(fam) ((fam) == AF_INET) | 422 | #define AF_INET_FAMILY(fam) ((fam) == AF_INET) |
| 467 | #else | 423 | #else |
| 468 | #define AF_INET_FAMILY(fam) 1 | 424 | #define AF_INET_FAMILY(fam) 1 |
| @@ -521,31 +477,21 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, | |||
| 521 | int *expire, int *resend) | 477 | int *expire, int *resend) |
| 522 | { | 478 | { |
| 523 | if (!rskq_defer_accept) { | 479 | if (!rskq_defer_accept) { |
| 524 | *expire = req->num_timeout >= thresh; | 480 | *expire = req->retrans >= thresh; |
| 525 | *resend = 1; | 481 | *resend = 1; |
| 526 | return; | 482 | return; |
| 527 | } | 483 | } |
| 528 | *expire = req->num_timeout >= thresh && | 484 | *expire = req->retrans >= thresh && |
| 529 | (!inet_rsk(req)->acked || req->num_timeout >= max_retries); | 485 | (!inet_rsk(req)->acked || req->retrans >= max_retries); |
| 530 | /* | 486 | /* |
| 531 | * Do not resend while waiting for data after ACK, | 487 | * Do not resend while waiting for data after ACK, |
| 532 | * start to resend on end of deferring period to give | 488 | * start to resend on end of deferring period to give |
| 533 | * last chance for data or ACK to create established socket. | 489 | * last chance for data or ACK to create established socket. |
| 534 | */ | 490 | */ |
| 535 | *resend = !inet_rsk(req)->acked || | 491 | *resend = !inet_rsk(req)->acked || |
| 536 | req->num_timeout >= rskq_defer_accept - 1; | 492 | req->retrans >= rskq_defer_accept - 1; |
| 537 | } | 493 | } |
| 538 | 494 | ||
| 539 | int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) | ||
| 540 | { | ||
| 541 | int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL); | ||
| 542 | |||
| 543 | if (!err) | ||
| 544 | req->num_retrans++; | ||
| 545 | return err; | ||
| 546 | } | ||
| 547 | EXPORT_SYMBOL(inet_rtx_syn_ack); | ||
| 548 | |||
| 549 | void inet_csk_reqsk_queue_prune(struct sock *parent, | 495 | void inet_csk_reqsk_queue_prune(struct sock *parent, |
| 550 | const unsigned long interval, | 496 | const unsigned long interval, |
| 551 | const unsigned long timeout, | 497 | const unsigned long timeout, |
| @@ -565,7 +511,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, | |||
| 565 | 511 | ||
| 566 | /* Normally all the openreqs are young and become mature | 512 | /* Normally all the openreqs are young and become mature |
| 567 | * (i.e. converted to established socket) for first timeout. | 513 | * (i.e. converted to established socket) for first timeout. |
| 568 | * If synack was not acknowledged for 1 second, it means | 514 | * If synack was not acknowledged for 3 seconds, it means |
| 569 | * one of the following things: synack was lost, ack was lost, | 515 | * one of the following things: synack was lost, ack was lost, |
| 570 | * rtt is high or nobody planned to ack (i.e. synflood). | 516 | * rtt is high or nobody planned to ack (i.e. synflood). |
| 571 | * When server is a bit loaded, queue is populated with old | 517 | * When server is a bit loaded, queue is populated with old |
| @@ -606,17 +552,17 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, | |||
| 606 | syn_ack_recalc(req, thresh, max_retries, | 552 | syn_ack_recalc(req, thresh, max_retries, |
| 607 | queue->rskq_defer_accept, | 553 | queue->rskq_defer_accept, |
| 608 | &expire, &resend); | 554 | &expire, &resend); |
| 609 | req->rsk_ops->syn_ack_timeout(parent, req); | 555 | if (req->rsk_ops->syn_ack_timeout) |
| 556 | req->rsk_ops->syn_ack_timeout(parent, req); | ||
| 610 | if (!expire && | 557 | if (!expire && |
| 611 | (!resend || | 558 | (!resend || |
| 612 | !inet_rtx_syn_ack(parent, req) || | 559 | !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || |
| 613 | inet_rsk(req)->acked)) { | 560 | inet_rsk(req)->acked)) { |
| 614 | unsigned long timeo; | 561 | unsigned long timeo; |
| 615 | 562 | ||
| 616 | if (req->num_timeout++ == 0) | 563 | if (req->retrans++ == 0) |
| 617 | lopt->qlen_young--; | 564 | lopt->qlen_young--; |
| 618 | timeo = min(timeout << req->num_timeout, | 565 | timeo = min((timeout << req->retrans), max_rto); |
| 619 | max_rto); | ||
| 620 | req->expires = now + timeo; | 566 | req->expires = now + timeo; |
| 621 | reqp = &req->dl_next; | 567 | reqp = &req->dl_next; |
| 622 | continue; | 568 | continue; |
| @@ -642,19 +588,10 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, | |||
| 642 | } | 588 | } |
| 643 | EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); | 589 | EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); |
| 644 | 590 | ||
| 645 | /** | 591 | struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, |
| 646 | * inet_csk_clone_lock - clone an inet socket, and lock its clone | 592 | const gfp_t priority) |
| 647 | * @sk: the socket to clone | ||
| 648 | * @req: request_sock | ||
| 649 | * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) | ||
| 650 | * | ||
| 651 | * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) | ||
| 652 | */ | ||
| 653 | struct sock *inet_csk_clone_lock(const struct sock *sk, | ||
| 654 | const struct request_sock *req, | ||
| 655 | const gfp_t priority) | ||
| 656 | { | 593 | { |
| 657 | struct sock *newsk = sk_clone_lock(sk, priority); | 594 | struct sock *newsk = sk_clone(sk, priority); |
| 658 | 595 | ||
| 659 | if (newsk != NULL) { | 596 | if (newsk != NULL) { |
| 660 | struct inet_connection_sock *newicsk = inet_csk(newsk); | 597 | struct inet_connection_sock *newicsk = inet_csk(newsk); |
| @@ -678,7 +615,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, | |||
| 678 | } | 615 | } |
| 679 | return newsk; | 616 | return newsk; |
| 680 | } | 617 | } |
| 681 | EXPORT_SYMBOL_GPL(inet_csk_clone_lock); | 618 | EXPORT_SYMBOL_GPL(inet_csk_clone); |
| 682 | 619 | ||
| 683 | /* | 620 | /* |
| 684 | * At this point, there should be no process reference to this | 621 | * At this point, there should be no process reference to this |
| @@ -710,22 +647,6 @@ void inet_csk_destroy_sock(struct sock *sk) | |||
| 710 | } | 647 | } |
| 711 | EXPORT_SYMBOL(inet_csk_destroy_sock); | 648 | EXPORT_SYMBOL(inet_csk_destroy_sock); |
| 712 | 649 | ||
| 713 | /* This function allows to force a closure of a socket after the call to | ||
| 714 | * tcp/dccp_create_openreq_child(). | ||
| 715 | */ | ||
| 716 | void inet_csk_prepare_forced_close(struct sock *sk) | ||
| 717 | { | ||
| 718 | /* sk_clone_lock locked the socket and set refcnt to 2 */ | ||
| 719 | bh_unlock_sock(sk); | ||
| 720 | sock_put(sk); | ||
| 721 | |||
| 722 | /* The below has to be done to allow calling inet_csk_destroy_sock */ | ||
| 723 | sock_set_flag(sk, SOCK_DEAD); | ||
| 724 | percpu_counter_inc(sk->sk_prot->orphan_count); | ||
| 725 | inet_sk(sk)->inet_num = 0; | ||
| 726 | } | ||
| 727 | EXPORT_SYMBOL(inet_csk_prepare_forced_close); | ||
| 728 | |||
| 729 | int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) | 650 | int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) |
| 730 | { | 651 | { |
| 731 | struct inet_sock *inet = inet_sk(sk); | 652 | struct inet_sock *inet = inet_sk(sk); |
| @@ -767,14 +688,13 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start); | |||
| 767 | void inet_csk_listen_stop(struct sock *sk) | 688 | void inet_csk_listen_stop(struct sock *sk) |
| 768 | { | 689 | { |
| 769 | struct inet_connection_sock *icsk = inet_csk(sk); | 690 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 770 | struct request_sock_queue *queue = &icsk->icsk_accept_queue; | ||
| 771 | struct request_sock *acc_req; | 691 | struct request_sock *acc_req; |
| 772 | struct request_sock *req; | 692 | struct request_sock *req; |
| 773 | 693 | ||
| 774 | inet_csk_delete_keepalive_timer(sk); | 694 | inet_csk_delete_keepalive_timer(sk); |
| 775 | 695 | ||
| 776 | /* make all the listen_opt local to us */ | 696 | /* make all the listen_opt local to us */ |
| 777 | acc_req = reqsk_queue_yank_acceptq(queue); | 697 | acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); |
| 778 | 698 | ||
| 779 | /* Following specs, it would be better either to send FIN | 699 | /* Following specs, it would be better either to send FIN |
| 780 | * (and enter FIN-WAIT-1, it is normal close) | 700 | * (and enter FIN-WAIT-1, it is normal close) |
| @@ -784,7 +704,7 @@ void inet_csk_listen_stop(struct sock *sk) | |||
| 784 | * To be honest, we are not able to make either | 704 | * To be honest, we are not able to make either |
| 785 | * of the variants now. --ANK | 705 | * of the variants now. --ANK |
| 786 | */ | 706 | */ |
| 787 | reqsk_queue_destroy(queue); | 707 | reqsk_queue_destroy(&icsk->icsk_accept_queue); |
| 788 | 708 | ||
| 789 | while ((req = acc_req) != NULL) { | 709 | while ((req = acc_req) != NULL) { |
| 790 | struct sock *child = req->sk; | 710 | struct sock *child = req->sk; |
| @@ -802,19 +722,6 @@ void inet_csk_listen_stop(struct sock *sk) | |||
| 802 | 722 | ||
| 803 | percpu_counter_inc(sk->sk_prot->orphan_count); | 723 | percpu_counter_inc(sk->sk_prot->orphan_count); |
| 804 | 724 | ||
| 805 | if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { | ||
| 806 | BUG_ON(tcp_sk(child)->fastopen_rsk != req); | ||
| 807 | BUG_ON(sk != tcp_rsk(req)->listener); | ||
| 808 | |||
| 809 | /* Paranoid, to prevent race condition if | ||
| 810 | * an inbound pkt destined for child is | ||
| 811 | * blocked by sock lock in tcp_v4_rcv(). | ||
| 812 | * Also to satisfy an assertion in | ||
| 813 | * tcp_v4_destroy_sock(). | ||
| 814 | */ | ||
| 815 | tcp_sk(child)->fastopen_rsk = NULL; | ||
| 816 | sock_put(sk); | ||
| 817 | } | ||
| 818 | inet_csk_destroy_sock(child); | 725 | inet_csk_destroy_sock(child); |
| 819 | 726 | ||
| 820 | bh_unlock_sock(child); | 727 | bh_unlock_sock(child); |
| @@ -824,17 +731,6 @@ void inet_csk_listen_stop(struct sock *sk) | |||
| 824 | sk_acceptq_removed(sk); | 731 | sk_acceptq_removed(sk); |
| 825 | __reqsk_free(req); | 732 | __reqsk_free(req); |
| 826 | } | 733 | } |
| 827 | if (queue->fastopenq != NULL) { | ||
| 828 | /* Free all the reqs queued in rskq_rst_head. */ | ||
| 829 | spin_lock_bh(&queue->fastopenq->lock); | ||
| 830 | acc_req = queue->fastopenq->rskq_rst_head; | ||
| 831 | queue->fastopenq->rskq_rst_head = NULL; | ||
| 832 | spin_unlock_bh(&queue->fastopenq->lock); | ||
| 833 | while ((req = acc_req) != NULL) { | ||
| 834 | acc_req = req->dl_next; | ||
| 835 | __reqsk_free(req); | ||
| 836 | } | ||
| 837 | } | ||
| 838 | WARN_ON(sk->sk_ack_backlog); | 734 | WARN_ON(sk->sk_ack_backlog); |
| 839 | } | 735 | } |
| 840 | EXPORT_SYMBOL_GPL(inet_csk_listen_stop); | 736 | EXPORT_SYMBOL_GPL(inet_csk_listen_stop); |
| @@ -877,49 +773,3 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, | |||
| 877 | } | 773 | } |
| 878 | EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); | 774 | EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); |
| 879 | #endif | 775 | #endif |
| 880 | |||
| 881 | static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) | ||
| 882 | { | ||
| 883 | const struct inet_sock *inet = inet_sk(sk); | ||
| 884 | const struct ip_options_rcu *inet_opt; | ||
| 885 | __be32 daddr = inet->inet_daddr; | ||
| 886 | struct flowi4 *fl4; | ||
| 887 | struct rtable *rt; | ||
| 888 | |||
| 889 | rcu_read_lock(); | ||
| 890 | inet_opt = rcu_dereference(inet->inet_opt); | ||
| 891 | if (inet_opt && inet_opt->opt.srr) | ||
| 892 | daddr = inet_opt->opt.faddr; | ||
| 893 | fl4 = &fl->u.ip4; | ||
| 894 | rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, | ||
| 895 | inet->inet_saddr, inet->inet_dport, | ||
| 896 | inet->inet_sport, sk->sk_protocol, | ||
| 897 | RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); | ||
| 898 | if (IS_ERR(rt)) | ||
| 899 | rt = NULL; | ||
| 900 | if (rt) | ||
| 901 | sk_setup_caps(sk, &rt->dst); | ||
| 902 | rcu_read_unlock(); | ||
| 903 | |||
| 904 | return &rt->dst; | ||
| 905 | } | ||
| 906 | |||
| 907 | struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) | ||
| 908 | { | ||
| 909 | struct dst_entry *dst = __sk_dst_check(sk, 0); | ||
| 910 | struct inet_sock *inet = inet_sk(sk); | ||
| 911 | |||
| 912 | if (!dst) { | ||
| 913 | dst = inet_csk_rebuild_route(sk, &inet->cork.fl); | ||
| 914 | if (!dst) | ||
| 915 | goto out; | ||
| 916 | } | ||
| 917 | dst->ops->update_pmtu(dst, sk, NULL, mtu); | ||
| 918 | |||
| 919 | dst = __sk_dst_check(sk, 0); | ||
| 920 | if (!dst) | ||
| 921 | dst = inet_csk_rebuild_route(sk, &inet->cork.fl); | ||
| 922 | out: | ||
| 923 | return dst; | ||
| 924 | } | ||
| 925 | EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); | ||
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 7afa2c3c788..389a2e6a17f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
| @@ -33,7 +33,6 @@ | |||
| 33 | #include <linux/stddef.h> | 33 | #include <linux/stddef.h> |
| 34 | 34 | ||
| 35 | #include <linux/inet_diag.h> | 35 | #include <linux/inet_diag.h> |
| 36 | #include <linux/sock_diag.h> | ||
| 37 | 36 | ||
| 38 | static const struct inet_diag_handler **inet_diag_table; | 37 | static const struct inet_diag_handler **inet_diag_table; |
| 39 | 38 | ||
| @@ -44,25 +43,26 @@ struct inet_diag_entry { | |||
| 44 | u16 dport; | 43 | u16 dport; |
| 45 | u16 family; | 44 | u16 family; |
| 46 | u16 userlocks; | 45 | u16 userlocks; |
| 47 | #if IS_ENABLED(CONFIG_IPV6) | ||
| 48 | struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */ | ||
| 49 | struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */ | ||
| 50 | #endif | ||
| 51 | }; | 46 | }; |
| 52 | 47 | ||
| 48 | static struct sock *idiagnl; | ||
| 49 | |||
| 50 | #define INET_DIAG_PUT(skb, attrtype, attrlen) \ | ||
| 51 | RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) | ||
| 52 | |||
| 53 | static DEFINE_MUTEX(inet_diag_table_mutex); | 53 | static DEFINE_MUTEX(inet_diag_table_mutex); |
| 54 | 54 | ||
| 55 | static const struct inet_diag_handler *inet_diag_lock_handler(int proto) | 55 | static const struct inet_diag_handler *inet_diag_lock_handler(int type) |
| 56 | { | 56 | { |
| 57 | if (!inet_diag_table[proto]) | 57 | if (!inet_diag_table[type]) |
| 58 | request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, | 58 | request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, |
| 59 | NETLINK_SOCK_DIAG, AF_INET, proto); | 59 | NETLINK_INET_DIAG, type); |
| 60 | 60 | ||
| 61 | mutex_lock(&inet_diag_table_mutex); | 61 | mutex_lock(&inet_diag_table_mutex); |
| 62 | if (!inet_diag_table[proto]) | 62 | if (!inet_diag_table[type]) |
| 63 | return ERR_PTR(-ENOENT); | 63 | return ERR_PTR(-ENOENT); |
| 64 | 64 | ||
| 65 | return inet_diag_table[proto]; | 65 | return inet_diag_table[type]; |
| 66 | } | 66 | } |
| 67 | 67 | ||
| 68 | static inline void inet_diag_unlock_handler( | 68 | static inline void inet_diag_unlock_handler( |
| @@ -71,91 +71,68 @@ static inline void inet_diag_unlock_handler( | |||
| 71 | mutex_unlock(&inet_diag_table_mutex); | 71 | mutex_unlock(&inet_diag_table_mutex); |
| 72 | } | 72 | } |
| 73 | 73 | ||
| 74 | int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | 74 | static int inet_csk_diag_fill(struct sock *sk, |
| 75 | struct sk_buff *skb, struct inet_diag_req_v2 *req, | 75 | struct sk_buff *skb, |
| 76 | struct user_namespace *user_ns, | 76 | int ext, u32 pid, u32 seq, u16 nlmsg_flags, |
| 77 | u32 portid, u32 seq, u16 nlmsg_flags, | ||
| 78 | const struct nlmsghdr *unlh) | 77 | const struct nlmsghdr *unlh) |
| 79 | { | 78 | { |
| 80 | const struct inet_sock *inet = inet_sk(sk); | 79 | const struct inet_sock *inet = inet_sk(sk); |
| 80 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 81 | struct inet_diag_msg *r; | 81 | struct inet_diag_msg *r; |
| 82 | struct nlmsghdr *nlh; | 82 | struct nlmsghdr *nlh; |
| 83 | struct nlattr *attr; | ||
| 84 | void *info = NULL; | 83 | void *info = NULL; |
| 84 | struct inet_diag_meminfo *minfo = NULL; | ||
| 85 | unsigned char *b = skb_tail_pointer(skb); | ||
| 85 | const struct inet_diag_handler *handler; | 86 | const struct inet_diag_handler *handler; |
| 86 | int ext = req->idiag_ext; | ||
| 87 | 87 | ||
| 88 | handler = inet_diag_table[req->sdiag_protocol]; | 88 | handler = inet_diag_table[unlh->nlmsg_type]; |
| 89 | BUG_ON(handler == NULL); | 89 | BUG_ON(handler == NULL); |
| 90 | 90 | ||
| 91 | nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), | 91 | nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); |
| 92 | nlmsg_flags); | 92 | nlh->nlmsg_flags = nlmsg_flags; |
| 93 | if (!nlh) | ||
| 94 | return -EMSGSIZE; | ||
| 95 | 93 | ||
| 96 | r = nlmsg_data(nlh); | 94 | r = NLMSG_DATA(nlh); |
| 97 | BUG_ON(sk->sk_state == TCP_TIME_WAIT); | 95 | BUG_ON(sk->sk_state == TCP_TIME_WAIT); |
| 98 | 96 | ||
| 97 | if (ext & (1 << (INET_DIAG_MEMINFO - 1))) | ||
| 98 | minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo)); | ||
| 99 | |||
| 100 | if (ext & (1 << (INET_DIAG_INFO - 1))) | ||
| 101 | info = INET_DIAG_PUT(skb, INET_DIAG_INFO, | ||
| 102 | handler->idiag_info_size); | ||
| 103 | |||
| 104 | if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { | ||
| 105 | const size_t len = strlen(icsk->icsk_ca_ops->name); | ||
| 106 | |||
| 107 | strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), | ||
| 108 | icsk->icsk_ca_ops->name); | ||
| 109 | } | ||
| 110 | |||
| 99 | r->idiag_family = sk->sk_family; | 111 | r->idiag_family = sk->sk_family; |
| 100 | r->idiag_state = sk->sk_state; | 112 | r->idiag_state = sk->sk_state; |
| 101 | r->idiag_timer = 0; | 113 | r->idiag_timer = 0; |
| 102 | r->idiag_retrans = 0; | 114 | r->idiag_retrans = 0; |
| 103 | 115 | ||
| 104 | r->id.idiag_if = sk->sk_bound_dev_if; | 116 | r->id.idiag_if = sk->sk_bound_dev_if; |
| 105 | sock_diag_save_cookie(sk, r->id.idiag_cookie); | 117 | r->id.idiag_cookie[0] = (u32)(unsigned long)sk; |
| 118 | r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); | ||
| 106 | 119 | ||
| 107 | r->id.idiag_sport = inet->inet_sport; | 120 | r->id.idiag_sport = inet->inet_sport; |
| 108 | r->id.idiag_dport = inet->inet_dport; | 121 | r->id.idiag_dport = inet->inet_dport; |
| 109 | r->id.idiag_src[0] = inet->inet_rcv_saddr; | 122 | r->id.idiag_src[0] = inet->inet_rcv_saddr; |
| 110 | r->id.idiag_dst[0] = inet->inet_daddr; | 123 | r->id.idiag_dst[0] = inet->inet_daddr; |
| 111 | 124 | ||
| 112 | if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) | 125 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
| 113 | goto errout; | ||
| 114 | |||
| 115 | /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, | ||
| 116 | * hence this needs to be included regardless of socket family. | ||
| 117 | */ | ||
| 118 | if (ext & (1 << (INET_DIAG_TOS - 1))) | ||
| 119 | if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0) | ||
| 120 | goto errout; | ||
| 121 | |||
| 122 | #if IS_ENABLED(CONFIG_IPV6) | ||
| 123 | if (r->idiag_family == AF_INET6) { | 126 | if (r->idiag_family == AF_INET6) { |
| 124 | const struct ipv6_pinfo *np = inet6_sk(sk); | 127 | const struct ipv6_pinfo *np = inet6_sk(sk); |
| 125 | 128 | ||
| 126 | *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; | 129 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, |
| 127 | *(struct in6_addr *)r->id.idiag_dst = np->daddr; | 130 | &np->rcv_saddr); |
| 128 | 131 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | |
| 129 | if (ext & (1 << (INET_DIAG_TCLASS - 1))) | 132 | &np->daddr); |
| 130 | if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0) | ||
| 131 | goto errout; | ||
| 132 | } | 133 | } |
| 133 | #endif | 134 | #endif |
| 134 | 135 | ||
| 135 | r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); | ||
| 136 | r->idiag_inode = sock_i_ino(sk); | ||
| 137 | |||
| 138 | if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { | ||
| 139 | struct inet_diag_meminfo minfo = { | ||
| 140 | .idiag_rmem = sk_rmem_alloc_get(sk), | ||
| 141 | .idiag_wmem = sk->sk_wmem_queued, | ||
| 142 | .idiag_fmem = sk->sk_forward_alloc, | ||
| 143 | .idiag_tmem = sk_wmem_alloc_get(sk), | ||
| 144 | }; | ||
| 145 | |||
| 146 | if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0) | ||
| 147 | goto errout; | ||
| 148 | } | ||
| 149 | |||
| 150 | if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) | ||
| 151 | if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) | ||
| 152 | goto errout; | ||
| 153 | |||
| 154 | if (icsk == NULL) { | ||
| 155 | handler->idiag_get_info(sk, r, NULL); | ||
| 156 | goto out; | ||
| 157 | } | ||
| 158 | |||
| 159 | #define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) | 136 | #define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) |
| 160 | 137 | ||
| 161 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) { | 138 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) { |
| @@ -176,62 +153,47 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | |||
| 176 | } | 153 | } |
| 177 | #undef EXPIRES_IN_MS | 154 | #undef EXPIRES_IN_MS |
| 178 | 155 | ||
| 179 | if (ext & (1 << (INET_DIAG_INFO - 1))) { | 156 | r->idiag_uid = sock_i_uid(sk); |
| 180 | attr = nla_reserve(skb, INET_DIAG_INFO, | 157 | r->idiag_inode = sock_i_ino(sk); |
| 181 | sizeof(struct tcp_info)); | ||
| 182 | if (!attr) | ||
| 183 | goto errout; | ||
| 184 | 158 | ||
| 185 | info = nla_data(attr); | 159 | if (minfo) { |
| 160 | minfo->idiag_rmem = sk_rmem_alloc_get(sk); | ||
| 161 | minfo->idiag_wmem = sk->sk_wmem_queued; | ||
| 162 | minfo->idiag_fmem = sk->sk_forward_alloc; | ||
| 163 | minfo->idiag_tmem = sk_wmem_alloc_get(sk); | ||
| 186 | } | 164 | } |
| 187 | 165 | ||
| 188 | if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) | ||
| 189 | if (nla_put_string(skb, INET_DIAG_CONG, | ||
| 190 | icsk->icsk_ca_ops->name) < 0) | ||
| 191 | goto errout; | ||
| 192 | |||
| 193 | handler->idiag_get_info(sk, r, info); | 166 | handler->idiag_get_info(sk, r, info); |
| 194 | 167 | ||
| 195 | if (sk->sk_state < TCP_TIME_WAIT && | 168 | if (sk->sk_state < TCP_TIME_WAIT && |
| 196 | icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) | 169 | icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) |
| 197 | icsk->icsk_ca_ops->get_info(sk, ext, skb); | 170 | icsk->icsk_ca_ops->get_info(sk, ext, skb); |
| 198 | 171 | ||
| 199 | out: | 172 | nlh->nlmsg_len = skb_tail_pointer(skb) - b; |
| 200 | return nlmsg_end(skb, nlh); | 173 | return skb->len; |
| 201 | 174 | ||
| 202 | errout: | 175 | rtattr_failure: |
| 203 | nlmsg_cancel(skb, nlh); | 176 | nlmsg_failure: |
| 177 | nlmsg_trim(skb, b); | ||
| 204 | return -EMSGSIZE; | 178 | return -EMSGSIZE; |
| 205 | } | 179 | } |
| 206 | EXPORT_SYMBOL_GPL(inet_sk_diag_fill); | ||
| 207 | |||
| 208 | static int inet_csk_diag_fill(struct sock *sk, | ||
| 209 | struct sk_buff *skb, struct inet_diag_req_v2 *req, | ||
| 210 | struct user_namespace *user_ns, | ||
| 211 | u32 portid, u32 seq, u16 nlmsg_flags, | ||
| 212 | const struct nlmsghdr *unlh) | ||
| 213 | { | ||
| 214 | return inet_sk_diag_fill(sk, inet_csk(sk), | ||
| 215 | skb, req, user_ns, portid, seq, nlmsg_flags, unlh); | ||
| 216 | } | ||
| 217 | 180 | ||
| 218 | static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, | 181 | static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, |
| 219 | struct sk_buff *skb, struct inet_diag_req_v2 *req, | 182 | struct sk_buff *skb, int ext, u32 pid, |
| 220 | u32 portid, u32 seq, u16 nlmsg_flags, | 183 | u32 seq, u16 nlmsg_flags, |
| 221 | const struct nlmsghdr *unlh) | 184 | const struct nlmsghdr *unlh) |
| 222 | { | 185 | { |
| 223 | long tmo; | 186 | long tmo; |
| 224 | struct inet_diag_msg *r; | 187 | struct inet_diag_msg *r; |
| 225 | struct nlmsghdr *nlh; | 188 | const unsigned char *previous_tail = skb_tail_pointer(skb); |
| 226 | 189 | struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, | |
| 227 | nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), | 190 | unlh->nlmsg_type, sizeof(*r)); |
| 228 | nlmsg_flags); | ||
| 229 | if (!nlh) | ||
| 230 | return -EMSGSIZE; | ||
| 231 | 191 | ||
| 232 | r = nlmsg_data(nlh); | 192 | r = NLMSG_DATA(nlh); |
| 233 | BUG_ON(tw->tw_state != TCP_TIME_WAIT); | 193 | BUG_ON(tw->tw_state != TCP_TIME_WAIT); |
| 234 | 194 | ||
| 195 | nlh->nlmsg_flags = nlmsg_flags; | ||
| 196 | |||
| 235 | tmo = tw->tw_ttd - jiffies; | 197 | tmo = tw->tw_ttd - jiffies; |
| 236 | if (tmo < 0) | 198 | if (tmo < 0) |
| 237 | tmo = 0; | 199 | tmo = 0; |
| @@ -239,7 +201,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, | |||
| 239 | r->idiag_family = tw->tw_family; | 201 | r->idiag_family = tw->tw_family; |
| 240 | r->idiag_retrans = 0; | 202 | r->idiag_retrans = 0; |
| 241 | r->id.idiag_if = tw->tw_bound_dev_if; | 203 | r->id.idiag_if = tw->tw_bound_dev_if; |
| 242 | sock_diag_save_cookie(tw, r->id.idiag_cookie); | 204 | r->id.idiag_cookie[0] = (u32)(unsigned long)tw; |
| 205 | r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1); | ||
| 243 | r->id.idiag_sport = tw->tw_sport; | 206 | r->id.idiag_sport = tw->tw_sport; |
| 244 | r->id.idiag_dport = tw->tw_dport; | 207 | r->id.idiag_dport = tw->tw_dport; |
| 245 | r->id.idiag_src[0] = tw->tw_rcv_saddr; | 208 | r->id.idiag_src[0] = tw->tw_rcv_saddr; |
| @@ -251,49 +214,62 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, | |||
| 251 | r->idiag_wqueue = 0; | 214 | r->idiag_wqueue = 0; |
| 252 | r->idiag_uid = 0; | 215 | r->idiag_uid = 0; |
| 253 | r->idiag_inode = 0; | 216 | r->idiag_inode = 0; |
| 254 | #if IS_ENABLED(CONFIG_IPV6) | 217 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
| 255 | if (tw->tw_family == AF_INET6) { | 218 | if (tw->tw_family == AF_INET6) { |
| 256 | const struct inet6_timewait_sock *tw6 = | 219 | const struct inet6_timewait_sock *tw6 = |
| 257 | inet6_twsk((struct sock *)tw); | 220 | inet6_twsk((struct sock *)tw); |
| 258 | 221 | ||
| 259 | *(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr; | 222 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, |
| 260 | *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; | 223 | &tw6->tw_v6_rcv_saddr); |
| 224 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, | ||
| 225 | &tw6->tw_v6_daddr); | ||
| 261 | } | 226 | } |
| 262 | #endif | 227 | #endif |
| 263 | 228 | nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; | |
| 264 | return nlmsg_end(skb, nlh); | 229 | return skb->len; |
| 230 | nlmsg_failure: | ||
| 231 | nlmsg_trim(skb, previous_tail); | ||
| 232 | return -EMSGSIZE; | ||
| 265 | } | 233 | } |
| 266 | 234 | ||
| 267 | static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, | 235 | static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, |
| 268 | struct inet_diag_req_v2 *r, | 236 | int ext, u32 pid, u32 seq, u16 nlmsg_flags, |
| 269 | struct user_namespace *user_ns, | ||
| 270 | u32 portid, u32 seq, u16 nlmsg_flags, | ||
| 271 | const struct nlmsghdr *unlh) | 237 | const struct nlmsghdr *unlh) |
| 272 | { | 238 | { |
| 273 | if (sk->sk_state == TCP_TIME_WAIT) | 239 | if (sk->sk_state == TCP_TIME_WAIT) |
| 274 | return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, | 240 | return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, |
| 275 | skb, r, portid, seq, nlmsg_flags, | 241 | skb, ext, pid, seq, nlmsg_flags, |
| 276 | unlh); | 242 | unlh); |
| 277 | return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh); | 243 | return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh); |
| 278 | } | 244 | } |
| 279 | 245 | ||
| 280 | int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, | 246 | static int inet_diag_get_exact(struct sk_buff *in_skb, |
| 281 | const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) | 247 | const struct nlmsghdr *nlh) |
| 282 | { | 248 | { |
| 283 | int err; | 249 | int err; |
| 284 | struct sock *sk; | 250 | struct sock *sk; |
| 251 | struct inet_diag_req *req = NLMSG_DATA(nlh); | ||
| 285 | struct sk_buff *rep; | 252 | struct sk_buff *rep; |
| 286 | struct net *net = sock_net(in_skb->sk); | 253 | struct inet_hashinfo *hashinfo; |
| 254 | const struct inet_diag_handler *handler; | ||
| 287 | 255 | ||
| 256 | handler = inet_diag_lock_handler(nlh->nlmsg_type); | ||
| 257 | if (IS_ERR(handler)) { | ||
| 258 | err = PTR_ERR(handler); | ||
| 259 | goto unlock; | ||
| 260 | } | ||
| 261 | |||
| 262 | hashinfo = handler->idiag_hashinfo; | ||
| 288 | err = -EINVAL; | 263 | err = -EINVAL; |
| 289 | if (req->sdiag_family == AF_INET) { | 264 | |
| 290 | sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], | 265 | if (req->idiag_family == AF_INET) { |
| 266 | sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], | ||
| 291 | req->id.idiag_dport, req->id.idiag_src[0], | 267 | req->id.idiag_dport, req->id.idiag_src[0], |
| 292 | req->id.idiag_sport, req->id.idiag_if); | 268 | req->id.idiag_sport, req->id.idiag_if); |
| 293 | } | 269 | } |
| 294 | #if IS_ENABLED(CONFIG_IPV6) | 270 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
| 295 | else if (req->sdiag_family == AF_INET6) { | 271 | else if (req->idiag_family == AF_INET6) { |
| 296 | sk = inet6_lookup(net, hashinfo, | 272 | sk = inet6_lookup(&init_net, hashinfo, |
| 297 | (struct in6_addr *)req->id.idiag_dst, | 273 | (struct in6_addr *)req->id.idiag_dst, |
| 298 | req->id.idiag_dport, | 274 | req->id.idiag_dport, |
| 299 | (struct in6_addr *)req->id.idiag_src, | 275 | (struct in6_addr *)req->id.idiag_src, |
| @@ -302,35 +278,37 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s | |||
| 302 | } | 278 | } |
| 303 | #endif | 279 | #endif |
| 304 | else { | 280 | else { |
| 305 | goto out_nosk; | 281 | goto unlock; |
| 306 | } | 282 | } |
| 307 | 283 | ||
| 308 | err = -ENOENT; | 284 | err = -ENOENT; |
| 309 | if (sk == NULL) | 285 | if (sk == NULL) |
| 310 | goto out_nosk; | 286 | goto unlock; |
| 311 | 287 | ||
| 312 | err = sock_diag_check_cookie(sk, req->id.idiag_cookie); | 288 | err = -ESTALE; |
| 313 | if (err) | 289 | if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || |
| 290 | req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && | ||
| 291 | ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || | ||
| 292 | (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) | ||
| 314 | goto out; | 293 | goto out; |
| 315 | 294 | ||
| 316 | rep = nlmsg_new(sizeof(struct inet_diag_msg) + | 295 | err = -ENOMEM; |
| 317 | sizeof(struct inet_diag_meminfo) + | 296 | rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + |
| 318 | sizeof(struct tcp_info) + 64, GFP_KERNEL); | 297 | sizeof(struct inet_diag_meminfo) + |
| 319 | if (!rep) { | 298 | handler->idiag_info_size + 64)), |
| 320 | err = -ENOMEM; | 299 | GFP_KERNEL); |
| 300 | if (!rep) | ||
| 321 | goto out; | 301 | goto out; |
| 322 | } | ||
| 323 | 302 | ||
| 324 | err = sk_diag_fill(sk, rep, req, | 303 | err = sk_diag_fill(sk, rep, req->idiag_ext, |
| 325 | sk_user_ns(NETLINK_CB(in_skb).ssk), | 304 | NETLINK_CB(in_skb).pid, |
| 326 | NETLINK_CB(in_skb).portid, | ||
| 327 | nlh->nlmsg_seq, 0, nlh); | 305 | nlh->nlmsg_seq, 0, nlh); |
| 328 | if (err < 0) { | 306 | if (err < 0) { |
| 329 | WARN_ON(err == -EMSGSIZE); | 307 | WARN_ON(err == -EMSGSIZE); |
| 330 | nlmsg_free(rep); | 308 | kfree_skb(rep); |
| 331 | goto out; | 309 | goto out; |
| 332 | } | 310 | } |
| 333 | err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, | 311 | err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, |
| 334 | MSG_DONTWAIT); | 312 | MSG_DONTWAIT); |
| 335 | if (err > 0) | 313 | if (err > 0) |
| 336 | err = 0; | 314 | err = 0; |
| @@ -342,25 +320,8 @@ out: | |||
| 342 | else | 320 | else |
| 343 | sock_put(sk); | 321 | sock_put(sk); |
| 344 | } | 322 | } |
| 345 | out_nosk: | 323 | unlock: |
| 346 | return err; | ||
| 347 | } | ||
| 348 | EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); | ||
| 349 | |||
| 350 | static int inet_diag_get_exact(struct sk_buff *in_skb, | ||
| 351 | const struct nlmsghdr *nlh, | ||
| 352 | struct inet_diag_req_v2 *req) | ||
| 353 | { | ||
| 354 | const struct inet_diag_handler *handler; | ||
| 355 | int err; | ||
| 356 | |||
| 357 | handler = inet_diag_lock_handler(req->sdiag_protocol); | ||
| 358 | if (IS_ERR(handler)) | ||
| 359 | err = PTR_ERR(handler); | ||
| 360 | else | ||
| 361 | err = handler->dump_one(in_skb, nlh, req); | ||
| 362 | inet_diag_unlock_handler(handler); | 324 | inet_diag_unlock_handler(handler); |
| 363 | |||
| 364 | return err; | 325 | return err; |
| 365 | } | 326 | } |
| 366 | 327 | ||
| @@ -391,12 +352,9 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits) | |||
| 391 | } | 352 | } |
| 392 | 353 | ||
| 393 | 354 | ||
| 394 | static int inet_diag_bc_run(const struct nlattr *_bc, | 355 | static int inet_diag_bc_run(const void *bc, int len, |
| 395 | const struct inet_diag_entry *entry) | 356 | const struct inet_diag_entry *entry) |
| 396 | { | 357 | { |
| 397 | const void *bc = nla_data(_bc); | ||
| 398 | int len = nla_len(_bc); | ||
| 399 | |||
| 400 | while (len > 0) { | 358 | while (len > 0) { |
| 401 | int yes = 1; | 359 | int yes = 1; |
| 402 | const struct inet_diag_bc_op *op = bc; | 360 | const struct inet_diag_bc_op *op = bc; |
| @@ -435,31 +393,25 @@ static int inet_diag_bc_run(const struct nlattr *_bc, | |||
| 435 | break; | 393 | break; |
| 436 | } | 394 | } |
| 437 | 395 | ||
| 396 | if (cond->prefix_len == 0) | ||
| 397 | break; | ||
| 398 | |||
| 438 | if (op->code == INET_DIAG_BC_S_COND) | 399 | if (op->code == INET_DIAG_BC_S_COND) |
| 439 | addr = entry->saddr; | 400 | addr = entry->saddr; |
| 440 | else | 401 | else |
| 441 | addr = entry->daddr; | 402 | addr = entry->daddr; |
| 442 | 403 | ||
| 443 | if (cond->family != AF_UNSPEC && | ||
| 444 | cond->family != entry->family) { | ||
| 445 | if (entry->family == AF_INET6 && | ||
| 446 | cond->family == AF_INET) { | ||
| 447 | if (addr[0] == 0 && addr[1] == 0 && | ||
| 448 | addr[2] == htonl(0xffff) && | ||
| 449 | bitstring_match(addr + 3, | ||
| 450 | cond->addr, | ||
| 451 | cond->prefix_len)) | ||
| 452 | break; | ||
| 453 | } | ||
| 454 | yes = 0; | ||
| 455 | break; | ||
| 456 | } | ||
| 457 | |||
| 458 | if (cond->prefix_len == 0) | ||
| 459 | break; | ||
| 460 | if (bitstring_match(addr, cond->addr, | 404 | if (bitstring_match(addr, cond->addr, |
| 461 | cond->prefix_len)) | 405 | cond->prefix_len)) |
| 462 | break; | 406 | break; |
| 407 | if (entry->family == AF_INET6 && | ||
| 408 | cond->family == AF_INET) { | ||
| 409 | if (addr[0] == 0 && addr[1] == 0 && | ||
| 410 | addr[2] == htonl(0xffff) && | ||
| 411 | bitstring_match(addr + 3, cond->addr, | ||
| 412 | cond->prefix_len)) | ||
| 413 | break; | ||
| 414 | } | ||
| 463 | yes = 0; | 415 | yes = 0; |
| 464 | break; | 416 | break; |
| 465 | } | 417 | } |
| @@ -476,35 +428,6 @@ static int inet_diag_bc_run(const struct nlattr *_bc, | |||
| 476 | return len == 0; | 428 | return len == 0; |
| 477 | } | 429 | } |
| 478 | 430 | ||
| 479 | int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) | ||
| 480 | { | ||
| 481 | struct inet_diag_entry entry; | ||
| 482 | struct inet_sock *inet = inet_sk(sk); | ||
| 483 | |||
| 484 | if (bc == NULL) | ||
| 485 | return 1; | ||
| 486 | |||
| 487 | entry.family = sk->sk_family; | ||
| 488 | #if IS_ENABLED(CONFIG_IPV6) | ||
| 489 | if (entry.family == AF_INET6) { | ||
| 490 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
| 491 | |||
| 492 | entry.saddr = np->rcv_saddr.s6_addr32; | ||
| 493 | entry.daddr = np->daddr.s6_addr32; | ||
| 494 | } else | ||
| 495 | #endif | ||
| 496 | { | ||
| 497 | entry.saddr = &inet->inet_rcv_saddr; | ||
| 498 | entry.daddr = &inet->inet_daddr; | ||
| 499 | } | ||
| 500 | entry.sport = inet->inet_num; | ||
| 501 | entry.dport = ntohs(inet->inet_dport); | ||
| 502 | entry.userlocks = sk->sk_userlocks; | ||
| 503 | |||
| 504 | return inet_diag_bc_run(bc, &entry); | ||
| 505 | } | ||
| 506 | EXPORT_SYMBOL_GPL(inet_diag_bc_sk); | ||
| 507 | |||
| 508 | static int valid_cc(const void *bc, int len, int cc) | 431 | static int valid_cc(const void *bc, int len, int cc) |
| 509 | { | 432 | { |
| 510 | while (len >= 0) { | 433 | while (len >= 0) { |
| @@ -522,55 +445,6 @@ static int valid_cc(const void *bc, int len, int cc) | |||
| 522 | return 0; | 445 | return 0; |
| 523 | } | 446 | } |
| 524 | 447 | ||
| 525 | /* Validate an inet_diag_hostcond. */ | ||
| 526 | static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, | ||
| 527 | int *min_len) | ||
| 528 | { | ||
| 529 | int addr_len; | ||
| 530 | struct inet_diag_hostcond *cond; | ||
| 531 | |||
| 532 | /* Check hostcond space. */ | ||
| 533 | *min_len += sizeof(struct inet_diag_hostcond); | ||
| 534 | if (len < *min_len) | ||
| 535 | return false; | ||
| 536 | cond = (struct inet_diag_hostcond *)(op + 1); | ||
| 537 | |||
| 538 | /* Check address family and address length. */ | ||
| 539 | switch (cond->family) { | ||
| 540 | case AF_UNSPEC: | ||
| 541 | addr_len = 0; | ||
| 542 | break; | ||
| 543 | case AF_INET: | ||
| 544 | addr_len = sizeof(struct in_addr); | ||
| 545 | break; | ||
| 546 | case AF_INET6: | ||
| 547 | addr_len = sizeof(struct in6_addr); | ||
| 548 | break; | ||
| 549 | default: | ||
| 550 | return false; | ||
| 551 | } | ||
| 552 | *min_len += addr_len; | ||
| 553 | if (len < *min_len) | ||
| 554 | return false; | ||
| 555 | |||
| 556 | /* Check prefix length (in bits) vs address length (in bytes). */ | ||
| 557 | if (cond->prefix_len > 8 * addr_len) | ||
| 558 | return false; | ||
| 559 | |||
| 560 | return true; | ||
| 561 | } | ||
| 562 | |||
| 563 | /* Validate a port comparison operator. */ | ||
| 564 | static inline bool valid_port_comparison(const struct inet_diag_bc_op *op, | ||
| 565 | int len, int *min_len) | ||
| 566 | { | ||
| 567 | /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ | ||
| 568 | *min_len += sizeof(struct inet_diag_bc_op); | ||
| 569 | if (len < *min_len) | ||
| 570 | return false; | ||
| 571 | return true; | ||
| 572 | } | ||
| 573 | |||
| 574 | static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) | 448 | static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) |
| 575 | { | 449 | { |
| 576 | const void *bc = bytecode; | 450 | const void *bc = bytecode; |
| @@ -578,39 +452,29 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) | |||
| 578 | 452 | ||
| 579 | while (len > 0) { | 453 | while (len > 0) { |
| 580 | const struct inet_diag_bc_op *op = bc; | 454 | const struct inet_diag_bc_op *op = bc; |
| 581 | int min_len = sizeof(struct inet_diag_bc_op); | ||
| 582 | 455 | ||
| 583 | //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); | 456 | //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); |
| 584 | switch (op->code) { | 457 | switch (op->code) { |
| 458 | case INET_DIAG_BC_AUTO: | ||
| 585 | case INET_DIAG_BC_S_COND: | 459 | case INET_DIAG_BC_S_COND: |
| 586 | case INET_DIAG_BC_D_COND: | 460 | case INET_DIAG_BC_D_COND: |
| 587 | if (!valid_hostcond(bc, len, &min_len)) | ||
| 588 | return -EINVAL; | ||
| 589 | break; | ||
| 590 | case INET_DIAG_BC_S_GE: | 461 | case INET_DIAG_BC_S_GE: |
| 591 | case INET_DIAG_BC_S_LE: | 462 | case INET_DIAG_BC_S_LE: |
| 592 | case INET_DIAG_BC_D_GE: | 463 | case INET_DIAG_BC_D_GE: |
| 593 | case INET_DIAG_BC_D_LE: | 464 | case INET_DIAG_BC_D_LE: |
| 594 | if (!valid_port_comparison(bc, len, &min_len)) | 465 | case INET_DIAG_BC_JMP: |
| 466 | if (op->no < 4 || op->no > len + 4 || op->no & 3) | ||
| 467 | return -EINVAL; | ||
| 468 | if (op->no < len && | ||
| 469 | !valid_cc(bytecode, bytecode_len, len - op->no)) | ||
| 595 | return -EINVAL; | 470 | return -EINVAL; |
| 596 | break; | 471 | break; |
| 597 | case INET_DIAG_BC_AUTO: | ||
| 598 | case INET_DIAG_BC_JMP: | ||
| 599 | case INET_DIAG_BC_NOP: | 472 | case INET_DIAG_BC_NOP: |
| 600 | break; | 473 | break; |
| 601 | default: | 474 | default: |
| 602 | return -EINVAL; | 475 | return -EINVAL; |
| 603 | } | 476 | } |
| 604 | 477 | if (op->yes < 4 || op->yes > len + 4 || op->yes & 3) | |
| 605 | if (op->code != INET_DIAG_BC_NOP) { | ||
| 606 | if (op->no < min_len || op->no > len + 4 || op->no & 3) | ||
| 607 | return -EINVAL; | ||
| 608 | if (op->no < len && | ||
| 609 | !valid_cc(bytecode, bytecode_len, len - op->no)) | ||
| 610 | return -EINVAL; | ||
| 611 | } | ||
| 612 | |||
| 613 | if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) | ||
| 614 | return -EINVAL; | 478 | return -EINVAL; |
| 615 | bc += op->yes; | 479 | bc += op->yes; |
| 616 | len -= op->yes; | 480 | len -= op->yes; |
| @@ -620,30 +484,57 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) | |||
| 620 | 484 | ||
| 621 | static int inet_csk_diag_dump(struct sock *sk, | 485 | static int inet_csk_diag_dump(struct sock *sk, |
| 622 | struct sk_buff *skb, | 486 | struct sk_buff *skb, |
| 623 | struct netlink_callback *cb, | 487 | struct netlink_callback *cb) |
| 624 | struct inet_diag_req_v2 *r, | ||
| 625 | const struct nlattr *bc) | ||
| 626 | { | 488 | { |
| 627 | if (!inet_diag_bc_sk(bc, sk)) | 489 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); |
| 628 | return 0; | ||
| 629 | 490 | ||
| 630 | return inet_csk_diag_fill(sk, skb, r, | 491 | if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { |
| 631 | sk_user_ns(NETLINK_CB(cb->skb).ssk), | 492 | struct inet_diag_entry entry; |
| 632 | NETLINK_CB(cb->skb).portid, | 493 | const struct nlattr *bc = nlmsg_find_attr(cb->nlh, |
| 494 | sizeof(*r), | ||
| 495 | INET_DIAG_REQ_BYTECODE); | ||
| 496 | struct inet_sock *inet = inet_sk(sk); | ||
| 497 | |||
| 498 | entry.family = sk->sk_family; | ||
| 499 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 500 | if (entry.family == AF_INET6) { | ||
| 501 | struct ipv6_pinfo *np = inet6_sk(sk); | ||
| 502 | |||
| 503 | entry.saddr = np->rcv_saddr.s6_addr32; | ||
| 504 | entry.daddr = np->daddr.s6_addr32; | ||
| 505 | } else | ||
| 506 | #endif | ||
| 507 | { | ||
| 508 | entry.saddr = &inet->inet_rcv_saddr; | ||
| 509 | entry.daddr = &inet->inet_daddr; | ||
| 510 | } | ||
| 511 | entry.sport = inet->inet_num; | ||
| 512 | entry.dport = ntohs(inet->inet_dport); | ||
| 513 | entry.userlocks = sk->sk_userlocks; | ||
| 514 | |||
| 515 | if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) | ||
| 516 | return 0; | ||
| 517 | } | ||
| 518 | |||
| 519 | return inet_csk_diag_fill(sk, skb, r->idiag_ext, | ||
| 520 | NETLINK_CB(cb->skb).pid, | ||
| 633 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); | 521 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); |
| 634 | } | 522 | } |
| 635 | 523 | ||
| 636 | static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, | 524 | static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, |
| 637 | struct sk_buff *skb, | 525 | struct sk_buff *skb, |
| 638 | struct netlink_callback *cb, | 526 | struct netlink_callback *cb) |
| 639 | struct inet_diag_req_v2 *r, | ||
| 640 | const struct nlattr *bc) | ||
| 641 | { | 527 | { |
| 642 | if (bc != NULL) { | 528 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); |
| 529 | |||
| 530 | if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { | ||
| 643 | struct inet_diag_entry entry; | 531 | struct inet_diag_entry entry; |
| 532 | const struct nlattr *bc = nlmsg_find_attr(cb->nlh, | ||
| 533 | sizeof(*r), | ||
| 534 | INET_DIAG_REQ_BYTECODE); | ||
| 644 | 535 | ||
| 645 | entry.family = tw->tw_family; | 536 | entry.family = tw->tw_family; |
| 646 | #if IS_ENABLED(CONFIG_IPV6) | 537 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
| 647 | if (tw->tw_family == AF_INET6) { | 538 | if (tw->tw_family == AF_INET6) { |
| 648 | struct inet6_timewait_sock *tw6 = | 539 | struct inet6_timewait_sock *tw6 = |
| 649 | inet6_twsk((struct sock *)tw); | 540 | inet6_twsk((struct sock *)tw); |
| @@ -659,70 +550,38 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, | |||
| 659 | entry.dport = ntohs(tw->tw_dport); | 550 | entry.dport = ntohs(tw->tw_dport); |
| 660 | entry.userlocks = 0; | 551 | entry.userlocks = 0; |
| 661 | 552 | ||
| 662 | if (!inet_diag_bc_run(bc, &entry)) | 553 | if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) |
| 663 | return 0; | 554 | return 0; |
| 664 | } | 555 | } |
| 665 | 556 | ||
| 666 | return inet_twsk_diag_fill(tw, skb, r, | 557 | return inet_twsk_diag_fill(tw, skb, r->idiag_ext, |
| 667 | NETLINK_CB(cb->skb).portid, | 558 | NETLINK_CB(cb->skb).pid, |
| 668 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); | 559 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); |
| 669 | } | 560 | } |
| 670 | 561 | ||
| 671 | /* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses | ||
| 672 | * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6. | ||
| 673 | */ | ||
| 674 | static inline void inet_diag_req_addrs(const struct sock *sk, | ||
| 675 | const struct request_sock *req, | ||
| 676 | struct inet_diag_entry *entry) | ||
| 677 | { | ||
| 678 | struct inet_request_sock *ireq = inet_rsk(req); | ||
| 679 | |||
| 680 | #if IS_ENABLED(CONFIG_IPV6) | ||
| 681 | if (sk->sk_family == AF_INET6) { | ||
| 682 | if (req->rsk_ops->family == AF_INET6) { | ||
| 683 | entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32; | ||
| 684 | entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32; | ||
| 685 | } else if (req->rsk_ops->family == AF_INET) { | ||
| 686 | ipv6_addr_set_v4mapped(ireq->loc_addr, | ||
| 687 | &entry->saddr_storage); | ||
| 688 | ipv6_addr_set_v4mapped(ireq->rmt_addr, | ||
| 689 | &entry->daddr_storage); | ||
| 690 | entry->saddr = entry->saddr_storage.s6_addr32; | ||
| 691 | entry->daddr = entry->daddr_storage.s6_addr32; | ||
| 692 | } | ||
| 693 | } else | ||
| 694 | #endif | ||
| 695 | { | ||
| 696 | entry->saddr = &ireq->loc_addr; | ||
| 697 | entry->daddr = &ireq->rmt_addr; | ||
| 698 | } | ||
| 699 | } | ||
| 700 | |||
| 701 | static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, | 562 | static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, |
| 702 | struct request_sock *req, | 563 | struct request_sock *req, u32 pid, u32 seq, |
| 703 | struct user_namespace *user_ns, | ||
| 704 | u32 portid, u32 seq, | ||
| 705 | const struct nlmsghdr *unlh) | 564 | const struct nlmsghdr *unlh) |
| 706 | { | 565 | { |
| 707 | const struct inet_request_sock *ireq = inet_rsk(req); | 566 | const struct inet_request_sock *ireq = inet_rsk(req); |
| 708 | struct inet_sock *inet = inet_sk(sk); | 567 | struct inet_sock *inet = inet_sk(sk); |
| 568 | unsigned char *b = skb_tail_pointer(skb); | ||
| 709 | struct inet_diag_msg *r; | 569 | struct inet_diag_msg *r; |
| 710 | struct nlmsghdr *nlh; | 570 | struct nlmsghdr *nlh; |
| 711 | long tmo; | 571 | long tmo; |
| 712 | 572 | ||
| 713 | nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), | 573 | nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); |
| 714 | NLM_F_MULTI); | 574 | nlh->nlmsg_flags = NLM_F_MULTI; |
| 715 | if (!nlh) | 575 | r = NLMSG_DATA(nlh); |
| 716 | return -EMSGSIZE; | ||
| 717 | 576 | ||
| 718 | r = nlmsg_data(nlh); | ||
| 719 | r->idiag_family = sk->sk_family; | 577 | r->idiag_family = sk->sk_family; |
| 720 | r->idiag_state = TCP_SYN_RECV; | 578 | r->idiag_state = TCP_SYN_RECV; |
| 721 | r->idiag_timer = 1; | 579 | r->idiag_timer = 1; |
| 722 | r->idiag_retrans = req->num_retrans; | 580 | r->idiag_retrans = req->retrans; |
| 723 | 581 | ||
| 724 | r->id.idiag_if = sk->sk_bound_dev_if; | 582 | r->id.idiag_if = sk->sk_bound_dev_if; |
| 725 | sock_diag_save_cookie(req, r->id.idiag_cookie); | 583 | r->id.idiag_cookie[0] = (u32)(unsigned long)req; |
| 584 | r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); | ||
| 726 | 585 | ||
| 727 | tmo = req->expires - jiffies; | 586 | tmo = req->expires - jiffies; |
| 728 | if (tmo < 0) | 587 | if (tmo < 0) |
| @@ -735,28 +594,33 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, | |||
| 735 | r->idiag_expires = jiffies_to_msecs(tmo); | 594 | r->idiag_expires = jiffies_to_msecs(tmo); |
| 736 | r->idiag_rqueue = 0; | 595 | r->idiag_rqueue = 0; |
| 737 | r->idiag_wqueue = 0; | 596 | r->idiag_wqueue = 0; |
| 738 | r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); | 597 | r->idiag_uid = sock_i_uid(sk); |
| 739 | r->idiag_inode = 0; | 598 | r->idiag_inode = 0; |
| 740 | #if IS_ENABLED(CONFIG_IPV6) | 599 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
| 741 | if (r->idiag_family == AF_INET6) { | 600 | if (r->idiag_family == AF_INET6) { |
| 742 | struct inet_diag_entry entry; | 601 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, |
| 743 | inet_diag_req_addrs(sk, req, &entry); | 602 | &inet6_rsk(req)->loc_addr); |
| 744 | memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); | 603 | ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, |
| 745 | memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr)); | 604 | &inet6_rsk(req)->rmt_addr); |
| 746 | } | 605 | } |
| 747 | #endif | 606 | #endif |
| 607 | nlh->nlmsg_len = skb_tail_pointer(skb) - b; | ||
| 608 | |||
| 609 | return skb->len; | ||
| 748 | 610 | ||
| 749 | return nlmsg_end(skb, nlh); | 611 | nlmsg_failure: |
| 612 | nlmsg_trim(skb, b); | ||
| 613 | return -1; | ||
| 750 | } | 614 | } |
| 751 | 615 | ||
| 752 | static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | 616 | static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, |
| 753 | struct netlink_callback *cb, | 617 | struct netlink_callback *cb) |
| 754 | struct inet_diag_req_v2 *r, | ||
| 755 | const struct nlattr *bc) | ||
| 756 | { | 618 | { |
| 757 | struct inet_diag_entry entry; | 619 | struct inet_diag_entry entry; |
| 620 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); | ||
| 758 | struct inet_connection_sock *icsk = inet_csk(sk); | 621 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 759 | struct listen_sock *lopt; | 622 | struct listen_sock *lopt; |
| 623 | const struct nlattr *bc = NULL; | ||
| 760 | struct inet_sock *inet = inet_sk(sk); | 624 | struct inet_sock *inet = inet_sk(sk); |
| 761 | int j, s_j; | 625 | int j, s_j; |
| 762 | int reqnum, s_reqnum; | 626 | int reqnum, s_reqnum; |
| @@ -776,7 +640,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | |||
| 776 | if (!lopt || !lopt->qlen) | 640 | if (!lopt || !lopt->qlen) |
| 777 | goto out; | 641 | goto out; |
| 778 | 642 | ||
| 779 | if (bc != NULL) { | 643 | if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { |
| 644 | bc = nlmsg_find_attr(cb->nlh, sizeof(*r), | ||
| 645 | INET_DIAG_REQ_BYTECODE); | ||
| 780 | entry.sport = inet->inet_num; | 646 | entry.sport = inet->inet_num; |
| 781 | entry.userlocks = sk->sk_userlocks; | 647 | entry.userlocks = sk->sk_userlocks; |
| 782 | } | 648 | } |
| @@ -795,16 +661,27 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | |||
| 795 | continue; | 661 | continue; |
| 796 | 662 | ||
| 797 | if (bc) { | 663 | if (bc) { |
| 798 | inet_diag_req_addrs(sk, req, &entry); | 664 | entry.saddr = |
| 665 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 666 | (entry.family == AF_INET6) ? | ||
| 667 | inet6_rsk(req)->loc_addr.s6_addr32 : | ||
| 668 | #endif | ||
| 669 | &ireq->loc_addr; | ||
| 670 | entry.daddr = | ||
| 671 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | ||
| 672 | (entry.family == AF_INET6) ? | ||
| 673 | inet6_rsk(req)->rmt_addr.s6_addr32 : | ||
| 674 | #endif | ||
| 675 | &ireq->rmt_addr; | ||
| 799 | entry.dport = ntohs(ireq->rmt_port); | 676 | entry.dport = ntohs(ireq->rmt_port); |
| 800 | 677 | ||
| 801 | if (!inet_diag_bc_run(bc, &entry)) | 678 | if (!inet_diag_bc_run(nla_data(bc), |
| 679 | nla_len(bc), &entry)) | ||
| 802 | continue; | 680 | continue; |
| 803 | } | 681 | } |
| 804 | 682 | ||
| 805 | err = inet_diag_fill_req(skb, sk, req, | 683 | err = inet_diag_fill_req(skb, sk, req, |
| 806 | sk_user_ns(NETLINK_CB(cb->skb).ssk), | 684 | NETLINK_CB(cb->skb).pid, |
| 807 | NETLINK_CB(cb->skb).portid, | ||
| 808 | cb->nlh->nlmsg_seq, cb->nlh); | 685 | cb->nlh->nlmsg_seq, cb->nlh); |
| 809 | if (err < 0) { | 686 | if (err < 0) { |
| 810 | cb->args[3] = j + 1; | 687 | cb->args[3] = j + 1; |
| @@ -822,12 +699,19 @@ out: | |||
| 822 | return err; | 699 | return err; |
| 823 | } | 700 | } |
| 824 | 701 | ||
| 825 | void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, | 702 | static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) |
| 826 | struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc) | ||
| 827 | { | 703 | { |
| 828 | int i, num; | 704 | int i, num; |
| 829 | int s_i, s_num; | 705 | int s_i, s_num; |
| 830 | struct net *net = sock_net(skb->sk); | 706 | struct inet_diag_req *r = NLMSG_DATA(cb->nlh); |
| 707 | const struct inet_diag_handler *handler; | ||
| 708 | struct inet_hashinfo *hashinfo; | ||
| 709 | |||
| 710 | handler = inet_diag_lock_handler(cb->nlh->nlmsg_type); | ||
| 711 | if (IS_ERR(handler)) | ||
| 712 | goto unlock; | ||
| 713 | |||
| 714 | hashinfo = handler->idiag_hashinfo; | ||
| 831 | 715 | ||
| 832 | s_i = cb->args[1]; | 716 | s_i = cb->args[1]; |
| 833 | s_num = num = cb->args[2]; | 717 | s_num = num = cb->args[2]; |
| @@ -847,18 +731,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, | |||
| 847 | sk_nulls_for_each(sk, node, &ilb->head) { | 731 | sk_nulls_for_each(sk, node, &ilb->head) { |
| 848 | struct inet_sock *inet = inet_sk(sk); | 732 | struct inet_sock *inet = inet_sk(sk); |
| 849 | 733 | ||
| 850 | if (!net_eq(sock_net(sk), net)) | ||
| 851 | continue; | ||
| 852 | |||
| 853 | if (num < s_num) { | 734 | if (num < s_num) { |
| 854 | num++; | 735 | num++; |
| 855 | continue; | 736 | continue; |
| 856 | } | 737 | } |
| 857 | 738 | ||
| 858 | if (r->sdiag_family != AF_UNSPEC && | ||
| 859 | sk->sk_family != r->sdiag_family) | ||
| 860 | goto next_listen; | ||
| 861 | |||
| 862 | if (r->id.idiag_sport != inet->inet_sport && | 739 | if (r->id.idiag_sport != inet->inet_sport && |
| 863 | r->id.idiag_sport) | 740 | r->id.idiag_sport) |
| 864 | goto next_listen; | 741 | goto next_listen; |
| @@ -868,7 +745,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, | |||
| 868 | cb->args[3] > 0) | 745 | cb->args[3] > 0) |
| 869 | goto syn_recv; | 746 | goto syn_recv; |
| 870 | 747 | ||
| 871 | if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { | 748 | if (inet_csk_diag_dump(sk, skb, cb) < 0) { |
| 872 | spin_unlock_bh(&ilb->lock); | 749 | spin_unlock_bh(&ilb->lock); |
| 873 | goto done; | 750 | goto done; |
| 874 | } | 751 | } |
| @@ -877,7 +754,7 @@ syn_recv: | |||
| 877 | if (!(r->idiag_states & TCPF_SYN_RECV)) | 754 | if (!(r->idiag_states & TCPF_SYN_RECV)) |
| 878 | goto next_listen; | 755 | goto next_listen; |
| 879 | 756 | ||
| 880 | if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { | 757 | if (inet_diag_dump_reqs(skb, sk, cb) < 0) { |
| 881 | spin_unlock_bh(&ilb->lock); | 758 | spin_unlock_bh(&ilb->lock); |
| 882 | goto done; | 759 | goto done; |
| 883 | } | 760 | } |
| @@ -899,7 +776,7 @@ skip_listen_ht: | |||
| 899 | } | 776 | } |
| 900 | 777 | ||
| 901 | if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) | 778 | if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) |
| 902 | goto out; | 779 | goto unlock; |
| 903 | 780 | ||
| 904 | for (i = s_i; i <= hashinfo->ehash_mask; i++) { | 781 | for (i = s_i; i <= hashinfo->ehash_mask; i++) { |
| 905 | struct inet_ehash_bucket *head = &hashinfo->ehash[i]; | 782 | struct inet_ehash_bucket *head = &hashinfo->ehash[i]; |
| @@ -920,22 +797,17 @@ skip_listen_ht: | |||
| 920 | sk_nulls_for_each(sk, node, &head->chain) { | 797 | sk_nulls_for_each(sk, node, &head->chain) { |
| 921 | struct inet_sock *inet = inet_sk(sk); | 798 | struct inet_sock *inet = inet_sk(sk); |
| 922 | 799 | ||
| 923 | if (!net_eq(sock_net(sk), net)) | ||
| 924 | continue; | ||
| 925 | if (num < s_num) | 800 | if (num < s_num) |
| 926 | goto next_normal; | 801 | goto next_normal; |
| 927 | if (!(r->idiag_states & (1 << sk->sk_state))) | 802 | if (!(r->idiag_states & (1 << sk->sk_state))) |
| 928 | goto next_normal; | 803 | goto next_normal; |
| 929 | if (r->sdiag_family != AF_UNSPEC && | ||
| 930 | sk->sk_family != r->sdiag_family) | ||
| 931 | goto next_normal; | ||
| 932 | if (r->id.idiag_sport != inet->inet_sport && | 804 | if (r->id.idiag_sport != inet->inet_sport && |
| 933 | r->id.idiag_sport) | 805 | r->id.idiag_sport) |
| 934 | goto next_normal; | 806 | goto next_normal; |
| 935 | if (r->id.idiag_dport != inet->inet_dport && | 807 | if (r->id.idiag_dport != inet->inet_dport && |
| 936 | r->id.idiag_dport) | 808 | r->id.idiag_dport) |
| 937 | goto next_normal; | 809 | goto next_normal; |
| 938 | if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { | 810 | if (inet_csk_diag_dump(sk, skb, cb) < 0) { |
| 939 | spin_unlock_bh(lock); | 811 | spin_unlock_bh(lock); |
| 940 | goto done; | 812 | goto done; |
| 941 | } | 813 | } |
| @@ -948,21 +820,16 @@ next_normal: | |||
| 948 | 820 | ||
| 949 | inet_twsk_for_each(tw, node, | 821 | inet_twsk_for_each(tw, node, |
| 950 | &head->twchain) { | 822 | &head->twchain) { |
| 951 | if (!net_eq(twsk_net(tw), net)) | ||
| 952 | continue; | ||
| 953 | 823 | ||
| 954 | if (num < s_num) | 824 | if (num < s_num) |
| 955 | goto next_dying; | 825 | goto next_dying; |
| 956 | if (r->sdiag_family != AF_UNSPEC && | ||
| 957 | tw->tw_family != r->sdiag_family) | ||
| 958 | goto next_dying; | ||
| 959 | if (r->id.idiag_sport != tw->tw_sport && | 826 | if (r->id.idiag_sport != tw->tw_sport && |
| 960 | r->id.idiag_sport) | 827 | r->id.idiag_sport) |
| 961 | goto next_dying; | 828 | goto next_dying; |
| 962 | if (r->id.idiag_dport != tw->tw_dport && | 829 | if (r->id.idiag_dport != tw->tw_dport && |
| 963 | r->id.idiag_dport) | 830 | r->id.idiag_dport) |
| 964 | goto next_dying; | 831 | goto next_dying; |
| 965 | if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) { | 832 | if (inet_twsk_diag_dump(tw, skb, cb) < 0) { |
| 966 | spin_unlock_bh(lock); | 833 | spin_unlock_bh(lock); |
| 967 | goto done; | 834 | goto done; |
| 968 | } | 835 | } |
| @@ -976,89 +843,15 @@ next_dying: | |||
| 976 | done: | 843 | done: |
| 977 | cb->args[1] = i; | 844 | cb->args[1] = i; |
| 978 | cb->args[2] = num; | 845 | cb->args[2] = num; |
| 979 | out: | 846 | unlock: |
| 980 | ; | ||
| 981 | } | ||
| 982 | EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); | ||
| 983 | |||
| 984 | static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, | ||
| 985 | struct inet_diag_req_v2 *r, struct nlattr *bc) | ||
| 986 | { | ||
| 987 | const struct inet_diag_handler *handler; | ||
| 988 | int err = 0; | ||
| 989 | |||
| 990 | handler = inet_diag_lock_handler(r->sdiag_protocol); | ||
| 991 | if (!IS_ERR(handler)) | ||
| 992 | handler->dump(skb, cb, r, bc); | ||
| 993 | else | ||
| 994 | err = PTR_ERR(handler); | ||
| 995 | inet_diag_unlock_handler(handler); | 847 | inet_diag_unlock_handler(handler); |
| 996 | 848 | return skb->len; | |
| 997 | return err ? : skb->len; | ||
| 998 | } | ||
| 999 | |||
| 1000 | static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) | ||
| 1001 | { | ||
| 1002 | struct nlattr *bc = NULL; | ||
| 1003 | int hdrlen = sizeof(struct inet_diag_req_v2); | ||
| 1004 | |||
| 1005 | if (nlmsg_attrlen(cb->nlh, hdrlen)) | ||
| 1006 | bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); | ||
| 1007 | |||
| 1008 | return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); | ||
| 1009 | } | 849 | } |
| 1010 | 850 | ||
| 1011 | static inline int inet_diag_type2proto(int type) | 851 | static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
| 1012 | { | 852 | { |
| 1013 | switch (type) { | ||
| 1014 | case TCPDIAG_GETSOCK: | ||
| 1015 | return IPPROTO_TCP; | ||
| 1016 | case DCCPDIAG_GETSOCK: | ||
| 1017 | return IPPROTO_DCCP; | ||
| 1018 | default: | ||
| 1019 | return 0; | ||
| 1020 | } | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) | ||
| 1024 | { | ||
| 1025 | struct inet_diag_req *rc = nlmsg_data(cb->nlh); | ||
| 1026 | struct inet_diag_req_v2 req; | ||
| 1027 | struct nlattr *bc = NULL; | ||
| 1028 | int hdrlen = sizeof(struct inet_diag_req); | 853 | int hdrlen = sizeof(struct inet_diag_req); |
| 1029 | 854 | ||
| 1030 | req.sdiag_family = AF_UNSPEC; /* compatibility */ | ||
| 1031 | req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); | ||
| 1032 | req.idiag_ext = rc->idiag_ext; | ||
| 1033 | req.idiag_states = rc->idiag_states; | ||
| 1034 | req.id = rc->id; | ||
| 1035 | |||
| 1036 | if (nlmsg_attrlen(cb->nlh, hdrlen)) | ||
| 1037 | bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); | ||
| 1038 | |||
| 1039 | return __inet_diag_dump(skb, cb, &req, bc); | ||
| 1040 | } | ||
| 1041 | |||
| 1042 | static int inet_diag_get_exact_compat(struct sk_buff *in_skb, | ||
| 1043 | const struct nlmsghdr *nlh) | ||
| 1044 | { | ||
| 1045 | struct inet_diag_req *rc = nlmsg_data(nlh); | ||
| 1046 | struct inet_diag_req_v2 req; | ||
| 1047 | |||
| 1048 | req.sdiag_family = rc->idiag_family; | ||
| 1049 | req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type); | ||
| 1050 | req.idiag_ext = rc->idiag_ext; | ||
| 1051 | req.idiag_states = rc->idiag_states; | ||
| 1052 | req.id = rc->id; | ||
| 1053 | |||
| 1054 | return inet_diag_get_exact(in_skb, nlh, &req); | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) | ||
| 1058 | { | ||
| 1059 | int hdrlen = sizeof(struct inet_diag_req); | ||
| 1060 | struct net *net = sock_net(skb->sk); | ||
| 1061 | |||
| 1062 | if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || | 855 | if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || |
| 1063 | nlmsg_len(nlh) < hdrlen) | 856 | nlmsg_len(nlh) < hdrlen) |
| 1064 | return -EINVAL; | 857 | return -EINVAL; |
| @@ -1074,62 +867,29 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 1074 | inet_diag_bc_audit(nla_data(attr), nla_len(attr))) | 867 | inet_diag_bc_audit(nla_data(attr), nla_len(attr))) |
| 1075 | return -EINVAL; | 868 | return -EINVAL; |
| 1076 | } | 869 | } |
| 1077 | { | ||
| 1078 | struct netlink_dump_control c = { | ||
| 1079 | .dump = inet_diag_dump_compat, | ||
| 1080 | }; | ||
| 1081 | return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); | ||
| 1082 | } | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | return inet_diag_get_exact_compat(skb, nlh); | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) | ||
| 1089 | { | ||
| 1090 | int hdrlen = sizeof(struct inet_diag_req_v2); | ||
| 1091 | struct net *net = sock_net(skb->sk); | ||
| 1092 | 870 | ||
| 1093 | if (nlmsg_len(h) < hdrlen) | 871 | return netlink_dump_start(idiagnl, skb, nlh, |
| 1094 | return -EINVAL; | 872 | inet_diag_dump, NULL, 0); |
| 1095 | |||
| 1096 | if (h->nlmsg_flags & NLM_F_DUMP) { | ||
| 1097 | if (nlmsg_attrlen(h, hdrlen)) { | ||
| 1098 | struct nlattr *attr; | ||
| 1099 | attr = nlmsg_find_attr(h, hdrlen, | ||
| 1100 | INET_DIAG_REQ_BYTECODE); | ||
| 1101 | if (attr == NULL || | ||
| 1102 | nla_len(attr) < sizeof(struct inet_diag_bc_op) || | ||
| 1103 | inet_diag_bc_audit(nla_data(attr), nla_len(attr))) | ||
| 1104 | return -EINVAL; | ||
| 1105 | } | ||
| 1106 | { | ||
| 1107 | struct netlink_dump_control c = { | ||
| 1108 | .dump = inet_diag_dump, | ||
| 1109 | }; | ||
| 1110 | return netlink_dump_start(net->diag_nlsk, skb, h, &c); | ||
| 1111 | } | ||
| 1112 | } | 873 | } |
| 1113 | 874 | ||
| 1114 | return inet_diag_get_exact(skb, h, nlmsg_data(h)); | 875 | return inet_diag_get_exact(skb, nlh); |
| 1115 | } | 876 | } |
| 1116 | 877 | ||
| 1117 | static const struct sock_diag_handler inet_diag_handler = { | 878 | static DEFINE_MUTEX(inet_diag_mutex); |
| 1118 | .family = AF_INET, | ||
| 1119 | .dump = inet_diag_handler_dump, | ||
| 1120 | }; | ||
| 1121 | 879 | ||
| 1122 | static const struct sock_diag_handler inet6_diag_handler = { | 880 | static void inet_diag_rcv(struct sk_buff *skb) |
| 1123 | .family = AF_INET6, | 881 | { |
| 1124 | .dump = inet_diag_handler_dump, | 882 | mutex_lock(&inet_diag_mutex); |
| 1125 | }; | 883 | netlink_rcv_skb(skb, &inet_diag_rcv_msg); |
| 884 | mutex_unlock(&inet_diag_mutex); | ||
| 885 | } | ||
| 1126 | 886 | ||
| 1127 | int inet_diag_register(const struct inet_diag_handler *h) | 887 | int inet_diag_register(const struct inet_diag_handler *h) |
| 1128 | { | 888 | { |
| 1129 | const __u16 type = h->idiag_type; | 889 | const __u16 type = h->idiag_type; |
| 1130 | int err = -EINVAL; | 890 | int err = -EINVAL; |
| 1131 | 891 | ||
| 1132 | if (type >= IPPROTO_MAX) | 892 | if (type >= INET_DIAG_GETSOCK_MAX) |
| 1133 | goto out; | 893 | goto out; |
| 1134 | 894 | ||
| 1135 | mutex_lock(&inet_diag_table_mutex); | 895 | mutex_lock(&inet_diag_table_mutex); |
| @@ -1148,7 +908,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h) | |||
| 1148 | { | 908 | { |
| 1149 | const __u16 type = h->idiag_type; | 909 | const __u16 type = h->idiag_type; |
| 1150 | 910 | ||
| 1151 | if (type >= IPPROTO_MAX) | 911 | if (type >= INET_DIAG_GETSOCK_MAX) |
| 1152 | return; | 912 | return; |
| 1153 | 913 | ||
| 1154 | mutex_lock(&inet_diag_table_mutex); | 914 | mutex_lock(&inet_diag_table_mutex); |
| @@ -1159,7 +919,7 @@ EXPORT_SYMBOL_GPL(inet_diag_unregister); | |||
| 1159 | 919 | ||
| 1160 | static int __init inet_diag_init(void) | 920 | static int __init inet_diag_init(void) |
| 1161 | { | 921 | { |
| 1162 | const int inet_diag_table_size = (IPPROTO_MAX * | 922 | const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * |
| 1163 | sizeof(struct inet_diag_handler *)); | 923 | sizeof(struct inet_diag_handler *)); |
| 1164 | int err = -ENOMEM; | 924 | int err = -ENOMEM; |
| 1165 | 925 | ||
| @@ -1167,35 +927,25 @@ static int __init inet_diag_init(void) | |||
| 1167 | if (!inet_diag_table) | 927 | if (!inet_diag_table) |
| 1168 | goto out; | 928 | goto out; |
| 1169 | 929 | ||
| 1170 | err = sock_diag_register(&inet_diag_handler); | 930 | idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0, |
| 1171 | if (err) | 931 | inet_diag_rcv, NULL, THIS_MODULE); |
| 1172 | goto out_free_nl; | 932 | if (idiagnl == NULL) |
| 1173 | 933 | goto out_free_table; | |
| 1174 | err = sock_diag_register(&inet6_diag_handler); | 934 | err = 0; |
| 1175 | if (err) | ||
| 1176 | goto out_free_inet; | ||
| 1177 | |||
| 1178 | sock_diag_register_inet_compat(inet_diag_rcv_msg_compat); | ||
| 1179 | out: | 935 | out: |
| 1180 | return err; | 936 | return err; |
| 1181 | 937 | out_free_table: | |
| 1182 | out_free_inet: | ||
| 1183 | sock_diag_unregister(&inet_diag_handler); | ||
| 1184 | out_free_nl: | ||
| 1185 | kfree(inet_diag_table); | 938 | kfree(inet_diag_table); |
| 1186 | goto out; | 939 | goto out; |
| 1187 | } | 940 | } |
| 1188 | 941 | ||
| 1189 | static void __exit inet_diag_exit(void) | 942 | static void __exit inet_diag_exit(void) |
| 1190 | { | 943 | { |
| 1191 | sock_diag_unregister(&inet6_diag_handler); | 944 | netlink_kernel_release(idiagnl); |
| 1192 | sock_diag_unregister(&inet_diag_handler); | ||
| 1193 | sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat); | ||
| 1194 | kfree(inet_diag_table); | 945 | kfree(inet_diag_table); |
| 1195 | } | 946 | } |
| 1196 | 947 | ||
| 1197 | module_init(inet_diag_init); | 948 | module_init(inet_diag_init); |
| 1198 | module_exit(inet_diag_exit); | 949 | module_exit(inet_diag_exit); |
| 1199 | MODULE_LICENSE("GPL"); | 950 | MODULE_LICENSE("GPL"); |
| 1200 | MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); | 951 | MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_INET_DIAG); |
| 1201 | MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */); | ||
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 4750d2b74d7..5ff2a51b6d0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c | |||
| @@ -89,7 +89,7 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) | |||
| 89 | nf->low_thresh = 0; | 89 | nf->low_thresh = 0; |
| 90 | 90 | ||
| 91 | local_bh_disable(); | 91 | local_bh_disable(); |
| 92 | inet_frag_evictor(nf, f, true); | 92 | inet_frag_evictor(nf, f); |
| 93 | local_bh_enable(); | 93 | local_bh_enable(); |
| 94 | } | 94 | } |
| 95 | EXPORT_SYMBOL(inet_frags_exit_net); | 95 | EXPORT_SYMBOL(inet_frags_exit_net); |
| @@ -158,16 +158,11 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, | |||
| 158 | } | 158 | } |
| 159 | EXPORT_SYMBOL(inet_frag_destroy); | 159 | EXPORT_SYMBOL(inet_frag_destroy); |
| 160 | 160 | ||
| 161 | int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) | 161 | int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f) |
| 162 | { | 162 | { |
| 163 | struct inet_frag_queue *q; | 163 | struct inet_frag_queue *q; |
| 164 | int work, evicted = 0; | 164 | int work, evicted = 0; |
| 165 | 165 | ||
| 166 | if (!force) { | ||
| 167 | if (atomic_read(&nf->mem) <= nf->high_thresh) | ||
| 168 | return 0; | ||
| 169 | } | ||
| 170 | |||
| 171 | work = atomic_read(&nf->mem) - nf->low_thresh; | 166 | work = atomic_read(&nf->mem) - nf->low_thresh; |
| 172 | while (work > 0) { | 167 | while (work > 0) { |
| 173 | read_lock(&f->lock); | 168 | read_lock(&f->lock); |
| @@ -248,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, | |||
| 248 | if (q == NULL) | 243 | if (q == NULL) |
| 249 | return NULL; | 244 | return NULL; |
| 250 | 245 | ||
| 251 | q->net = nf; | ||
| 252 | f->constructor(q, arg); | 246 | f->constructor(q, arg); |
| 253 | atomic_add(f->qsize, &nf->mem); | 247 | atomic_add(f->qsize, &nf->mem); |
| 254 | setup_timer(&q->timer, f->frag_expire, (unsigned long)q); | 248 | setup_timer(&q->timer, f->frag_expire, (unsigned long)q); |
| 255 | spin_lock_init(&q->lock); | 249 | spin_lock_init(&q->lock); |
| 256 | atomic_set(&q->refcnt, 1); | 250 | atomic_set(&q->refcnt, 1); |
| 251 | q->net = nf; | ||
| 257 | 252 | ||
| 258 | return q; | 253 | return q; |
| 259 | } | 254 | } |
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fa3ae814871..984ec656b03 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
| @@ -217,7 +217,7 @@ begin: | |||
| 217 | } | 217 | } |
| 218 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); | 218 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); |
| 219 | 219 | ||
| 220 | struct sock *__inet_lookup_established(struct net *net, | 220 | struct sock * __inet_lookup_established(struct net *net, |
| 221 | struct inet_hashinfo *hashinfo, | 221 | struct inet_hashinfo *hashinfo, |
| 222 | const __be32 saddr, const __be16 sport, | 222 | const __be32 saddr, const __be16 sport, |
| 223 | const __be32 daddr, const u16 hnum, | 223 | const __be32 daddr, const u16 hnum, |
| @@ -237,14 +237,12 @@ struct sock *__inet_lookup_established(struct net *net, | |||
| 237 | rcu_read_lock(); | 237 | rcu_read_lock(); |
| 238 | begin: | 238 | begin: |
| 239 | sk_nulls_for_each_rcu(sk, node, &head->chain) { | 239 | sk_nulls_for_each_rcu(sk, node, &head->chain) { |
| 240 | if (sk->sk_hash != hash) | 240 | if (INET_MATCH(sk, net, hash, acookie, |
| 241 | continue; | 241 | saddr, daddr, ports, dif)) { |
| 242 | if (likely(INET_MATCH(sk, net, acookie, | ||
| 243 | saddr, daddr, ports, dif))) { | ||
| 244 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) | 242 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) |
| 245 | goto begintw; | 243 | goto begintw; |
| 246 | if (unlikely(!INET_MATCH(sk, net, acookie, | 244 | if (unlikely(!INET_MATCH(sk, net, hash, acookie, |
| 247 | saddr, daddr, ports, dif))) { | 245 | saddr, daddr, ports, dif))) { |
| 248 | sock_put(sk); | 246 | sock_put(sk); |
| 249 | goto begin; | 247 | goto begin; |
| 250 | } | 248 | } |
| @@ -262,18 +260,14 @@ begin: | |||
| 262 | begintw: | 260 | begintw: |
| 263 | /* Must check for a TIME_WAIT'er before going to listener hash. */ | 261 | /* Must check for a TIME_WAIT'er before going to listener hash. */ |
| 264 | sk_nulls_for_each_rcu(sk, node, &head->twchain) { | 262 | sk_nulls_for_each_rcu(sk, node, &head->twchain) { |
| 265 | if (sk->sk_hash != hash) | 263 | if (INET_TW_MATCH(sk, net, hash, acookie, |
| 266 | continue; | 264 | saddr, daddr, ports, dif)) { |
| 267 | if (likely(INET_TW_MATCH(sk, net, acookie, | ||
| 268 | saddr, daddr, ports, | ||
| 269 | dif))) { | ||
| 270 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { | 265 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { |
| 271 | sk = NULL; | 266 | sk = NULL; |
| 272 | goto out; | 267 | goto out; |
| 273 | } | 268 | } |
| 274 | if (unlikely(!INET_TW_MATCH(sk, net, acookie, | 269 | if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie, |
| 275 | saddr, daddr, ports, | 270 | saddr, daddr, ports, dif))) { |
| 276 | dif))) { | ||
| 277 | sock_put(sk); | 271 | sock_put(sk); |
| 278 | goto begintw; | 272 | goto begintw; |
| 279 | } | 273 | } |
| @@ -320,12 +314,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, | |||
| 320 | 314 | ||
| 321 | /* Check TIME-WAIT sockets first. */ | 315 | /* Check TIME-WAIT sockets first. */ |
| 322 | sk_nulls_for_each(sk2, node, &head->twchain) { | 316 | sk_nulls_for_each(sk2, node, &head->twchain) { |
| 323 | if (sk2->sk_hash != hash) | 317 | tw = inet_twsk(sk2); |
| 324 | continue; | ||
| 325 | 318 | ||
| 326 | if (likely(INET_TW_MATCH(sk2, net, acookie, | 319 | if (INET_TW_MATCH(sk2, net, hash, acookie, |
| 327 | saddr, daddr, ports, dif))) { | 320 | saddr, daddr, ports, dif)) { |
| 328 | tw = inet_twsk(sk2); | ||
| 329 | if (twsk_unique(sk, sk2, twp)) | 321 | if (twsk_unique(sk, sk2, twp)) |
| 330 | goto unique; | 322 | goto unique; |
| 331 | else | 323 | else |
| @@ -336,10 +328,8 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, | |||
| 336 | 328 | ||
| 337 | /* And established part... */ | 329 | /* And established part... */ |
| 338 | sk_nulls_for_each(sk2, node, &head->chain) { | 330 | sk_nulls_for_each(sk2, node, &head->chain) { |
| 339 | if (sk2->sk_hash != hash) | 331 | if (INET_MATCH(sk2, net, hash, acookie, |
| 340 | continue; | 332 | saddr, daddr, ports, dif)) |
| 341 | if (likely(INET_MATCH(sk2, net, acookie, | ||
| 342 | saddr, daddr, ports, dif))) | ||
| 343 | goto not_unique; | 333 | goto not_unique; |
| 344 | } | 334 | } |
| 345 | 335 | ||
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index cc280a3f4f9..ef7ae6049a5 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c | |||
| @@ -244,11 +244,11 @@ static void lro_add_frags(struct net_lro_desc *lro_desc, | |||
| 244 | skb->truesize += truesize; | 244 | skb->truesize += truesize; |
| 245 | 245 | ||
| 246 | skb_frags[0].page_offset += hlen; | 246 | skb_frags[0].page_offset += hlen; |
| 247 | skb_frag_size_sub(&skb_frags[0], hlen); | 247 | skb_frags[0].size -= hlen; |
| 248 | 248 | ||
| 249 | while (tcp_data_len > 0) { | 249 | while (tcp_data_len > 0) { |
| 250 | *(lro_desc->next_frag) = *skb_frags; | 250 | *(lro_desc->next_frag) = *skb_frags; |
| 251 | tcp_data_len -= skb_frag_size(skb_frags); | 251 | tcp_data_len -= skb_frags->size; |
| 252 | lro_desc->next_frag++; | 252 | lro_desc->next_frag++; |
| 253 | skb_frags++; | 253 | skb_frags++; |
| 254 | skb_shinfo(skb)->nr_frags++; | 254 | skb_shinfo(skb)->nr_frags++; |
| @@ -400,14 +400,14 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, | |||
| 400 | skb_frags = skb_shinfo(skb)->frags; | 400 | skb_frags = skb_shinfo(skb)->frags; |
| 401 | while (data_len > 0) { | 401 | while (data_len > 0) { |
| 402 | *skb_frags = *frags; | 402 | *skb_frags = *frags; |
| 403 | data_len -= skb_frag_size(frags); | 403 | data_len -= frags->size; |
| 404 | skb_frags++; | 404 | skb_frags++; |
| 405 | frags++; | 405 | frags++; |
| 406 | skb_shinfo(skb)->nr_frags++; | 406 | skb_shinfo(skb)->nr_frags++; |
| 407 | } | 407 | } |
| 408 | 408 | ||
| 409 | skb_shinfo(skb)->frags[0].page_offset += hdr_len; | 409 | skb_shinfo(skb)->frags[0].page_offset += hdr_len; |
| 410 | skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); | 410 | skb_shinfo(skb)->frags[0].size -= hdr_len; |
| 411 | 411 | ||
| 412 | skb->ip_summed = ip_summed; | 412 | skb->ip_summed = ip_summed; |
| 413 | skb->csum = sum; | 413 | skb->csum = sum; |
| @@ -433,7 +433,7 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, | |||
| 433 | if (!lro_mgr->get_frag_header || | 433 | if (!lro_mgr->get_frag_header || |
| 434 | lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, | 434 | lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, |
| 435 | (void *)&tcph, &flags, priv)) { | 435 | (void *)&tcph, &flags, priv)) { |
| 436 | mac_hdr = skb_frag_address(frags); | 436 | mac_hdr = page_address(frags->page) + frags->page_offset; |
| 437 | goto out1; | 437 | goto out1; |
| 438 | } | 438 | } |
| 439 | 439 | ||
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2784db3155f..3c8dfa16614 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c | |||
| @@ -11,7 +11,6 @@ | |||
| 11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
| 12 | #include <linux/kmemcheck.h> | 12 | #include <linux/kmemcheck.h> |
| 13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
| 14 | #include <linux/module.h> | ||
| 15 | #include <net/inet_hashtables.h> | 14 | #include <net/inet_hashtables.h> |
| 16 | #include <net/inet_timewait_sock.h> | 15 | #include <net/inet_timewait_sock.h> |
| 17 | #include <net/ip.h> | 16 | #include <net/ip.h> |
| @@ -89,8 +88,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, | |||
| 89 | 88 | ||
| 90 | #ifdef SOCK_REFCNT_DEBUG | 89 | #ifdef SOCK_REFCNT_DEBUG |
| 91 | if (atomic_read(&tw->tw_refcnt) != 1) { | 90 | if (atomic_read(&tw->tw_refcnt) != 1) { |
| 92 | pr_debug("%s timewait_sock %p refcnt=%d\n", | 91 | printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", |
| 93 | tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); | 92 | tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); |
| 94 | } | 93 | } |
| 95 | #endif | 94 | #endif |
| 96 | while (refcnt) { | 95 | while (refcnt) { |
| @@ -184,7 +183,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat | |||
| 184 | tw->tw_daddr = inet->inet_daddr; | 183 | tw->tw_daddr = inet->inet_daddr; |
| 185 | tw->tw_rcv_saddr = inet->inet_rcv_saddr; | 184 | tw->tw_rcv_saddr = inet->inet_rcv_saddr; |
| 186 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; | 185 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; |
| 187 | tw->tw_tos = inet->tos; | ||
| 188 | tw->tw_num = inet->inet_num; | 186 | tw->tw_num = inet->inet_num; |
| 189 | tw->tw_state = TCP_TIME_WAIT; | 187 | tw->tw_state = TCP_TIME_WAIT; |
| 190 | tw->tw_substate = state; | 188 | tw->tw_substate = state; |
| @@ -263,7 +261,7 @@ rescan: | |||
| 263 | void inet_twdr_hangman(unsigned long data) | 261 | void inet_twdr_hangman(unsigned long data) |
| 264 | { | 262 | { |
| 265 | struct inet_timewait_death_row *twdr; | 263 | struct inet_timewait_death_row *twdr; |
| 266 | unsigned int need_timer; | 264 | int unsigned need_timer; |
| 267 | 265 | ||
| 268 | twdr = (struct inet_timewait_death_row *)data; | 266 | twdr = (struct inet_timewait_death_row *)data; |
| 269 | spin_lock(&twdr->death_lock); | 267 | spin_lock(&twdr->death_lock); |
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 000e3d239d6..86f13c67ea8 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
| @@ -17,7 +17,6 @@ | |||
| 17 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
| 18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
| 19 | #include <linux/net.h> | 19 | #include <linux/net.h> |
| 20 | #include <linux/workqueue.h> | ||
| 21 | #include <net/ip.h> | 20 | #include <net/ip.h> |
| 22 | #include <net/inetpeer.h> | 21 | #include <net/inetpeer.h> |
| 23 | #include <net/secure_seq.h> | 22 | #include <net/secure_seq.h> |
| @@ -67,11 +66,6 @@ | |||
| 67 | 66 | ||
| 68 | static struct kmem_cache *peer_cachep __read_mostly; | 67 | static struct kmem_cache *peer_cachep __read_mostly; |
| 69 | 68 | ||
| 70 | static LIST_HEAD(gc_list); | ||
| 71 | static const int gc_delay = 60 * HZ; | ||
| 72 | static struct delayed_work gc_work; | ||
| 73 | static DEFINE_SPINLOCK(gc_lock); | ||
| 74 | |||
| 75 | #define node_height(x) x->avl_height | 69 | #define node_height(x) x->avl_height |
| 76 | 70 | ||
| 77 | #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) | 71 | #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) |
| @@ -82,39 +76,23 @@ static const struct inet_peer peer_fake_node = { | |||
| 82 | .avl_height = 0 | 76 | .avl_height = 0 |
| 83 | }; | 77 | }; |
| 84 | 78 | ||
| 85 | void inet_peer_base_init(struct inet_peer_base *bp) | 79 | struct inet_peer_base { |
| 86 | { | 80 | struct inet_peer __rcu *root; |
| 87 | bp->root = peer_avl_empty_rcu; | 81 | seqlock_t lock; |
| 88 | seqlock_init(&bp->lock); | 82 | int total; |
| 89 | bp->flush_seq = ~0U; | 83 | }; |
| 90 | bp->total = 0; | ||
| 91 | } | ||
| 92 | EXPORT_SYMBOL_GPL(inet_peer_base_init); | ||
| 93 | |||
| 94 | static atomic_t v4_seq = ATOMIC_INIT(0); | ||
| 95 | static atomic_t v6_seq = ATOMIC_INIT(0); | ||
| 96 | |||
| 97 | static atomic_t *inetpeer_seq_ptr(int family) | ||
| 98 | { | ||
| 99 | return (family == AF_INET ? &v4_seq : &v6_seq); | ||
| 100 | } | ||
| 101 | |||
| 102 | static inline void flush_check(struct inet_peer_base *base, int family) | ||
| 103 | { | ||
| 104 | atomic_t *fp = inetpeer_seq_ptr(family); | ||
| 105 | |||
| 106 | if (unlikely(base->flush_seq != atomic_read(fp))) { | ||
| 107 | inetpeer_invalidate_tree(base); | ||
| 108 | base->flush_seq = atomic_read(fp); | ||
| 109 | } | ||
| 110 | } | ||
| 111 | 84 | ||
| 112 | void inetpeer_invalidate_family(int family) | 85 | static struct inet_peer_base v4_peers = { |
| 113 | { | 86 | .root = peer_avl_empty_rcu, |
| 114 | atomic_t *fp = inetpeer_seq_ptr(family); | 87 | .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), |
| 88 | .total = 0, | ||
| 89 | }; | ||
| 115 | 90 | ||
| 116 | atomic_inc(fp); | 91 | static struct inet_peer_base v6_peers = { |
| 117 | } | 92 | .root = peer_avl_empty_rcu, |
| 93 | .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), | ||
| 94 | .total = 0, | ||
| 95 | }; | ||
| 118 | 96 | ||
| 119 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ | 97 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ |
| 120 | 98 | ||
| @@ -124,52 +102,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m | |||
| 124 | int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ | 102 | int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ |
| 125 | int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ | 103 | int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ |
| 126 | 104 | ||
| 127 | static void inetpeer_gc_worker(struct work_struct *work) | ||
| 128 | { | ||
| 129 | struct inet_peer *p, *n, *c; | ||
| 130 | LIST_HEAD(list); | ||
| 131 | |||
| 132 | spin_lock_bh(&gc_lock); | ||
| 133 | list_replace_init(&gc_list, &list); | ||
| 134 | spin_unlock_bh(&gc_lock); | ||
| 135 | |||
| 136 | if (list_empty(&list)) | ||
| 137 | return; | ||
| 138 | |||
| 139 | list_for_each_entry_safe(p, n, &list, gc_list) { | ||
| 140 | |||
| 141 | if (need_resched()) | ||
| 142 | cond_resched(); | ||
| 143 | |||
| 144 | c = rcu_dereference_protected(p->avl_left, 1); | ||
| 145 | if (c != peer_avl_empty) { | ||
| 146 | list_add_tail(&c->gc_list, &list); | ||
| 147 | p->avl_left = peer_avl_empty_rcu; | ||
| 148 | } | ||
| 149 | |||
| 150 | c = rcu_dereference_protected(p->avl_right, 1); | ||
| 151 | if (c != peer_avl_empty) { | ||
| 152 | list_add_tail(&c->gc_list, &list); | ||
| 153 | p->avl_right = peer_avl_empty_rcu; | ||
| 154 | } | ||
| 155 | |||
| 156 | n = list_entry(p->gc_list.next, struct inet_peer, gc_list); | ||
| 157 | |||
| 158 | if (!atomic_read(&p->refcnt)) { | ||
| 159 | list_del(&p->gc_list); | ||
| 160 | kmem_cache_free(peer_cachep, p); | ||
| 161 | } | ||
| 162 | } | ||
| 163 | |||
| 164 | if (list_empty(&list)) | ||
| 165 | return; | ||
| 166 | |||
| 167 | spin_lock_bh(&gc_lock); | ||
| 168 | list_splice(&list, &gc_list); | ||
| 169 | spin_unlock_bh(&gc_lock); | ||
| 170 | |||
| 171 | schedule_delayed_work(&gc_work, gc_delay); | ||
| 172 | } | ||
| 173 | 105 | ||
| 174 | /* Called from ip_output.c:ip_init */ | 106 | /* Called from ip_output.c:ip_init */ |
| 175 | void __init inet_initpeers(void) | 107 | void __init inet_initpeers(void) |
| @@ -194,7 +126,6 @@ void __init inet_initpeers(void) | |||
| 194 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, | 126 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, |
| 195 | NULL); | 127 | NULL); |
| 196 | 128 | ||
| 197 | INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); | ||
| 198 | } | 129 | } |
| 199 | 130 | ||
| 200 | static int addr_compare(const struct inetpeer_addr *a, | 131 | static int addr_compare(const struct inetpeer_addr *a, |
| @@ -205,7 +136,7 @@ static int addr_compare(const struct inetpeer_addr *a, | |||
| 205 | for (i = 0; i < n; i++) { | 136 | for (i = 0; i < n; i++) { |
| 206 | if (a->addr.a6[i] == b->addr.a6[i]) | 137 | if (a->addr.a6[i] == b->addr.a6[i]) |
| 207 | continue; | 138 | continue; |
| 208 | if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) | 139 | if (a->addr.a6[i] < b->addr.a6[i]) |
| 209 | return -1; | 140 | return -1; |
| 210 | return 1; | 141 | return 1; |
| 211 | } | 142 | } |
| @@ -419,6 +350,11 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, | |||
| 419 | call_rcu(&p->rcu, inetpeer_free_rcu); | 350 | call_rcu(&p->rcu, inetpeer_free_rcu); |
| 420 | } | 351 | } |
| 421 | 352 | ||
| 353 | static struct inet_peer_base *family_to_base(int family) | ||
| 354 | { | ||
| 355 | return family == AF_INET ? &v4_peers : &v6_peers; | ||
| 356 | } | ||
| 357 | |||
| 422 | /* perform garbage collect on all items stacked during a lookup */ | 358 | /* perform garbage collect on all items stacked during a lookup */ |
| 423 | static int inet_peer_gc(struct inet_peer_base *base, | 359 | static int inet_peer_gc(struct inet_peer_base *base, |
| 424 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], | 360 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], |
| @@ -456,17 +392,14 @@ static int inet_peer_gc(struct inet_peer_base *base, | |||
| 456 | return cnt; | 392 | return cnt; |
| 457 | } | 393 | } |
| 458 | 394 | ||
| 459 | struct inet_peer *inet_getpeer(struct inet_peer_base *base, | 395 | struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create) |
| 460 | const struct inetpeer_addr *daddr, | ||
| 461 | int create) | ||
| 462 | { | 396 | { |
| 463 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; | 397 | struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; |
| 398 | struct inet_peer_base *base = family_to_base(daddr->family); | ||
| 464 | struct inet_peer *p; | 399 | struct inet_peer *p; |
| 465 | unsigned int sequence; | 400 | unsigned int sequence; |
| 466 | int invalidated, gccnt = 0; | 401 | int invalidated, gccnt = 0; |
| 467 | 402 | ||
| 468 | flush_check(base, daddr->family); | ||
| 469 | |||
| 470 | /* Attempt a lockless lookup first. | 403 | /* Attempt a lockless lookup first. |
| 471 | * Because of a concurrent writer, we might not find an existing entry. | 404 | * Because of a concurrent writer, we might not find an existing entry. |
| 472 | */ | 405 | */ |
| @@ -508,13 +441,14 @@ relookup: | |||
| 508 | (daddr->family == AF_INET) ? | 441 | (daddr->family == AF_INET) ? |
| 509 | secure_ip_id(daddr->addr.a4) : | 442 | secure_ip_id(daddr->addr.a4) : |
| 510 | secure_ipv6_id(daddr->addr.a6)); | 443 | secure_ipv6_id(daddr->addr.a6)); |
| 444 | p->tcp_ts_stamp = 0; | ||
| 511 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; | 445 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; |
| 512 | p->rate_tokens = 0; | 446 | p->rate_tokens = 0; |
| 513 | /* 60*HZ is arbitrary, but chosen enough high so that the first | 447 | p->rate_last = 0; |
| 514 | * calculation of tokens is at its maximum. | 448 | p->pmtu_expires = 0; |
| 515 | */ | 449 | p->pmtu_orig = 0; |
| 516 | p->rate_last = jiffies - 60*HZ; | 450 | memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); |
| 517 | INIT_LIST_HEAD(&p->gc_list); | 451 | |
| 518 | 452 | ||
| 519 | /* Link the node. */ | 453 | /* Link the node. */ |
| 520 | link_to_pool(p, base); | 454 | link_to_pool(p, base); |
| @@ -574,31 +508,3 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) | |||
| 574 | return rc; | 508 | return rc; |
| 575 | } | 509 | } |
| 576 | EXPORT_SYMBOL(inet_peer_xrlim_allow); | 510 | EXPORT_SYMBOL(inet_peer_xrlim_allow); |
| 577 | |||
| 578 | static void inetpeer_inval_rcu(struct rcu_head *head) | ||
| 579 | { | ||
| 580 | struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); | ||
| 581 | |||
| 582 | spin_lock_bh(&gc_lock); | ||
| 583 | list_add_tail(&p->gc_list, &gc_list); | ||
| 584 | spin_unlock_bh(&gc_lock); | ||
| 585 | |||
| 586 | schedule_delayed_work(&gc_work, gc_delay); | ||
| 587 | } | ||
| 588 | |||
| 589 | void inetpeer_invalidate_tree(struct inet_peer_base *base) | ||
| 590 | { | ||
| 591 | struct inet_peer *root; | ||
| 592 | |||
| 593 | write_seqlock_bh(&base->lock); | ||
| 594 | |||
| 595 | root = rcu_deref_locked(base->root, base); | ||
| 596 | if (root != peer_avl_empty) { | ||
| 597 | base->root = peer_avl_empty_rcu; | ||
| 598 | base->total = 0; | ||
| 599 | call_rcu(&root->gc_rcu, inetpeer_inval_rcu); | ||
| 600 | } | ||
| 601 | |||
| 602 | write_sequnlock_bh(&base->lock); | ||
| 603 | } | ||
| 604 | EXPORT_SYMBOL(inetpeer_invalidate_tree); | ||
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 694de3b7aeb..3b34d1c8627 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c | |||
| @@ -41,10 +41,9 @@ | |||
| 41 | 41 | ||
| 42 | static int ip_forward_finish(struct sk_buff *skb) | 42 | static int ip_forward_finish(struct sk_buff *skb) |
| 43 | { | 43 | { |
| 44 | struct ip_options *opt = &(IPCB(skb)->opt); | 44 | struct ip_options * opt = &(IPCB(skb)->opt); |
| 45 | 45 | ||
| 46 | IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); | 46 | IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); |
| 47 | IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); | ||
| 48 | 47 | ||
| 49 | if (unlikely(opt->optlen)) | 48 | if (unlikely(opt->optlen)) |
| 50 | ip_forward_options(skb); | 49 | ip_forward_options(skb); |
| @@ -56,7 +55,7 @@ int ip_forward(struct sk_buff *skb) | |||
| 56 | { | 55 | { |
| 57 | struct iphdr *iph; /* Our header */ | 56 | struct iphdr *iph; /* Our header */ |
| 58 | struct rtable *rt; /* Route we use */ | 57 | struct rtable *rt; /* Route we use */ |
| 59 | struct ip_options *opt = &(IPCB(skb)->opt); | 58 | struct ip_options * opt = &(IPCB(skb)->opt); |
| 60 | 59 | ||
| 61 | if (skb_warn_if_lro(skb)) | 60 | if (skb_warn_if_lro(skb)) |
| 62 | goto drop; | 61 | goto drop; |
| @@ -85,7 +84,7 @@ int ip_forward(struct sk_buff *skb) | |||
| 85 | 84 | ||
| 86 | rt = skb_rtable(skb); | 85 | rt = skb_rtable(skb); |
| 87 | 86 | ||
| 88 | if (opt->is_strictroute && rt->rt_uses_gateway) | 87 | if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway) |
| 89 | goto sr_failed; | 88 | goto sr_failed; |
| 90 | 89 | ||
| 91 | if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && | 90 | if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && |
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index eb9d63a570c..0e0ab98abc6 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c | |||
| @@ -20,8 +20,6 @@ | |||
| 20 | * Patrick McHardy : LRU queue of frag heads for evictor. | 20 | * Patrick McHardy : LRU queue of frag heads for evictor. |
| 21 | */ | 21 | */ |
| 22 | 22 | ||
| 23 | #define pr_fmt(fmt) "IPv4: " fmt | ||
| 24 | |||
| 25 | #include <linux/compiler.h> | 23 | #include <linux/compiler.h> |
| 26 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 27 | #include <linux/types.h> | 25 | #include <linux/types.h> |
| @@ -148,17 +146,17 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q) | |||
| 148 | return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); | 146 | return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); |
| 149 | } | 147 | } |
| 150 | 148 | ||
| 151 | static bool ip4_frag_match(struct inet_frag_queue *q, void *a) | 149 | static int ip4_frag_match(struct inet_frag_queue *q, void *a) |
| 152 | { | 150 | { |
| 153 | struct ipq *qp; | 151 | struct ipq *qp; |
| 154 | struct ip4_create_arg *arg = a; | 152 | struct ip4_create_arg *arg = a; |
| 155 | 153 | ||
| 156 | qp = container_of(q, struct ipq, q); | 154 | qp = container_of(q, struct ipq, q); |
| 157 | return qp->id == arg->iph->id && | 155 | return qp->id == arg->iph->id && |
| 158 | qp->saddr == arg->iph->saddr && | 156 | qp->saddr == arg->iph->saddr && |
| 159 | qp->daddr == arg->iph->daddr && | 157 | qp->daddr == arg->iph->daddr && |
| 160 | qp->protocol == arg->iph->protocol && | 158 | qp->protocol == arg->iph->protocol && |
| 161 | qp->user == arg->user; | 159 | qp->user == arg->user; |
| 162 | } | 160 | } |
| 163 | 161 | ||
| 164 | /* Memory Tracking Functions. */ | 162 | /* Memory Tracking Functions. */ |
| @@ -171,10 +169,6 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) | |||
| 171 | static void ip4_frag_init(struct inet_frag_queue *q, void *a) | 169 | static void ip4_frag_init(struct inet_frag_queue *q, void *a) |
| 172 | { | 170 | { |
| 173 | struct ipq *qp = container_of(q, struct ipq, q); | 171 | struct ipq *qp = container_of(q, struct ipq, q); |
| 174 | struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, | ||
| 175 | frags); | ||
| 176 | struct net *net = container_of(ipv4, struct net, ipv4); | ||
| 177 | |||
| 178 | struct ip4_create_arg *arg = a; | 172 | struct ip4_create_arg *arg = a; |
| 179 | 173 | ||
| 180 | qp->protocol = arg->iph->protocol; | 174 | qp->protocol = arg->iph->protocol; |
| @@ -184,7 +178,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) | |||
| 184 | qp->daddr = arg->iph->daddr; | 178 | qp->daddr = arg->iph->daddr; |
| 185 | qp->user = arg->user; | 179 | qp->user = arg->user; |
| 186 | qp->peer = sysctl_ipfrag_max_dist ? | 180 | qp->peer = sysctl_ipfrag_max_dist ? |
| 187 | inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; | 181 | inet_getpeer_v4(arg->iph->saddr, 1) : NULL; |
| 188 | } | 182 | } |
| 189 | 183 | ||
| 190 | static __inline__ void ip4_frag_free(struct inet_frag_queue *q) | 184 | static __inline__ void ip4_frag_free(struct inet_frag_queue *q) |
| @@ -219,7 +213,7 @@ static void ip_evictor(struct net *net) | |||
| 219 | { | 213 | { |
| 220 | int evicted; | 214 | int evicted; |
| 221 | 215 | ||
| 222 | evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); | 216 | evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); |
| 223 | if (evicted) | 217 | if (evicted) |
| 224 | IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); | 218 | IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); |
| 225 | } | 219 | } |
| @@ -305,7 +299,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) | |||
| 305 | return container_of(q, struct ipq, q); | 299 | return container_of(q, struct ipq, q); |
| 306 | 300 | ||
| 307 | out_nomem: | 301 | out_nomem: |
| 308 | LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n")); | 302 | LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); |
| 309 | return NULL; | 303 | return NULL; |
| 310 | } | 304 | } |
| 311 | 305 | ||
| @@ -398,7 +392,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) | |||
| 398 | /* Is this the final fragment? */ | 392 | /* Is this the final fragment? */ |
| 399 | if ((flags & IP_MF) == 0) { | 393 | if ((flags & IP_MF) == 0) { |
| 400 | /* If we already have some bits beyond end | 394 | /* If we already have some bits beyond end |
| 401 | * or have different end, the segment is corrupted. | 395 | * or have different end, the segment is corrrupted. |
| 402 | */ | 396 | */ |
| 403 | if (end < qp->q.len || | 397 | if (end < qp->q.len || |
| 404 | ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) | 398 | ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) |
| @@ -523,10 +517,6 @@ found: | |||
| 523 | if (offset == 0) | 517 | if (offset == 0) |
| 524 | qp->q.last_in |= INET_FRAG_FIRST_IN; | 518 | qp->q.last_in |= INET_FRAG_FIRST_IN; |
| 525 | 519 | ||
| 526 | if (ip_hdr(skb)->frag_off & htons(IP_DF) && | ||
| 527 | skb->len + ihl > qp->q.max_size) | ||
| 528 | qp->q.max_size = skb->len + ihl; | ||
| 529 | |||
| 530 | if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && | 520 | if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && |
| 531 | qp->q.meat == qp->q.len) | 521 | qp->q.meat == qp->q.len) |
| 532 | return ip_frag_reasm(qp, prev, dev); | 522 | return ip_frag_reasm(qp, prev, dev); |
| @@ -553,7 +543,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, | |||
| 553 | int len; | 543 | int len; |
| 554 | int ihlen; | 544 | int ihlen; |
| 555 | int err; | 545 | int err; |
| 556 | int sum_truesize; | ||
| 557 | u8 ecn; | 546 | u8 ecn; |
| 558 | 547 | ||
| 559 | ipq_kill(qp); | 548 | ipq_kill(qp); |
| @@ -578,7 +567,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, | |||
| 578 | skb_morph(head, qp->q.fragments); | 567 | skb_morph(head, qp->q.fragments); |
| 579 | head->next = qp->q.fragments->next; | 568 | head->next = qp->q.fragments->next; |
| 580 | 569 | ||
| 581 | consume_skb(qp->q.fragments); | 570 | kfree_skb(qp->q.fragments); |
| 582 | qp->q.fragments = head; | 571 | qp->q.fragments = head; |
| 583 | } | 572 | } |
| 584 | 573 | ||
| @@ -610,8 +599,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, | |||
| 610 | head->next = clone; | 599 | head->next = clone; |
| 611 | skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; | 600 | skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; |
| 612 | skb_frag_list_init(head); | 601 | skb_frag_list_init(head); |
| 613 | for (i = 0; i < skb_shinfo(head)->nr_frags; i++) | 602 | for (i=0; i<skb_shinfo(head)->nr_frags; i++) |
| 614 | plen += skb_frag_size(&skb_shinfo(head)->frags[i]); | 603 | plen += skb_shinfo(head)->frags[i].size; |
| 615 | clone->len = clone->data_len = head->data_len - plen; | 604 | clone->len = clone->data_len = head->data_len - plen; |
| 616 | head->data_len -= clone->len; | 605 | head->data_len -= clone->len; |
| 617 | head->len -= clone->len; | 606 | head->len -= clone->len; |
| @@ -620,41 +609,26 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, | |||
| 620 | atomic_add(clone->truesize, &qp->q.net->mem); | 609 | atomic_add(clone->truesize, &qp->q.net->mem); |
| 621 | } | 610 | } |
| 622 | 611 | ||
| 612 | skb_shinfo(head)->frag_list = head->next; | ||
| 623 | skb_push(head, head->data - skb_network_header(head)); | 613 | skb_push(head, head->data - skb_network_header(head)); |
| 624 | 614 | ||
| 625 | sum_truesize = head->truesize; | 615 | for (fp=head->next; fp; fp = fp->next) { |
| 626 | for (fp = head->next; fp;) { | 616 | head->data_len += fp->len; |
| 627 | bool headstolen; | 617 | head->len += fp->len; |
| 628 | int delta; | ||
| 629 | struct sk_buff *next = fp->next; | ||
| 630 | |||
| 631 | sum_truesize += fp->truesize; | ||
| 632 | if (head->ip_summed != fp->ip_summed) | 618 | if (head->ip_summed != fp->ip_summed) |
| 633 | head->ip_summed = CHECKSUM_NONE; | 619 | head->ip_summed = CHECKSUM_NONE; |
| 634 | else if (head->ip_summed == CHECKSUM_COMPLETE) | 620 | else if (head->ip_summed == CHECKSUM_COMPLETE) |
| 635 | head->csum = csum_add(head->csum, fp->csum); | 621 | head->csum = csum_add(head->csum, fp->csum); |
| 636 | 622 | head->truesize += fp->truesize; | |
| 637 | if (skb_try_coalesce(head, fp, &headstolen, &delta)) { | ||
| 638 | kfree_skb_partial(fp, headstolen); | ||
| 639 | } else { | ||
| 640 | if (!skb_shinfo(head)->frag_list) | ||
| 641 | skb_shinfo(head)->frag_list = fp; | ||
| 642 | head->data_len += fp->len; | ||
| 643 | head->len += fp->len; | ||
| 644 | head->truesize += fp->truesize; | ||
| 645 | } | ||
| 646 | fp = next; | ||
| 647 | } | 623 | } |
| 648 | atomic_sub(sum_truesize, &qp->q.net->mem); | 624 | atomic_sub(head->truesize, &qp->q.net->mem); |
| 649 | 625 | ||
| 650 | head->next = NULL; | 626 | head->next = NULL; |
| 651 | head->dev = dev; | 627 | head->dev = dev; |
| 652 | head->tstamp = qp->q.stamp; | 628 | head->tstamp = qp->q.stamp; |
| 653 | IPCB(head)->frag_max_size = qp->q.max_size; | ||
| 654 | 629 | ||
| 655 | iph = ip_hdr(head); | 630 | iph = ip_hdr(head); |
| 656 | /* max_size != 0 implies at least one fragment had IP_DF set */ | 631 | iph->frag_off = 0; |
| 657 | iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; | ||
| 658 | iph->tot_len = htons(len); | 632 | iph->tot_len = htons(len); |
| 659 | iph->tos |= ecn; | 633 | iph->tos |= ecn; |
| 660 | IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); | 634 | IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); |
| @@ -663,12 +637,14 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, | |||
| 663 | return 0; | 637 | return 0; |
| 664 | 638 | ||
| 665 | out_nomem: | 639 | out_nomem: |
| 666 | LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), | 640 | LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " |
| 667 | qp); | 641 | "queue %p\n", qp); |
| 668 | err = -ENOMEM; | 642 | err = -ENOMEM; |
| 669 | goto out_fail; | 643 | goto out_fail; |
| 670 | out_oversize: | 644 | out_oversize: |
| 671 | net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); | 645 | if (net_ratelimit()) |
| 646 | printk(KERN_INFO "Oversized IP packet from %pI4.\n", | ||
| 647 | &qp->saddr); | ||
| 672 | out_fail: | 648 | out_fail: |
| 673 | IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); | 649 | IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); |
| 674 | return err; | 650 | return err; |
| @@ -684,7 +660,8 @@ int ip_defrag(struct sk_buff *skb, u32 user) | |||
| 684 | IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); | 660 | IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); |
| 685 | 661 | ||
| 686 | /* Start by cleaning up the memory. */ | 662 | /* Start by cleaning up the memory. */ |
| 687 | ip_evictor(net); | 663 | if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) |
| 664 | ip_evictor(net); | ||
| 688 | 665 | ||
| 689 | /* Lookup (or create) queue header */ | 666 | /* Lookup (or create) queue header */ |
| 690 | if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { | 667 | if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { |
| @@ -705,41 +682,6 @@ int ip_defrag(struct sk_buff *skb, u32 user) | |||
| 705 | } | 682 | } |
| 706 | EXPORT_SYMBOL(ip_defrag); | 683 | EXPORT_SYMBOL(ip_defrag); |
| 707 | 684 | ||
| 708 | struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) | ||
| 709 | { | ||
| 710 | struct iphdr iph; | ||
| 711 | u32 len; | ||
| 712 | |||
| 713 | if (skb->protocol != htons(ETH_P_IP)) | ||
| 714 | return skb; | ||
| 715 | |||
| 716 | if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) | ||
| 717 | return skb; | ||
| 718 | |||
| 719 | if (iph.ihl < 5 || iph.version != 4) | ||
| 720 | return skb; | ||
| 721 | |||
| 722 | len = ntohs(iph.tot_len); | ||
| 723 | if (skb->len < len || len < (iph.ihl * 4)) | ||
| 724 | return skb; | ||
| 725 | |||
| 726 | if (ip_is_fragment(&iph)) { | ||
| 727 | skb = skb_share_check(skb, GFP_ATOMIC); | ||
| 728 | if (skb) { | ||
| 729 | if (!pskb_may_pull(skb, iph.ihl*4)) | ||
| 730 | return skb; | ||
| 731 | if (pskb_trim_rcsum(skb, len)) | ||
| 732 | return skb; | ||
| 733 | memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); | ||
| 734 | if (ip_defrag(skb, user)) | ||
| 735 | return NULL; | ||
| 736 | skb->rxhash = 0; | ||
| 737 | } | ||
| 738 | } | ||
| 739 | return skb; | ||
| 740 | } | ||
| 741 | EXPORT_SYMBOL(ip_check_defrag); | ||
| 742 | |||
| 743 | #ifdef CONFIG_SYSCTL | 685 | #ifdef CONFIG_SYSCTL |
| 744 | static int zero; | 686 | static int zero; |
| 745 | 687 | ||
| @@ -801,13 +743,9 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) | |||
| 801 | table[0].data = &net->ipv4.frags.high_thresh; | 743 | table[0].data = &net->ipv4.frags.high_thresh; |
| 802 | table[1].data = &net->ipv4.frags.low_thresh; | 744 | table[1].data = &net->ipv4.frags.low_thresh; |
| 803 | table[2].data = &net->ipv4.frags.timeout; | 745 | table[2].data = &net->ipv4.frags.timeout; |
| 804 | |||
| 805 | /* Don't export sysctls to unprivileged users */ | ||
| 806 | if (net->user_ns != &init_user_ns) | ||
| 807 | table[0].procname = NULL; | ||
| 808 | } | 746 | } |
| 809 | 747 | ||
| 810 | hdr = register_net_sysctl(net, "net/ipv4", table); | 748 | hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); |
| 811 | if (hdr == NULL) | 749 | if (hdr == NULL) |
| 812 | goto err_reg; | 750 | goto err_reg; |
| 813 | 751 | ||
| @@ -832,7 +770,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) | |||
| 832 | 770 | ||
| 833 | static void ip4_frags_ctl_register(void) | 771 | static void ip4_frags_ctl_register(void) |
| 834 | { | 772 | { |
| 835 | register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); | 773 | register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); |
| 836 | } | 774 | } |
| 837 | #else | 775 | #else |
| 838 | static inline int ip4_frags_ns_ctl_register(struct net *net) | 776 | static inline int ip4_frags_ns_ctl_register(struct net *net) |
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 303012adf9e..d7bb94c4834 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
| @@ -10,8 +10,6 @@ | |||
| 10 | * | 10 | * |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 14 | |||
| 15 | #include <linux/capability.h> | 13 | #include <linux/capability.h> |
| 16 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 17 | #include <linux/types.h> | 15 | #include <linux/types.h> |
| @@ -48,7 +46,7 @@ | |||
| 48 | #include <net/rtnetlink.h> | 46 | #include <net/rtnetlink.h> |
| 49 | #include <net/gre.h> | 47 | #include <net/gre.h> |
| 50 | 48 | ||
| 51 | #if IS_ENABLED(CONFIG_IPV6) | 49 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 52 | #include <net/ipv6.h> | 50 | #include <net/ipv6.h> |
| 53 | #include <net/ip6_fib.h> | 51 | #include <net/ip6_fib.h> |
| 54 | #include <net/ip6_route.h> | 52 | #include <net/ip6_route.h> |
| @@ -67,7 +65,7 @@ | |||
| 67 | it is infeasible task. The most general solutions would be | 65 | it is infeasible task. The most general solutions would be |
| 68 | to keep skb->encapsulation counter (sort of local ttl), | 66 | to keep skb->encapsulation counter (sort of local ttl), |
| 69 | and silently drop packet when it expires. It is a good | 67 | and silently drop packet when it expires. It is a good |
| 70 | solution, but it supposes maintaining new variable in ALL | 68 | solution, but it supposes maintaing new variable in ALL |
| 71 | skb, even if no tunneling is used. | 69 | skb, even if no tunneling is used. |
| 72 | 70 | ||
| 73 | Current solution: xmit_recursion breaks dead loops. This is a percpu | 71 | Current solution: xmit_recursion breaks dead loops. This is a percpu |
| @@ -93,14 +91,14 @@ | |||
| 93 | 91 | ||
| 94 | One of them is to parse packet trying to detect inner encapsulation | 92 | One of them is to parse packet trying to detect inner encapsulation |
| 95 | made by our node. It is difficult or even impossible, especially, | 93 | made by our node. It is difficult or even impossible, especially, |
| 96 | taking into account fragmentation. TO be short, ttl is not solution at all. | 94 | taking into account fragmentation. TO be short, tt is not solution at all. |
| 97 | 95 | ||
| 98 | Current solution: The solution was UNEXPECTEDLY SIMPLE. | 96 | Current solution: The solution was UNEXPECTEDLY SIMPLE. |
| 99 | We force DF flag on tunnels with preconfigured hop limit, | 97 | We force DF flag on tunnels with preconfigured hop limit, |
| 100 | that is ALL. :-) Well, it does not remove the problem completely, | 98 | that is ALL. :-) Well, it does not remove the problem completely, |
| 101 | but exponential growth of network traffic is changed to linear | 99 | but exponential growth of network traffic is changed to linear |
| 102 | (branches, that exceed pmtu are pruned) and tunnel mtu | 100 | (branches, that exceed pmtu are pruned) and tunnel mtu |
| 103 | rapidly degrades to value <68, where looping stops. | 101 | fastly degrades to value <68, where looping stops. |
| 104 | Yes, it is not good if there exists a router in the loop, | 102 | Yes, it is not good if there exists a router in the loop, |
| 105 | which does not force DF, even when encapsulating packets have DF set. | 103 | which does not force DF, even when encapsulating packets have DF set. |
| 106 | But it is not our problem! Nobody could accuse us, we made | 104 | But it is not our problem! Nobody could accuse us, we made |
| @@ -120,10 +118,6 @@ | |||
| 120 | Alexey Kuznetsov. | 118 | Alexey Kuznetsov. |
| 121 | */ | 119 | */ |
| 122 | 120 | ||
| 123 | static bool log_ecn_error = true; | ||
| 124 | module_param(log_ecn_error, bool, 0644); | ||
| 125 | MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); | ||
| 126 | |||
| 127 | static struct rtnl_link_ops ipgre_link_ops __read_mostly; | 121 | static struct rtnl_link_ops ipgre_link_ops __read_mostly; |
| 128 | static int ipgre_tunnel_init(struct net_device *dev); | 122 | static int ipgre_tunnel_init(struct net_device *dev); |
| 129 | static void ipgre_tunnel_setup(struct net_device *dev); | 123 | static void ipgre_tunnel_setup(struct net_device *dev); |
| @@ -164,66 +158,46 @@ struct ipgre_net { | |||
| 164 | #define tunnels_r tunnels[2] | 158 | #define tunnels_r tunnels[2] |
| 165 | #define tunnels_l tunnels[1] | 159 | #define tunnels_l tunnels[1] |
| 166 | #define tunnels_wc tunnels[0] | 160 | #define tunnels_wc tunnels[0] |
| 161 | /* | ||
| 162 | * Locking : hash tables are protected by RCU and RTNL | ||
| 163 | */ | ||
| 164 | |||
| 165 | #define for_each_ip_tunnel_rcu(start) \ | ||
| 166 | for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) | ||
| 167 | 167 | ||
| 168 | static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev, | 168 | /* often modified stats are per cpu, other are shared (netdev->stats) */ |
| 169 | struct rtnl_link_stats64 *tot) | 169 | struct pcpu_tstats { |
| 170 | unsigned long rx_packets; | ||
| 171 | unsigned long rx_bytes; | ||
| 172 | unsigned long tx_packets; | ||
| 173 | unsigned long tx_bytes; | ||
| 174 | }; | ||
| 175 | |||
| 176 | static struct net_device_stats *ipgre_get_stats(struct net_device *dev) | ||
| 170 | { | 177 | { |
| 178 | struct pcpu_tstats sum = { 0 }; | ||
| 171 | int i; | 179 | int i; |
| 172 | 180 | ||
| 173 | for_each_possible_cpu(i) { | 181 | for_each_possible_cpu(i) { |
| 174 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | 182 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); |
| 175 | u64 rx_packets, rx_bytes, tx_packets, tx_bytes; | ||
| 176 | unsigned int start; | ||
| 177 | |||
| 178 | do { | ||
| 179 | start = u64_stats_fetch_begin_bh(&tstats->syncp); | ||
| 180 | rx_packets = tstats->rx_packets; | ||
| 181 | tx_packets = tstats->tx_packets; | ||
| 182 | rx_bytes = tstats->rx_bytes; | ||
| 183 | tx_bytes = tstats->tx_bytes; | ||
| 184 | } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); | ||
| 185 | |||
| 186 | tot->rx_packets += rx_packets; | ||
| 187 | tot->tx_packets += tx_packets; | ||
| 188 | tot->rx_bytes += rx_bytes; | ||
| 189 | tot->tx_bytes += tx_bytes; | ||
| 190 | } | ||
| 191 | |||
| 192 | tot->multicast = dev->stats.multicast; | ||
| 193 | tot->rx_crc_errors = dev->stats.rx_crc_errors; | ||
| 194 | tot->rx_fifo_errors = dev->stats.rx_fifo_errors; | ||
| 195 | tot->rx_length_errors = dev->stats.rx_length_errors; | ||
| 196 | tot->rx_frame_errors = dev->stats.rx_frame_errors; | ||
| 197 | tot->rx_errors = dev->stats.rx_errors; | ||
| 198 | |||
| 199 | tot->tx_fifo_errors = dev->stats.tx_fifo_errors; | ||
| 200 | tot->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
| 201 | tot->tx_dropped = dev->stats.tx_dropped; | ||
| 202 | tot->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
| 203 | tot->tx_errors = dev->stats.tx_errors; | ||
| 204 | 183 | ||
| 205 | return tot; | 184 | sum.rx_packets += tstats->rx_packets; |
| 206 | } | 185 | sum.rx_bytes += tstats->rx_bytes; |
| 207 | 186 | sum.tx_packets += tstats->tx_packets; | |
| 208 | /* Does key in tunnel parameters match packet */ | 187 | sum.tx_bytes += tstats->tx_bytes; |
| 209 | static bool ipgre_key_match(const struct ip_tunnel_parm *p, | 188 | } |
| 210 | __be16 flags, __be32 key) | 189 | dev->stats.rx_packets = sum.rx_packets; |
| 211 | { | 190 | dev->stats.rx_bytes = sum.rx_bytes; |
| 212 | if (p->i_flags & GRE_KEY) { | 191 | dev->stats.tx_packets = sum.tx_packets; |
| 213 | if (flags & GRE_KEY) | 192 | dev->stats.tx_bytes = sum.tx_bytes; |
| 214 | return key == p->i_key; | 193 | return &dev->stats; |
| 215 | else | ||
| 216 | return false; /* key expected, none present */ | ||
| 217 | } else | ||
| 218 | return !(flags & GRE_KEY); | ||
| 219 | } | 194 | } |
| 220 | 195 | ||
| 221 | /* Given src, dst and key, find appropriate for input tunnel. */ | 196 | /* Given src, dst and key, find appropriate for input tunnel. */ |
| 222 | 197 | ||
| 223 | static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, | 198 | static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, |
| 224 | __be32 remote, __be32 local, | 199 | __be32 remote, __be32 local, |
| 225 | __be16 flags, __be32 key, | 200 | __be32 key, __be16 gre_proto) |
| 226 | __be16 gre_proto) | ||
| 227 | { | 201 | { |
| 228 | struct net *net = dev_net(dev); | 202 | struct net *net = dev_net(dev); |
| 229 | int link = dev->ifindex; | 203 | int link = dev->ifindex; |
| @@ -235,15 +209,13 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, | |||
| 235 | ARPHRD_ETHER : ARPHRD_IPGRE; | 209 | ARPHRD_ETHER : ARPHRD_IPGRE; |
| 236 | int score, cand_score = 4; | 210 | int score, cand_score = 4; |
| 237 | 211 | ||
| 238 | for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { | 212 | for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { |
| 239 | if (local != t->parms.iph.saddr || | 213 | if (local != t->parms.iph.saddr || |
| 240 | remote != t->parms.iph.daddr || | 214 | remote != t->parms.iph.daddr || |
| 215 | key != t->parms.i_key || | ||
| 241 | !(t->dev->flags & IFF_UP)) | 216 | !(t->dev->flags & IFF_UP)) |
| 242 | continue; | 217 | continue; |
| 243 | 218 | ||
| 244 | if (!ipgre_key_match(&t->parms, flags, key)) | ||
| 245 | continue; | ||
| 246 | |||
| 247 | if (t->dev->type != ARPHRD_IPGRE && | 219 | if (t->dev->type != ARPHRD_IPGRE && |
| 248 | t->dev->type != dev_type) | 220 | t->dev->type != dev_type) |
| 249 | continue; | 221 | continue; |
| @@ -262,14 +234,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, | |||
| 262 | } | 234 | } |
| 263 | } | 235 | } |
| 264 | 236 | ||
| 265 | for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { | 237 | for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { |
| 266 | if (remote != t->parms.iph.daddr || | 238 | if (remote != t->parms.iph.daddr || |
| 239 | key != t->parms.i_key || | ||
| 267 | !(t->dev->flags & IFF_UP)) | 240 | !(t->dev->flags & IFF_UP)) |
| 268 | continue; | 241 | continue; |
| 269 | 242 | ||
| 270 | if (!ipgre_key_match(&t->parms, flags, key)) | ||
| 271 | continue; | ||
| 272 | |||
| 273 | if (t->dev->type != ARPHRD_IPGRE && | 243 | if (t->dev->type != ARPHRD_IPGRE && |
| 274 | t->dev->type != dev_type) | 244 | t->dev->type != dev_type) |
| 275 | continue; | 245 | continue; |
| @@ -288,16 +258,14 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, | |||
| 288 | } | 258 | } |
| 289 | } | 259 | } |
| 290 | 260 | ||
| 291 | for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { | 261 | for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) { |
| 292 | if ((local != t->parms.iph.saddr && | 262 | if ((local != t->parms.iph.saddr && |
| 293 | (local != t->parms.iph.daddr || | 263 | (local != t->parms.iph.daddr || |
| 294 | !ipv4_is_multicast(local))) || | 264 | !ipv4_is_multicast(local))) || |
| 265 | key != t->parms.i_key || | ||
| 295 | !(t->dev->flags & IFF_UP)) | 266 | !(t->dev->flags & IFF_UP)) |
| 296 | continue; | 267 | continue; |
| 297 | 268 | ||
| 298 | if (!ipgre_key_match(&t->parms, flags, key)) | ||
| 299 | continue; | ||
| 300 | |||
| 301 | if (t->dev->type != ARPHRD_IPGRE && | 269 | if (t->dev->type != ARPHRD_IPGRE && |
| 302 | t->dev->type != dev_type) | 270 | t->dev->type != dev_type) |
| 303 | continue; | 271 | continue; |
| @@ -316,7 +284,7 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, | |||
| 316 | } | 284 | } |
| 317 | } | 285 | } |
| 318 | 286 | ||
| 319 | for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { | 287 | for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) { |
| 320 | if (t->parms.i_key != key || | 288 | if (t->parms.i_key != key || |
| 321 | !(t->dev->flags & IFF_UP)) | 289 | !(t->dev->flags & IFF_UP)) |
| 322 | continue; | 290 | continue; |
| @@ -454,10 +422,6 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, | |||
| 454 | if (register_netdevice(dev) < 0) | 422 | if (register_netdevice(dev) < 0) |
| 455 | goto failed_free; | 423 | goto failed_free; |
| 456 | 424 | ||
| 457 | /* Can use a lockless transmit, unless we generate output sequences */ | ||
| 458 | if (!(nt->parms.o_flags & GRE_SEQ)) | ||
| 459 | dev->features |= NETIF_F_LLTX; | ||
| 460 | |||
| 461 | dev_hold(dev); | 425 | dev_hold(dev); |
| 462 | ipgre_tunnel_link(ign, nt); | 426 | ipgre_tunnel_link(ign, nt); |
| 463 | return nt; | 427 | return nt; |
| @@ -489,18 +453,17 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
| 489 | GRE tunnels with enabled checksum. Tell them "thank you". | 453 | GRE tunnels with enabled checksum. Tell them "thank you". |
| 490 | 454 | ||
| 491 | Well, I wonder, rfc1812 was written by Cisco employee, | 455 | Well, I wonder, rfc1812 was written by Cisco employee, |
| 492 | what the hell these idiots break standards established | 456 | what the hell these idiots break standrads established |
| 493 | by themselves??? | 457 | by themself??? |
| 494 | */ | 458 | */ |
| 495 | 459 | ||
| 496 | const struct iphdr *iph = (const struct iphdr *)skb->data; | 460 | const struct iphdr *iph = (const struct iphdr *)skb->data; |
| 497 | __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2)); | 461 | __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); |
| 498 | int grehlen = (iph->ihl<<2) + 4; | 462 | int grehlen = (iph->ihl<<2) + 4; |
| 499 | const int type = icmp_hdr(skb)->type; | 463 | const int type = icmp_hdr(skb)->type; |
| 500 | const int code = icmp_hdr(skb)->code; | 464 | const int code = icmp_hdr(skb)->code; |
| 501 | struct ip_tunnel *t; | 465 | struct ip_tunnel *t; |
| 502 | __be16 flags; | 466 | __be16 flags; |
| 503 | __be32 key = 0; | ||
| 504 | 467 | ||
| 505 | flags = p[0]; | 468 | flags = p[0]; |
| 506 | if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { | 469 | if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { |
| @@ -517,9 +480,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
| 517 | if (skb_headlen(skb) < grehlen) | 480 | if (skb_headlen(skb) < grehlen) |
| 518 | return; | 481 | return; |
| 519 | 482 | ||
| 520 | if (flags & GRE_KEY) | ||
| 521 | key = *(((__be32 *)p) + (grehlen / 4) - 1); | ||
| 522 | |||
| 523 | switch (type) { | 483 | switch (type) { |
| 524 | default: | 484 | default: |
| 525 | case ICMP_PARAMETERPROB: | 485 | case ICMP_PARAMETERPROB: |
| @@ -531,6 +491,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
| 531 | case ICMP_PORT_UNREACH: | 491 | case ICMP_PORT_UNREACH: |
| 532 | /* Impossible event. */ | 492 | /* Impossible event. */ |
| 533 | return; | 493 | return; |
| 494 | case ICMP_FRAG_NEEDED: | ||
| 495 | /* Soft state for pmtu is maintained by IP core. */ | ||
| 496 | return; | ||
| 534 | default: | 497 | default: |
| 535 | /* All others are translated to HOST_UNREACH. | 498 | /* All others are translated to HOST_UNREACH. |
| 536 | rfc2003 contains "deep thoughts" about NET_UNREACH, | 499 | rfc2003 contains "deep thoughts" about NET_UNREACH, |
| @@ -543,39 +506,38 @@ static void ipgre_err(struct sk_buff *skb, u32 info) | |||
| 543 | if (code != ICMP_EXC_TTL) | 506 | if (code != ICMP_EXC_TTL) |
| 544 | return; | 507 | return; |
| 545 | break; | 508 | break; |
| 546 | |||
| 547 | case ICMP_REDIRECT: | ||
| 548 | break; | ||
| 549 | } | 509 | } |
| 550 | 510 | ||
| 511 | rcu_read_lock(); | ||
| 551 | t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, | 512 | t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, |
| 552 | flags, key, p[1]); | 513 | flags & GRE_KEY ? |
| 553 | 514 | *(((__be32 *)p) + (grehlen / 4) - 1) : 0, | |
| 554 | if (t == NULL) | 515 | p[1]); |
| 555 | return; | 516 | if (t == NULL || t->parms.iph.daddr == 0 || |
| 556 | |||
| 557 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { | ||
| 558 | ipv4_update_pmtu(skb, dev_net(skb->dev), info, | ||
| 559 | t->parms.link, 0, IPPROTO_GRE, 0); | ||
| 560 | return; | ||
| 561 | } | ||
| 562 | if (type == ICMP_REDIRECT) { | ||
| 563 | ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, | ||
| 564 | IPPROTO_GRE, 0); | ||
| 565 | return; | ||
| 566 | } | ||
| 567 | if (t->parms.iph.daddr == 0 || | ||
| 568 | ipv4_is_multicast(t->parms.iph.daddr)) | 517 | ipv4_is_multicast(t->parms.iph.daddr)) |
| 569 | return; | 518 | goto out; |
| 570 | 519 | ||
| 571 | if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) | 520 | if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) |
| 572 | return; | 521 | goto out; |
| 573 | 522 | ||
| 574 | if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) | 523 | if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) |
| 575 | t->err_count++; | 524 | t->err_count++; |
| 576 | else | 525 | else |
| 577 | t->err_count = 1; | 526 | t->err_count = 1; |
| 578 | t->err_time = jiffies; | 527 | t->err_time = jiffies; |
| 528 | out: | ||
| 529 | rcu_read_unlock(); | ||
| 530 | } | ||
| 531 | |||
| 532 | static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) | ||
| 533 | { | ||
| 534 | if (INET_ECN_is_ce(iph->tos)) { | ||
| 535 | if (skb->protocol == htons(ETH_P_IP)) { | ||
| 536 | IP_ECN_set_ce(ip_hdr(skb)); | ||
| 537 | } else if (skb->protocol == htons(ETH_P_IPV6)) { | ||
| 538 | IP6_ECN_set_ce(ipv6_hdr(skb)); | ||
| 539 | } | ||
| 540 | } | ||
| 579 | } | 541 | } |
| 580 | 542 | ||
| 581 | static inline u8 | 543 | static inline u8 |
| @@ -600,21 +562,20 @@ static int ipgre_rcv(struct sk_buff *skb) | |||
| 600 | struct ip_tunnel *tunnel; | 562 | struct ip_tunnel *tunnel; |
| 601 | int offset = 4; | 563 | int offset = 4; |
| 602 | __be16 gre_proto; | 564 | __be16 gre_proto; |
| 603 | int err; | ||
| 604 | 565 | ||
| 605 | if (!pskb_may_pull(skb, 16)) | 566 | if (!pskb_may_pull(skb, 16)) |
| 606 | goto drop; | 567 | goto drop_nolock; |
| 607 | 568 | ||
| 608 | iph = ip_hdr(skb); | 569 | iph = ip_hdr(skb); |
| 609 | h = skb->data; | 570 | h = skb->data; |
| 610 | flags = *(__be16 *)h; | 571 | flags = *(__be16*)h; |
| 611 | 572 | ||
| 612 | if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { | 573 | if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { |
| 613 | /* - Version must be 0. | 574 | /* - Version must be 0. |
| 614 | - We do not support routing headers. | 575 | - We do not support routing headers. |
| 615 | */ | 576 | */ |
| 616 | if (flags&(GRE_VERSION|GRE_ROUTING)) | 577 | if (flags&(GRE_VERSION|GRE_ROUTING)) |
| 617 | goto drop; | 578 | goto drop_nolock; |
| 618 | 579 | ||
| 619 | if (flags&GRE_CSUM) { | 580 | if (flags&GRE_CSUM) { |
| 620 | switch (skb->ip_summed) { | 581 | switch (skb->ip_summed) { |
| @@ -631,21 +592,21 @@ static int ipgre_rcv(struct sk_buff *skb) | |||
| 631 | offset += 4; | 592 | offset += 4; |
| 632 | } | 593 | } |
| 633 | if (flags&GRE_KEY) { | 594 | if (flags&GRE_KEY) { |
| 634 | key = *(__be32 *)(h + offset); | 595 | key = *(__be32*)(h + offset); |
| 635 | offset += 4; | 596 | offset += 4; |
| 636 | } | 597 | } |
| 637 | if (flags&GRE_SEQ) { | 598 | if (flags&GRE_SEQ) { |
| 638 | seqno = ntohl(*(__be32 *)(h + offset)); | 599 | seqno = ntohl(*(__be32*)(h + offset)); |
| 639 | offset += 4; | 600 | offset += 4; |
| 640 | } | 601 | } |
| 641 | } | 602 | } |
| 642 | 603 | ||
| 643 | gre_proto = *(__be16 *)(h + 2); | 604 | gre_proto = *(__be16 *)(h + 2); |
| 644 | 605 | ||
| 645 | tunnel = ipgre_tunnel_lookup(skb->dev, | 606 | rcu_read_lock(); |
| 646 | iph->saddr, iph->daddr, flags, key, | 607 | if ((tunnel = ipgre_tunnel_lookup(skb->dev, |
| 647 | gre_proto); | 608 | iph->saddr, iph->daddr, key, |
| 648 | if (tunnel) { | 609 | gre_proto))) { |
| 649 | struct pcpu_tstats *tstats; | 610 | struct pcpu_tstats *tstats; |
| 650 | 611 | ||
| 651 | secpath_reset(skb); | 612 | secpath_reset(skb); |
| @@ -704,33 +665,25 @@ static int ipgre_rcv(struct sk_buff *skb) | |||
| 704 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); | 665 | skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); |
| 705 | } | 666 | } |
| 706 | 667 | ||
| 668 | tstats = this_cpu_ptr(tunnel->dev->tstats); | ||
| 669 | tstats->rx_packets++; | ||
| 670 | tstats->rx_bytes += skb->len; | ||
| 671 | |||
| 707 | __skb_tunnel_rx(skb, tunnel->dev); | 672 | __skb_tunnel_rx(skb, tunnel->dev); |
| 708 | 673 | ||
| 709 | skb_reset_network_header(skb); | 674 | skb_reset_network_header(skb); |
| 710 | err = IP_ECN_decapsulate(iph, skb); | 675 | ipgre_ecn_decapsulate(iph, skb); |
| 711 | if (unlikely(err)) { | ||
| 712 | if (log_ecn_error) | ||
| 713 | net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", | ||
| 714 | &iph->saddr, iph->tos); | ||
| 715 | if (err > 1) { | ||
| 716 | ++tunnel->dev->stats.rx_frame_errors; | ||
| 717 | ++tunnel->dev->stats.rx_errors; | ||
| 718 | goto drop; | ||
| 719 | } | ||
| 720 | } | ||
| 721 | 676 | ||
| 722 | tstats = this_cpu_ptr(tunnel->dev->tstats); | 677 | netif_rx(skb); |
| 723 | u64_stats_update_begin(&tstats->syncp); | ||
| 724 | tstats->rx_packets++; | ||
| 725 | tstats->rx_bytes += skb->len; | ||
| 726 | u64_stats_update_end(&tstats->syncp); | ||
| 727 | 678 | ||
| 728 | gro_cells_receive(&tunnel->gro_cells, skb); | 679 | rcu_read_unlock(); |
| 729 | return 0; | 680 | return 0; |
| 730 | } | 681 | } |
| 731 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | 682 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); |
| 732 | 683 | ||
| 733 | drop: | 684 | drop: |
| 685 | rcu_read_unlock(); | ||
| 686 | drop_nolock: | ||
| 734 | kfree_skb(skb); | 687 | kfree_skb(skb); |
| 735 | return 0; | 688 | return 0; |
| 736 | } | 689 | } |
| @@ -738,6 +691,7 @@ drop: | |||
| 738 | static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 691 | static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
| 739 | { | 692 | { |
| 740 | struct ip_tunnel *tunnel = netdev_priv(dev); | 693 | struct ip_tunnel *tunnel = netdev_priv(dev); |
| 694 | struct pcpu_tstats *tstats; | ||
| 741 | const struct iphdr *old_iph = ip_hdr(skb); | 695 | const struct iphdr *old_iph = ip_hdr(skb); |
| 742 | const struct iphdr *tiph; | 696 | const struct iphdr *tiph; |
| 743 | struct flowi4 fl4; | 697 | struct flowi4 fl4; |
| @@ -750,21 +704,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 750 | int gre_hlen; | 704 | int gre_hlen; |
| 751 | __be32 dst; | 705 | __be32 dst; |
| 752 | int mtu; | 706 | int mtu; |
| 753 | u8 ttl; | ||
| 754 | |||
| 755 | if (skb->ip_summed == CHECKSUM_PARTIAL && | ||
| 756 | skb_checksum_help(skb)) | ||
| 757 | goto tx_error; | ||
| 758 | 707 | ||
| 759 | if (dev->type == ARPHRD_ETHER) | 708 | if (dev->type == ARPHRD_ETHER) |
| 760 | IPCB(skb)->flags = 0; | 709 | IPCB(skb)->flags = 0; |
| 761 | 710 | ||
| 762 | if (dev->header_ops && dev->type == ARPHRD_IPGRE) { | 711 | if (dev->header_ops && dev->type == ARPHRD_IPGRE) { |
| 763 | gre_hlen = 0; | 712 | gre_hlen = 0; |
| 764 | if (skb->protocol == htons(ETH_P_IP)) | 713 | tiph = (const struct iphdr *)skb->data; |
| 765 | tiph = (const struct iphdr *)skb->data; | ||
| 766 | else | ||
| 767 | tiph = &tunnel->parms.iph; | ||
| 768 | } else { | 714 | } else { |
| 769 | gre_hlen = tunnel->hlen; | 715 | gre_hlen = tunnel->hlen; |
| 770 | tiph = &tunnel->parms.iph; | 716 | tiph = &tunnel->parms.iph; |
| @@ -780,16 +726,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 780 | 726 | ||
| 781 | if (skb->protocol == htons(ETH_P_IP)) { | 727 | if (skb->protocol == htons(ETH_P_IP)) { |
| 782 | rt = skb_rtable(skb); | 728 | rt = skb_rtable(skb); |
| 783 | dst = rt_nexthop(rt, old_iph->daddr); | 729 | if ((dst = rt->rt_gateway) == 0) |
| 730 | goto tx_error_icmp; | ||
| 784 | } | 731 | } |
| 785 | #if IS_ENABLED(CONFIG_IPV6) | 732 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 786 | else if (skb->protocol == htons(ETH_P_IPV6)) { | 733 | else if (skb->protocol == htons(ETH_P_IPV6)) { |
| 734 | struct neighbour *neigh = dst_get_neighbour(skb_dst(skb)); | ||
| 787 | const struct in6_addr *addr6; | 735 | const struct in6_addr *addr6; |
| 788 | struct neighbour *neigh; | ||
| 789 | bool do_tx_error_icmp; | ||
| 790 | int addr_type; | 736 | int addr_type; |
| 791 | 737 | ||
| 792 | neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); | ||
| 793 | if (neigh == NULL) | 738 | if (neigh == NULL) |
| 794 | goto tx_error; | 739 | goto tx_error; |
| 795 | 740 | ||
| @@ -802,21 +747,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 802 | } | 747 | } |
| 803 | 748 | ||
| 804 | if ((addr_type & IPV6_ADDR_COMPATv4) == 0) | 749 | if ((addr_type & IPV6_ADDR_COMPATv4) == 0) |
| 805 | do_tx_error_icmp = true; | ||
| 806 | else { | ||
| 807 | do_tx_error_icmp = false; | ||
| 808 | dst = addr6->s6_addr32[3]; | ||
| 809 | } | ||
| 810 | neigh_release(neigh); | ||
| 811 | if (do_tx_error_icmp) | ||
| 812 | goto tx_error_icmp; | 750 | goto tx_error_icmp; |
| 751 | |||
| 752 | dst = addr6->s6_addr32[3]; | ||
| 813 | } | 753 | } |
| 814 | #endif | 754 | #endif |
| 815 | else | 755 | else |
| 816 | goto tx_error; | 756 | goto tx_error; |
| 817 | } | 757 | } |
| 818 | 758 | ||
| 819 | ttl = tiph->ttl; | ||
| 820 | tos = tiph->tos; | 759 | tos = tiph->tos; |
| 821 | if (tos == 1) { | 760 | if (tos == 1) { |
| 822 | tos = 0; | 761 | tos = 0; |
| @@ -848,7 +787,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 848 | mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; | 787 | mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; |
| 849 | 788 | ||
| 850 | if (skb_dst(skb)) | 789 | if (skb_dst(skb)) |
| 851 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); | 790 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); |
| 852 | 791 | ||
| 853 | if (skb->protocol == htons(ETH_P_IP)) { | 792 | if (skb->protocol == htons(ETH_P_IP)) { |
| 854 | df |= (old_iph->frag_off&htons(IP_DF)); | 793 | df |= (old_iph->frag_off&htons(IP_DF)); |
| @@ -860,7 +799,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 860 | goto tx_error; | 799 | goto tx_error; |
| 861 | } | 800 | } |
| 862 | } | 801 | } |
| 863 | #if IS_ENABLED(CONFIG_IPV6) | 802 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 864 | else if (skb->protocol == htons(ETH_P_IPV6)) { | 803 | else if (skb->protocol == htons(ETH_P_IPV6)) { |
| 865 | struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); | 804 | struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); |
| 866 | 805 | ||
| @@ -909,12 +848,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 909 | dev_kfree_skb(skb); | 848 | dev_kfree_skb(skb); |
| 910 | skb = new_skb; | 849 | skb = new_skb; |
| 911 | old_iph = ip_hdr(skb); | 850 | old_iph = ip_hdr(skb); |
| 912 | /* Warning : tiph value might point to freed memory */ | ||
| 913 | } | 851 | } |
| 914 | 852 | ||
| 853 | skb_reset_transport_header(skb); | ||
| 915 | skb_push(skb, gre_hlen); | 854 | skb_push(skb, gre_hlen); |
| 916 | skb_reset_network_header(skb); | 855 | skb_reset_network_header(skb); |
| 917 | skb_set_transport_header(skb, sizeof(*iph)); | ||
| 918 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | 856 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); |
| 919 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | | 857 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | |
| 920 | IPSKB_REROUTED); | 858 | IPSKB_REROUTED); |
| @@ -933,12 +871,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 933 | iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); | 871 | iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); |
| 934 | iph->daddr = fl4.daddr; | 872 | iph->daddr = fl4.daddr; |
| 935 | iph->saddr = fl4.saddr; | 873 | iph->saddr = fl4.saddr; |
| 936 | iph->ttl = ttl; | ||
| 937 | 874 | ||
| 938 | if (ttl == 0) { | 875 | if ((iph->ttl = tiph->ttl) == 0) { |
| 939 | if (skb->protocol == htons(ETH_P_IP)) | 876 | if (skb->protocol == htons(ETH_P_IP)) |
| 940 | iph->ttl = old_iph->ttl; | 877 | iph->ttl = old_iph->ttl; |
| 941 | #if IS_ENABLED(CONFIG_IPV6) | 878 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 942 | else if (skb->protocol == htons(ETH_P_IPV6)) | 879 | else if (skb->protocol == htons(ETH_P_IPV6)) |
| 943 | iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; | 880 | iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; |
| 944 | #endif | 881 | #endif |
| @@ -951,7 +888,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 951 | htons(ETH_P_TEB) : skb->protocol; | 888 | htons(ETH_P_TEB) : skb->protocol; |
| 952 | 889 | ||
| 953 | if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { | 890 | if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { |
| 954 | __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4); | 891 | __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); |
| 955 | 892 | ||
| 956 | if (tunnel->parms.o_flags&GRE_SEQ) { | 893 | if (tunnel->parms.o_flags&GRE_SEQ) { |
| 957 | ++tunnel->o_seqno; | 894 | ++tunnel->o_seqno; |
| @@ -964,17 +901,18 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev | |||
| 964 | } | 901 | } |
| 965 | if (tunnel->parms.o_flags&GRE_CSUM) { | 902 | if (tunnel->parms.o_flags&GRE_CSUM) { |
| 966 | *ptr = 0; | 903 | *ptr = 0; |
| 967 | *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr)); | 904 | *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); |
| 968 | } | 905 | } |
| 969 | } | 906 | } |
| 970 | 907 | ||
| 971 | iptunnel_xmit(skb, dev); | 908 | nf_reset(skb); |
| 909 | tstats = this_cpu_ptr(dev->tstats); | ||
| 910 | __IPTUNNEL_XMIT(tstats, &dev->stats); | ||
| 972 | return NETDEV_TX_OK; | 911 | return NETDEV_TX_OK; |
| 973 | 912 | ||
| 974 | #if IS_ENABLED(CONFIG_IPV6) | ||
| 975 | tx_error_icmp: | 913 | tx_error_icmp: |
| 976 | dst_link_failure(skb); | 914 | dst_link_failure(skb); |
| 977 | #endif | 915 | |
| 978 | tx_error: | 916 | tx_error: |
| 979 | dev->stats.tx_errors++; | 917 | dev->stats.tx_errors++; |
| 980 | dev_kfree_skb(skb); | 918 | dev_kfree_skb(skb); |
| @@ -1071,7 +1009,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | |||
| 1071 | case SIOCADDTUNNEL: | 1009 | case SIOCADDTUNNEL: |
| 1072 | case SIOCCHGTUNNEL: | 1010 | case SIOCCHGTUNNEL: |
| 1073 | err = -EPERM; | 1011 | err = -EPERM; |
| 1074 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 1012 | if (!capable(CAP_NET_ADMIN)) |
| 1075 | goto done; | 1013 | goto done; |
| 1076 | 1014 | ||
| 1077 | err = -EFAULT; | 1015 | err = -EFAULT; |
| @@ -1146,7 +1084,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | |||
| 1146 | 1084 | ||
| 1147 | case SIOCDELTUNNEL: | 1085 | case SIOCDELTUNNEL: |
| 1148 | err = -EPERM; | 1086 | err = -EPERM; |
| 1149 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 1087 | if (!capable(CAP_NET_ADMIN)) |
| 1150 | goto done; | 1088 | goto done; |
| 1151 | 1089 | ||
| 1152 | if (dev == ign->fb_tunnel_dev) { | 1090 | if (dev == ign->fb_tunnel_dev) { |
| @@ -1218,7 +1156,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, | |||
| 1218 | { | 1156 | { |
| 1219 | struct ip_tunnel *t = netdev_priv(dev); | 1157 | struct ip_tunnel *t = netdev_priv(dev); |
| 1220 | struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); | 1158 | struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); |
| 1221 | __be16 *p = (__be16 *)(iph+1); | 1159 | __be16 *p = (__be16*)(iph+1); |
| 1222 | 1160 | ||
| 1223 | memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); | 1161 | memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); |
| 1224 | p[0] = t->parms.o_flags; | 1162 | p[0] = t->parms.o_flags; |
| @@ -1302,23 +1240,15 @@ static const struct net_device_ops ipgre_netdev_ops = { | |||
| 1302 | .ndo_start_xmit = ipgre_tunnel_xmit, | 1240 | .ndo_start_xmit = ipgre_tunnel_xmit, |
| 1303 | .ndo_do_ioctl = ipgre_tunnel_ioctl, | 1241 | .ndo_do_ioctl = ipgre_tunnel_ioctl, |
| 1304 | .ndo_change_mtu = ipgre_tunnel_change_mtu, | 1242 | .ndo_change_mtu = ipgre_tunnel_change_mtu, |
| 1305 | .ndo_get_stats64 = ipgre_get_stats64, | 1243 | .ndo_get_stats = ipgre_get_stats, |
| 1306 | }; | 1244 | }; |
| 1307 | 1245 | ||
| 1308 | static void ipgre_dev_free(struct net_device *dev) | 1246 | static void ipgre_dev_free(struct net_device *dev) |
| 1309 | { | 1247 | { |
| 1310 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
| 1311 | |||
| 1312 | gro_cells_destroy(&tunnel->gro_cells); | ||
| 1313 | free_percpu(dev->tstats); | 1248 | free_percpu(dev->tstats); |
| 1314 | free_netdev(dev); | 1249 | free_netdev(dev); |
| 1315 | } | 1250 | } |
| 1316 | 1251 | ||
| 1317 | #define GRE_FEATURES (NETIF_F_SG | \ | ||
| 1318 | NETIF_F_FRAGLIST | \ | ||
| 1319 | NETIF_F_HIGHDMA | \ | ||
| 1320 | NETIF_F_HW_CSUM) | ||
| 1321 | |||
| 1322 | static void ipgre_tunnel_setup(struct net_device *dev) | 1252 | static void ipgre_tunnel_setup(struct net_device *dev) |
| 1323 | { | 1253 | { |
| 1324 | dev->netdev_ops = &ipgre_netdev_ops; | 1254 | dev->netdev_ops = &ipgre_netdev_ops; |
| @@ -1332,16 +1262,12 @@ static void ipgre_tunnel_setup(struct net_device *dev) | |||
| 1332 | dev->addr_len = 4; | 1262 | dev->addr_len = 4; |
| 1333 | dev->features |= NETIF_F_NETNS_LOCAL; | 1263 | dev->features |= NETIF_F_NETNS_LOCAL; |
| 1334 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 1264 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; |
| 1335 | |||
| 1336 | dev->features |= GRE_FEATURES; | ||
| 1337 | dev->hw_features |= GRE_FEATURES; | ||
| 1338 | } | 1265 | } |
| 1339 | 1266 | ||
| 1340 | static int ipgre_tunnel_init(struct net_device *dev) | 1267 | static int ipgre_tunnel_init(struct net_device *dev) |
| 1341 | { | 1268 | { |
| 1342 | struct ip_tunnel *tunnel; | 1269 | struct ip_tunnel *tunnel; |
| 1343 | struct iphdr *iph; | 1270 | struct iphdr *iph; |
| 1344 | int err; | ||
| 1345 | 1271 | ||
| 1346 | tunnel = netdev_priv(dev); | 1272 | tunnel = netdev_priv(dev); |
| 1347 | iph = &tunnel->parms.iph; | 1273 | iph = &tunnel->parms.iph; |
| @@ -1368,12 +1294,6 @@ static int ipgre_tunnel_init(struct net_device *dev) | |||
| 1368 | if (!dev->tstats) | 1294 | if (!dev->tstats) |
| 1369 | return -ENOMEM; | 1295 | return -ENOMEM; |
| 1370 | 1296 | ||
| 1371 | err = gro_cells_init(&tunnel->gro_cells, dev); | ||
| 1372 | if (err) { | ||
| 1373 | free_percpu(dev->tstats); | ||
| 1374 | return err; | ||
| 1375 | } | ||
| 1376 | |||
| 1377 | return 0; | 1297 | return 0; |
| 1378 | } | 1298 | } |
| 1379 | 1299 | ||
| @@ -1574,7 +1494,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = { | |||
| 1574 | .ndo_set_mac_address = eth_mac_addr, | 1494 | .ndo_set_mac_address = eth_mac_addr, |
| 1575 | .ndo_validate_addr = eth_validate_addr, | 1495 | .ndo_validate_addr = eth_validate_addr, |
| 1576 | .ndo_change_mtu = ipgre_tunnel_change_mtu, | 1496 | .ndo_change_mtu = ipgre_tunnel_change_mtu, |
| 1577 | .ndo_get_stats64 = ipgre_get_stats64, | 1497 | .ndo_get_stats = ipgre_get_stats, |
| 1578 | }; | 1498 | }; |
| 1579 | 1499 | ||
| 1580 | static void ipgre_tap_setup(struct net_device *dev) | 1500 | static void ipgre_tap_setup(struct net_device *dev) |
| @@ -1605,7 +1525,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla | |||
| 1605 | return -EEXIST; | 1525 | return -EEXIST; |
| 1606 | 1526 | ||
| 1607 | if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) | 1527 | if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) |
| 1608 | eth_hw_addr_random(dev); | 1528 | random_ether_addr(dev->dev_addr); |
| 1609 | 1529 | ||
| 1610 | mtu = ipgre_tunnel_bind_dev(dev); | 1530 | mtu = ipgre_tunnel_bind_dev(dev); |
| 1611 | if (!tb[IFLA_MTU]) | 1531 | if (!tb[IFLA_MTU]) |
| @@ -1721,18 +1641,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) | |||
| 1721 | struct ip_tunnel *t = netdev_priv(dev); | 1641 | struct ip_tunnel *t = netdev_priv(dev); |
| 1722 | struct ip_tunnel_parm *p = &t->parms; | 1642 | struct ip_tunnel_parm *p = &t->parms; |
| 1723 | 1643 | ||
| 1724 | if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || | 1644 | NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); |
| 1725 | nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || | 1645 | NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); |
| 1726 | nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || | 1646 | NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); |
| 1727 | nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || | 1647 | NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); |
| 1728 | nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || | 1648 | NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); |
| 1729 | nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || | 1649 | NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr); |
| 1730 | nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || | 1650 | NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr); |
| 1731 | nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || | 1651 | NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); |
| 1732 | nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || | 1652 | NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); |
| 1733 | nla_put_u8(skb, IFLA_GRE_PMTUDISC, | 1653 | NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); |
| 1734 | !!(p->iph.frag_off & htons(IP_DF)))) | 1654 | |
| 1735 | goto nla_put_failure; | ||
| 1736 | return 0; | 1655 | return 0; |
| 1737 | 1656 | ||
| 1738 | nla_put_failure: | 1657 | nla_put_failure: |
| @@ -1786,7 +1705,7 @@ static int __init ipgre_init(void) | |||
| 1786 | { | 1705 | { |
| 1787 | int err; | 1706 | int err; |
| 1788 | 1707 | ||
| 1789 | pr_info("GRE over IPv4 tunneling driver\n"); | 1708 | printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); |
| 1790 | 1709 | ||
| 1791 | err = register_pernet_device(&ipgre_net_ops); | 1710 | err = register_pernet_device(&ipgre_net_ops); |
| 1792 | if (err < 0) | 1711 | if (err < 0) |
| @@ -1794,7 +1713,7 @@ static int __init ipgre_init(void) | |||
| 1794 | 1713 | ||
| 1795 | err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); | 1714 | err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); |
| 1796 | if (err < 0) { | 1715 | if (err < 0) { |
| 1797 | pr_info("%s: can't add protocol\n", __func__); | 1716 | printk(KERN_INFO "ipgre init: can't add protocol\n"); |
| 1798 | goto add_proto_failed; | 1717 | goto add_proto_failed; |
| 1799 | } | 1718 | } |
| 1800 | 1719 | ||
| @@ -1823,7 +1742,7 @@ static void __exit ipgre_fini(void) | |||
| 1823 | rtnl_link_unregister(&ipgre_tap_ops); | 1742 | rtnl_link_unregister(&ipgre_tap_ops); |
| 1824 | rtnl_link_unregister(&ipgre_link_ops); | 1743 | rtnl_link_unregister(&ipgre_link_ops); |
| 1825 | if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) | 1744 | if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) |
| 1826 | pr_info("%s: can't remove protocol\n", __func__); | 1745 | printk(KERN_INFO "ipgre close: can't remove protocol\n"); |
| 1827 | unregister_pernet_device(&ipgre_net_ops); | 1746 | unregister_pernet_device(&ipgre_net_ops); |
| 1828 | } | 1747 | } |
| 1829 | 1748 | ||
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f1395a6fb35..073a9b01c40 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
| @@ -113,8 +113,7 @@ | |||
| 113 | * 2 of the License, or (at your option) any later version. | 113 | * 2 of the License, or (at your option) any later version. |
| 114 | */ | 114 | */ |
| 115 | 115 | ||
| 116 | #define pr_fmt(fmt) "IPv4: " fmt | 116 | #include <asm/system.h> |
| 117 | |||
| 118 | #include <linux/module.h> | 117 | #include <linux/module.h> |
| 119 | #include <linux/types.h> | 118 | #include <linux/types.h> |
| 120 | #include <linux/kernel.h> | 119 | #include <linux/kernel.h> |
| @@ -149,7 +148,7 @@ | |||
| 149 | /* | 148 | /* |
| 150 | * Process Router Attention IP option (RFC 2113) | 149 | * Process Router Attention IP option (RFC 2113) |
| 151 | */ | 150 | */ |
| 152 | bool ip_call_ra_chain(struct sk_buff *skb) | 151 | int ip_call_ra_chain(struct sk_buff *skb) |
| 153 | { | 152 | { |
| 154 | struct ip_ra_chain *ra; | 153 | struct ip_ra_chain *ra; |
| 155 | u8 protocol = ip_hdr(skb)->protocol; | 154 | u8 protocol = ip_hdr(skb)->protocol; |
| @@ -168,7 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) | |||
| 168 | net_eq(sock_net(sk), dev_net(dev))) { | 167 | net_eq(sock_net(sk), dev_net(dev))) { |
| 169 | if (ip_is_fragment(ip_hdr(skb))) { | 168 | if (ip_is_fragment(ip_hdr(skb))) { |
| 170 | if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) | 169 | if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) |
| 171 | return true; | 170 | return 1; |
| 172 | } | 171 | } |
| 173 | if (last) { | 172 | if (last) { |
| 174 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); | 173 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); |
| @@ -181,9 +180,9 @@ bool ip_call_ra_chain(struct sk_buff *skb) | |||
| 181 | 180 | ||
| 182 | if (last) { | 181 | if (last) { |
| 183 | raw_rcv(last, skb); | 182 | raw_rcv(last, skb); |
| 184 | return true; | 183 | return 1; |
| 185 | } | 184 | } |
| 186 | return false; | 185 | return 0; |
| 187 | } | 186 | } |
| 188 | 187 | ||
| 189 | static int ip_local_deliver_finish(struct sk_buff *skb) | 188 | static int ip_local_deliver_finish(struct sk_buff *skb) |
| @@ -198,19 +197,21 @@ static int ip_local_deliver_finish(struct sk_buff *skb) | |||
| 198 | rcu_read_lock(); | 197 | rcu_read_lock(); |
| 199 | { | 198 | { |
| 200 | int protocol = ip_hdr(skb)->protocol; | 199 | int protocol = ip_hdr(skb)->protocol; |
| 200 | int hash, raw; | ||
| 201 | const struct net_protocol *ipprot; | 201 | const struct net_protocol *ipprot; |
| 202 | int raw; | ||
| 203 | 202 | ||
| 204 | resubmit: | 203 | resubmit: |
| 205 | raw = raw_local_deliver(skb, protocol); | 204 | raw = raw_local_deliver(skb, protocol); |
| 206 | 205 | ||
| 207 | ipprot = rcu_dereference(inet_protos[protocol]); | 206 | hash = protocol & (MAX_INET_PROTOS - 1); |
| 207 | ipprot = rcu_dereference(inet_protos[hash]); | ||
| 208 | if (ipprot != NULL) { | 208 | if (ipprot != NULL) { |
| 209 | int ret; | 209 | int ret; |
| 210 | 210 | ||
| 211 | if (!net_eq(net, &init_net) && !ipprot->netns_ok) { | 211 | if (!net_eq(net, &init_net) && !ipprot->netns_ok) { |
| 212 | net_info_ratelimited("%s: proto %d isn't netns-ready\n", | 212 | if (net_ratelimit()) |
| 213 | __func__, protocol); | 213 | printk("%s: proto %d isn't netns-ready\n", |
| 214 | __func__, protocol); | ||
| 214 | kfree_skb(skb); | 215 | kfree_skb(skb); |
| 215 | goto out; | 216 | goto out; |
| 216 | } | 217 | } |
| @@ -264,7 +265,7 @@ int ip_local_deliver(struct sk_buff *skb) | |||
| 264 | ip_local_deliver_finish); | 265 | ip_local_deliver_finish); |
| 265 | } | 266 | } |
| 266 | 267 | ||
| 267 | static inline bool ip_rcv_options(struct sk_buff *skb) | 268 | static inline int ip_rcv_options(struct sk_buff *skb) |
| 268 | { | 269 | { |
| 269 | struct ip_options *opt; | 270 | struct ip_options *opt; |
| 270 | const struct iphdr *iph; | 271 | const struct iphdr *iph; |
| @@ -296,10 +297,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb) | |||
| 296 | 297 | ||
| 297 | if (in_dev) { | 298 | if (in_dev) { |
| 298 | if (!IN_DEV_SOURCE_ROUTE(in_dev)) { | 299 | if (!IN_DEV_SOURCE_ROUTE(in_dev)) { |
| 299 | if (IN_DEV_LOG_MARTIANS(in_dev)) | 300 | if (IN_DEV_LOG_MARTIANS(in_dev) && |
| 300 | net_info_ratelimited("source route option %pI4 -> %pI4\n", | 301 | net_ratelimit()) |
| 301 | &iph->saddr, | 302 | printk(KERN_INFO "source route option %pI4 -> %pI4\n", |
| 302 | &iph->daddr); | 303 | &iph->saddr, &iph->daddr); |
| 303 | goto drop; | 304 | goto drop; |
| 304 | } | 305 | } |
| 305 | } | 306 | } |
| @@ -308,40 +309,31 @@ static inline bool ip_rcv_options(struct sk_buff *skb) | |||
| 308 | goto drop; | 309 | goto drop; |
| 309 | } | 310 | } |
| 310 | 311 | ||
| 311 | return false; | 312 | return 0; |
| 312 | drop: | 313 | drop: |
| 313 | return true; | 314 | return -1; |
| 314 | } | 315 | } |
| 315 | 316 | ||
| 316 | int sysctl_ip_early_demux __read_mostly = 1; | ||
| 317 | EXPORT_SYMBOL(sysctl_ip_early_demux); | ||
| 318 | |||
| 319 | static int ip_rcv_finish(struct sk_buff *skb) | 317 | static int ip_rcv_finish(struct sk_buff *skb) |
| 320 | { | 318 | { |
| 321 | const struct iphdr *iph = ip_hdr(skb); | 319 | const struct iphdr *iph = ip_hdr(skb); |
| 322 | struct rtable *rt; | 320 | struct rtable *rt; |
| 323 | 321 | ||
| 324 | if (sysctl_ip_early_demux && !skb_dst(skb)) { | ||
| 325 | const struct net_protocol *ipprot; | ||
| 326 | int protocol = iph->protocol; | ||
| 327 | |||
| 328 | ipprot = rcu_dereference(inet_protos[protocol]); | ||
| 329 | if (ipprot && ipprot->early_demux) { | ||
| 330 | ipprot->early_demux(skb); | ||
| 331 | /* must reload iph, skb->head might have changed */ | ||
| 332 | iph = ip_hdr(skb); | ||
| 333 | } | ||
| 334 | } | ||
| 335 | |||
| 336 | /* | 322 | /* |
| 337 | * Initialise the virtual path cache for the packet. It describes | 323 | * Initialise the virtual path cache for the packet. It describes |
| 338 | * how the packet travels inside Linux networking. | 324 | * how the packet travels inside Linux networking. |
| 339 | */ | 325 | */ |
| 340 | if (!skb_dst(skb)) { | 326 | if (skb_dst(skb) == NULL) { |
| 341 | int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, | 327 | int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, |
| 342 | iph->tos, skb->dev); | 328 | iph->tos, skb->dev); |
| 343 | if (unlikely(err)) { | 329 | if (unlikely(err)) { |
| 344 | if (err == -EXDEV) | 330 | if (err == -EHOSTUNREACH) |
| 331 | IP_INC_STATS_BH(dev_net(skb->dev), | ||
| 332 | IPSTATS_MIB_INADDRERRORS); | ||
| 333 | else if (err == -ENETUNREACH) | ||
| 334 | IP_INC_STATS_BH(dev_net(skb->dev), | ||
| 335 | IPSTATS_MIB_INNOROUTES); | ||
| 336 | else if (err == -EXDEV) | ||
| 345 | NET_INC_STATS_BH(dev_net(skb->dev), | 337 | NET_INC_STATS_BH(dev_net(skb->dev), |
| 346 | LINUX_MIB_IPRPFILTER); | 338 | LINUX_MIB_IPRPFILTER); |
| 347 | goto drop; | 339 | goto drop; |
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index f6289bf6f33..05d20cca9d6 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c | |||
| @@ -9,8 +9,6 @@ | |||
| 9 | * | 9 | * |
| 10 | */ | 10 | */ |
| 11 | 11 | ||
| 12 | #define pr_fmt(fmt) "IPv4: " fmt | ||
| 13 | |||
| 14 | #include <linux/capability.h> | 12 | #include <linux/capability.h> |
| 15 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 16 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
| @@ -27,7 +25,6 @@ | |||
| 27 | #include <net/icmp.h> | 25 | #include <net/icmp.h> |
| 28 | #include <net/route.h> | 26 | #include <net/route.h> |
| 29 | #include <net/cipso_ipv4.h> | 27 | #include <net/cipso_ipv4.h> |
| 30 | #include <net/ip_fib.h> | ||
| 31 | 28 | ||
| 32 | /* | 29 | /* |
| 33 | * Write options to IP header, record destination address to | 30 | * Write options to IP header, record destination address to |
| @@ -93,6 +90,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | |||
| 93 | unsigned char *sptr, *dptr; | 90 | unsigned char *sptr, *dptr; |
| 94 | int soffset, doffset; | 91 | int soffset, doffset; |
| 95 | int optlen; | 92 | int optlen; |
| 93 | __be32 daddr; | ||
| 96 | 94 | ||
| 97 | memset(dopt, 0, sizeof(struct ip_options)); | 95 | memset(dopt, 0, sizeof(struct ip_options)); |
| 98 | 96 | ||
| @@ -104,6 +102,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | |||
| 104 | sptr = skb_network_header(skb); | 102 | sptr = skb_network_header(skb); |
| 105 | dptr = dopt->__data; | 103 | dptr = dopt->__data; |
| 106 | 104 | ||
| 105 | daddr = skb_rtable(skb)->rt_spec_dst; | ||
| 106 | |||
| 107 | if (sopt->rr) { | 107 | if (sopt->rr) { |
| 108 | optlen = sptr[sopt->rr+1]; | 108 | optlen = sptr[sopt->rr+1]; |
| 109 | soffset = sptr[sopt->rr+2]; | 109 | soffset = sptr[sopt->rr+2]; |
| @@ -177,8 +177,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | |||
| 177 | doffset -= 4; | 177 | doffset -= 4; |
| 178 | } | 178 | } |
| 179 | if (doffset > 3) { | 179 | if (doffset > 3) { |
| 180 | __be32 daddr = fib_compute_spec_dst(skb); | ||
| 181 | |||
| 182 | memcpy(&start[doffset-1], &daddr, 4); | 180 | memcpy(&start[doffset-1], &daddr, 4); |
| 183 | dopt->faddr = faddr; | 181 | dopt->faddr = faddr; |
| 184 | dptr[0] = start[0]; | 182 | dptr[0] = start[0]; |
| @@ -210,10 +208,10 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) | |||
| 210 | * Simple and stupid 8), but the most efficient way. | 208 | * Simple and stupid 8), but the most efficient way. |
| 211 | */ | 209 | */ |
| 212 | 210 | ||
| 213 | void ip_options_fragment(struct sk_buff *skb) | 211 | void ip_options_fragment(struct sk_buff * skb) |
| 214 | { | 212 | { |
| 215 | unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); | 213 | unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); |
| 216 | struct ip_options *opt = &(IPCB(skb)->opt); | 214 | struct ip_options * opt = &(IPCB(skb)->opt); |
| 217 | int l = opt->optlen; | 215 | int l = opt->optlen; |
| 218 | int optlen; | 216 | int optlen; |
| 219 | 217 | ||
| @@ -241,15 +239,6 @@ void ip_options_fragment(struct sk_buff *skb) | |||
| 241 | opt->ts_needtime = 0; | 239 | opt->ts_needtime = 0; |
| 242 | } | 240 | } |
| 243 | 241 | ||
| 244 | /* helper used by ip_options_compile() to call fib_compute_spec_dst() | ||
| 245 | * at most one time. | ||
| 246 | */ | ||
| 247 | static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) | ||
| 248 | { | ||
| 249 | if (*spec_dst == htonl(INADDR_ANY)) | ||
| 250 | *spec_dst = fib_compute_spec_dst(skb); | ||
| 251 | } | ||
| 252 | |||
| 253 | /* | 242 | /* |
| 254 | * Verify options and fill pointers in struct options. | 243 | * Verify options and fill pointers in struct options. |
| 255 | * Caller should clear *opt, and set opt->data. | 244 | * Caller should clear *opt, and set opt->data. |
| @@ -257,14 +246,14 @@ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) | |||
| 257 | */ | 246 | */ |
| 258 | 247 | ||
| 259 | int ip_options_compile(struct net *net, | 248 | int ip_options_compile(struct net *net, |
| 260 | struct ip_options *opt, struct sk_buff *skb) | 249 | struct ip_options * opt, struct sk_buff * skb) |
| 261 | { | 250 | { |
| 262 | __be32 spec_dst = htonl(INADDR_ANY); | 251 | int l; |
| 263 | unsigned char *pp_ptr = NULL; | 252 | unsigned char * iph; |
| 253 | unsigned char * optptr; | ||
| 254 | int optlen; | ||
| 255 | unsigned char * pp_ptr = NULL; | ||
| 264 | struct rtable *rt = NULL; | 256 | struct rtable *rt = NULL; |
| 265 | unsigned char *optptr; | ||
| 266 | unsigned char *iph; | ||
| 267 | int optlen, l; | ||
| 268 | 257 | ||
| 269 | if (skb != NULL) { | 258 | if (skb != NULL) { |
| 270 | rt = skb_rtable(skb); | 259 | rt = skb_rtable(skb); |
| @@ -340,8 +329,7 @@ int ip_options_compile(struct net *net, | |||
| 340 | goto error; | 329 | goto error; |
| 341 | } | 330 | } |
| 342 | if (rt) { | 331 | if (rt) { |
| 343 | spec_dst_fill(&spec_dst, skb); | 332 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); |
| 344 | memcpy(&optptr[optptr[2]-1], &spec_dst, 4); | ||
| 345 | opt->is_changed = 1; | 333 | opt->is_changed = 1; |
| 346 | } | 334 | } |
| 347 | optptr[2] += 4; | 335 | optptr[2] += 4; |
| @@ -383,8 +371,7 @@ int ip_options_compile(struct net *net, | |||
| 383 | } | 371 | } |
| 384 | opt->ts = optptr - iph; | 372 | opt->ts = optptr - iph; |
| 385 | if (rt) { | 373 | if (rt) { |
| 386 | spec_dst_fill(&spec_dst, skb); | 374 | memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); |
| 387 | memcpy(&optptr[optptr[2]-1], &spec_dst, 4); | ||
| 388 | timeptr = &optptr[optptr[2]+3]; | 375 | timeptr = &optptr[optptr[2]+3]; |
| 389 | } | 376 | } |
| 390 | opt->ts_needaddr = 1; | 377 | opt->ts_needaddr = 1; |
| @@ -409,7 +396,7 @@ int ip_options_compile(struct net *net, | |||
| 409 | optptr[2] += 8; | 396 | optptr[2] += 8; |
| 410 | break; | 397 | break; |
| 411 | default: | 398 | default: |
| 412 | if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { | 399 | if (!skb && !capable(CAP_NET_RAW)) { |
| 413 | pp_ptr = optptr + 3; | 400 | pp_ptr = optptr + 3; |
| 414 | goto error; | 401 | goto error; |
| 415 | } | 402 | } |
| @@ -424,7 +411,7 @@ int ip_options_compile(struct net *net, | |||
| 424 | opt->is_changed = 1; | 411 | opt->is_changed = 1; |
| 425 | } | 412 | } |
| 426 | } else { | 413 | } else { |
| 427 | unsigned int overflow = optptr[3]>>4; | 414 | unsigned overflow = optptr[3]>>4; |
| 428 | if (overflow == 15) { | 415 | if (overflow == 15) { |
| 429 | pp_ptr = optptr + 3; | 416 | pp_ptr = optptr + 3; |
| 430 | goto error; | 417 | goto error; |
| @@ -445,7 +432,7 @@ int ip_options_compile(struct net *net, | |||
| 445 | opt->router_alert = optptr - iph; | 432 | opt->router_alert = optptr - iph; |
| 446 | break; | 433 | break; |
| 447 | case IPOPT_CIPSO: | 434 | case IPOPT_CIPSO: |
| 448 | if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { | 435 | if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) { |
| 449 | pp_ptr = optptr; | 436 | pp_ptr = optptr; |
| 450 | goto error; | 437 | goto error; |
| 451 | } | 438 | } |
| @@ -458,7 +445,7 @@ int ip_options_compile(struct net *net, | |||
| 458 | case IPOPT_SEC: | 445 | case IPOPT_SEC: |
| 459 | case IPOPT_SID: | 446 | case IPOPT_SID: |
| 460 | default: | 447 | default: |
| 461 | if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { | 448 | if (!skb && !capable(CAP_NET_RAW)) { |
| 462 | pp_ptr = optptr; | 449 | pp_ptr = optptr; |
| 463 | goto error; | 450 | goto error; |
| 464 | } | 451 | } |
| @@ -484,20 +471,20 @@ EXPORT_SYMBOL(ip_options_compile); | |||
| 484 | * Undo all the changes done by ip_options_compile(). | 471 | * Undo all the changes done by ip_options_compile(). |
| 485 | */ | 472 | */ |
| 486 | 473 | ||
| 487 | void ip_options_undo(struct ip_options *opt) | 474 | void ip_options_undo(struct ip_options * opt) |
| 488 | { | 475 | { |
| 489 | if (opt->srr) { | 476 | if (opt->srr) { |
| 490 | unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr); | 477 | unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr); |
| 491 | memmove(optptr+7, optptr+3, optptr[1]-7); | 478 | memmove(optptr+7, optptr+3, optptr[1]-7); |
| 492 | memcpy(optptr+3, &opt->faddr, 4); | 479 | memcpy(optptr+3, &opt->faddr, 4); |
| 493 | } | 480 | } |
| 494 | if (opt->rr_needaddr) { | 481 | if (opt->rr_needaddr) { |
| 495 | unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr); | 482 | unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr); |
| 496 | optptr[2] -= 4; | 483 | optptr[2] -= 4; |
| 497 | memset(&optptr[optptr[2]-1], 0, 4); | 484 | memset(&optptr[optptr[2]-1], 0, 4); |
| 498 | } | 485 | } |
| 499 | if (opt->ts) { | 486 | if (opt->ts) { |
| 500 | unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr); | 487 | unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr); |
| 501 | if (opt->ts_needtime) { | 488 | if (opt->ts_needtime) { |
| 502 | optptr[2] -= 4; | 489 | optptr[2] -= 4; |
| 503 | memset(&optptr[optptr[2]-1], 0, 4); | 490 | memset(&optptr[optptr[2]-1], 0, 4); |
| @@ -560,8 +547,8 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp, | |||
| 560 | 547 | ||
| 561 | void ip_forward_options(struct sk_buff *skb) | 548 | void ip_forward_options(struct sk_buff *skb) |
| 562 | { | 549 | { |
| 563 | struct ip_options *opt = &(IPCB(skb)->opt); | 550 | struct ip_options * opt = &(IPCB(skb)->opt); |
| 564 | unsigned char *optptr; | 551 | unsigned char * optptr; |
| 565 | struct rtable *rt = skb_rtable(skb); | 552 | struct rtable *rt = skb_rtable(skb); |
| 566 | unsigned char *raw = skb_network_header(skb); | 553 | unsigned char *raw = skb_network_header(skb); |
| 567 | 554 | ||
| @@ -581,18 +568,15 @@ void ip_forward_options(struct sk_buff *skb) | |||
| 581 | ) { | 568 | ) { |
| 582 | if (srrptr + 3 > srrspace) | 569 | if (srrptr + 3 > srrspace) |
| 583 | break; | 570 | break; |
| 584 | if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) | 571 | if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0) |
| 585 | break; | 572 | break; |
| 586 | } | 573 | } |
| 587 | if (srrptr + 3 <= srrspace) { | 574 | if (srrptr + 3 <= srrspace) { |
| 588 | opt->is_changed = 1; | 575 | opt->is_changed = 1; |
| 589 | ip_hdr(skb)->daddr = opt->nexthop; | ||
| 590 | ip_rt_get_source(&optptr[srrptr-1], skb, rt); | 576 | ip_rt_get_source(&optptr[srrptr-1], skb, rt); |
| 591 | optptr[2] = srrptr+4; | 577 | optptr[2] = srrptr+4; |
| 592 | } else { | 578 | } else if (net_ratelimit()) |
| 593 | net_crit_ratelimited("%s(): Argh! Destination lost!\n", | 579 | printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); |
| 594 | __func__); | ||
| 595 | } | ||
| 596 | if (opt->ts_needaddr) { | 580 | if (opt->ts_needaddr) { |
| 597 | optptr = raw + opt->ts; | 581 | optptr = raw + opt->ts; |
| 598 | ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); | 582 | ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); |
| @@ -656,7 +640,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) | |||
| 656 | } | 640 | } |
| 657 | if (srrptr <= srrspace) { | 641 | if (srrptr <= srrspace) { |
| 658 | opt->srr_is_hit = 1; | 642 | opt->srr_is_hit = 1; |
| 659 | opt->nexthop = nexthop; | 643 | iph->daddr = nexthop; |
| 660 | opt->is_changed = 1; | 644 | opt->is_changed = 1; |
| 661 | } | 645 | } |
| 662 | return 0; | 646 | return 0; |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3e98ed2bff5..8c6563361ab 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
| @@ -43,6 +43,7 @@ | |||
| 43 | */ | 43 | */ |
| 44 | 44 | ||
| 45 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
| 46 | #include <asm/system.h> | ||
| 46 | #include <linux/module.h> | 47 | #include <linux/module.h> |
| 47 | #include <linux/types.h> | 48 | #include <linux/types.h> |
| 48 | #include <linux/kernel.h> | 49 | #include <linux/kernel.h> |
| @@ -113,6 +114,19 @@ int ip_local_out(struct sk_buff *skb) | |||
| 113 | } | 114 | } |
| 114 | EXPORT_SYMBOL_GPL(ip_local_out); | 115 | EXPORT_SYMBOL_GPL(ip_local_out); |
| 115 | 116 | ||
| 117 | /* dev_loopback_xmit for use with netfilter. */ | ||
| 118 | static int ip_dev_loopback_xmit(struct sk_buff *newskb) | ||
| 119 | { | ||
| 120 | skb_reset_mac_header(newskb); | ||
| 121 | __skb_pull(newskb, skb_network_offset(newskb)); | ||
| 122 | newskb->pkt_type = PACKET_LOOPBACK; | ||
| 123 | newskb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 124 | WARN_ON(!skb_dst(newskb)); | ||
| 125 | skb_dst_force(newskb); | ||
| 126 | netif_rx_ni(newskb); | ||
| 127 | return 0; | ||
| 128 | } | ||
| 129 | |||
| 116 | static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) | 130 | static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) |
| 117 | { | 131 | { |
| 118 | int ttl = inet->uc_ttl; | 132 | int ttl = inet->uc_ttl; |
| @@ -170,7 +184,6 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
| 170 | struct net_device *dev = dst->dev; | 184 | struct net_device *dev = dst->dev; |
| 171 | unsigned int hh_len = LL_RESERVED_SPACE(dev); | 185 | unsigned int hh_len = LL_RESERVED_SPACE(dev); |
| 172 | struct neighbour *neigh; | 186 | struct neighbour *neigh; |
| 173 | u32 nexthop; | ||
| 174 | 187 | ||
| 175 | if (rt->rt_type == RTN_MULTICAST) { | 188 | if (rt->rt_type == RTN_MULTICAST) { |
| 176 | IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); | 189 | IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); |
| @@ -188,25 +201,22 @@ static inline int ip_finish_output2(struct sk_buff *skb) | |||
| 188 | } | 201 | } |
| 189 | if (skb->sk) | 202 | if (skb->sk) |
| 190 | skb_set_owner_w(skb2, skb->sk); | 203 | skb_set_owner_w(skb2, skb->sk); |
| 191 | consume_skb(skb); | 204 | kfree_skb(skb); |
| 192 | skb = skb2; | 205 | skb = skb2; |
| 193 | } | 206 | } |
| 194 | 207 | ||
| 195 | rcu_read_lock_bh(); | 208 | rcu_read_lock(); |
| 196 | nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); | 209 | neigh = dst_get_neighbour(dst); |
| 197 | neigh = __ipv4_neigh_lookup_noref(dev, nexthop); | 210 | if (neigh) { |
| 198 | if (unlikely(!neigh)) | 211 | int res = neigh_output(neigh, skb); |
| 199 | neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); | ||
| 200 | if (!IS_ERR(neigh)) { | ||
| 201 | int res = dst_neigh_output(dst, neigh, skb); | ||
| 202 | 212 | ||
| 203 | rcu_read_unlock_bh(); | 213 | rcu_read_unlock(); |
| 204 | return res; | 214 | return res; |
| 205 | } | 215 | } |
| 206 | rcu_read_unlock_bh(); | 216 | rcu_read_unlock(); |
| 207 | 217 | ||
| 208 | net_dbg_ratelimited("%s: No header cache and no neighbour!\n", | 218 | if (net_ratelimit()) |
| 209 | __func__); | 219 | printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); |
| 210 | kfree_skb(skb); | 220 | kfree_skb(skb); |
| 211 | return -EINVAL; | 221 | return -EINVAL; |
| 212 | } | 222 | } |
| @@ -272,7 +282,7 @@ int ip_mc_output(struct sk_buff *skb) | |||
| 272 | if (newskb) | 282 | if (newskb) |
| 273 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, | 283 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, |
| 274 | newskb, NULL, newskb->dev, | 284 | newskb, NULL, newskb->dev, |
| 275 | dev_loopback_xmit); | 285 | ip_dev_loopback_xmit); |
| 276 | } | 286 | } |
| 277 | 287 | ||
| 278 | /* Multicasts with ttl 0 must not go beyond the host */ | 288 | /* Multicasts with ttl 0 must not go beyond the host */ |
| @@ -287,7 +297,7 @@ int ip_mc_output(struct sk_buff *skb) | |||
| 287 | struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); | 297 | struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); |
| 288 | if (newskb) | 298 | if (newskb) |
| 289 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, | 299 | NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, |
| 290 | NULL, newskb->dev, dev_loopback_xmit); | 300 | NULL, newskb->dev, ip_dev_loopback_xmit); |
| 291 | } | 301 | } |
| 292 | 302 | ||
| 293 | return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, | 303 | return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, |
| @@ -309,20 +319,6 @@ int ip_output(struct sk_buff *skb) | |||
| 309 | !(IPCB(skb)->flags & IPSKB_REROUTED)); | 319 | !(IPCB(skb)->flags & IPSKB_REROUTED)); |
| 310 | } | 320 | } |
| 311 | 321 | ||
| 312 | /* | ||
| 313 | * copy saddr and daddr, possibly using 64bit load/stores | ||
| 314 | * Equivalent to : | ||
| 315 | * iph->saddr = fl4->saddr; | ||
| 316 | * iph->daddr = fl4->daddr; | ||
| 317 | */ | ||
| 318 | static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) | ||
| 319 | { | ||
| 320 | BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != | ||
| 321 | offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); | ||
| 322 | memcpy(&iph->saddr, &fl4->saddr, | ||
| 323 | sizeof(fl4->saddr) + sizeof(fl4->daddr)); | ||
| 324 | } | ||
| 325 | |||
| 326 | int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) | 322 | int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) |
| 327 | { | 323 | { |
| 328 | struct sock *sk = skb->sk; | 324 | struct sock *sk = skb->sk; |
| @@ -371,7 +367,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) | |||
| 371 | skb_dst_set_noref(skb, &rt->dst); | 367 | skb_dst_set_noref(skb, &rt->dst); |
| 372 | 368 | ||
| 373 | packet_routed: | 369 | packet_routed: |
| 374 | if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) | 370 | if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) |
| 375 | goto no_route; | 371 | goto no_route; |
| 376 | 372 | ||
| 377 | /* OK, we know where to send it, allocate and build IP header. */ | 373 | /* OK, we know where to send it, allocate and build IP header. */ |
| @@ -385,8 +381,8 @@ packet_routed: | |||
| 385 | iph->frag_off = 0; | 381 | iph->frag_off = 0; |
| 386 | iph->ttl = ip_select_ttl(inet, &rt->dst); | 382 | iph->ttl = ip_select_ttl(inet, &rt->dst); |
| 387 | iph->protocol = sk->sk_protocol; | 383 | iph->protocol = sk->sk_protocol; |
| 388 | ip_copy_addrs(iph, fl4); | 384 | iph->saddr = fl4->saddr; |
| 389 | 385 | iph->daddr = fl4->daddr; | |
| 390 | /* Transport layer set skb->h.foo itself. */ | 386 | /* Transport layer set skb->h.foo itself. */ |
| 391 | 387 | ||
| 392 | if (inet_opt && inet_opt->opt.optlen) { | 388 | if (inet_opt && inet_opt->opt.optlen) { |
| @@ -467,9 +463,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) | |||
| 467 | 463 | ||
| 468 | iph = ip_hdr(skb); | 464 | iph = ip_hdr(skb); |
| 469 | 465 | ||
| 470 | if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || | 466 | if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { |
| 471 | (IPCB(skb)->frag_max_size && | ||
| 472 | IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) { | ||
| 473 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); | 467 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); |
| 474 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, | 468 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, |
| 475 | htonl(ip_skb_dst_mtu(skb))); | 469 | htonl(ip_skb_dst_mtu(skb))); |
| @@ -595,10 +589,6 @@ slow_path_clean: | |||
| 595 | } | 589 | } |
| 596 | 590 | ||
| 597 | slow_path: | 591 | slow_path: |
| 598 | /* for offloaded checksums cleanup checksum before fragmentation */ | ||
| 599 | if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) | ||
| 600 | goto fail; | ||
| 601 | |||
| 602 | left = skb->len - hlen; /* Space per frame */ | 592 | left = skb->len - hlen; /* Space per frame */ |
| 603 | ptr = hlen; /* Where to start from */ | 593 | ptr = hlen; /* Where to start from */ |
| 604 | 594 | ||
| @@ -706,7 +696,7 @@ slow_path: | |||
| 706 | 696 | ||
| 707 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); | 697 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); |
| 708 | } | 698 | } |
| 709 | consume_skb(skb); | 699 | kfree_skb(skb); |
| 710 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); | 700 | IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); |
| 711 | return err; | 701 | return err; |
| 712 | 702 | ||
| @@ -797,7 +787,6 @@ static int __ip_append_data(struct sock *sk, | |||
| 797 | struct flowi4 *fl4, | 787 | struct flowi4 *fl4, |
| 798 | struct sk_buff_head *queue, | 788 | struct sk_buff_head *queue, |
| 799 | struct inet_cork *cork, | 789 | struct inet_cork *cork, |
| 800 | struct page_frag *pfrag, | ||
| 801 | int getfrag(void *from, char *to, int offset, | 790 | int getfrag(void *from, char *to, int offset, |
| 802 | int len, int odd, struct sk_buff *skb), | 791 | int len, int odd, struct sk_buff *skb), |
| 803 | void *from, int length, int transhdrlen, | 792 | void *from, int length, int transhdrlen, |
| @@ -992,30 +981,46 @@ alloc_new_skb: | |||
| 992 | } | 981 | } |
| 993 | } else { | 982 | } else { |
| 994 | int i = skb_shinfo(skb)->nr_frags; | 983 | int i = skb_shinfo(skb)->nr_frags; |
| 995 | 984 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; | |
| 996 | err = -ENOMEM; | 985 | struct page *page = cork->page; |
| 997 | if (!sk_page_frag_refill(sk, pfrag)) | 986 | int off = cork->off; |
| 998 | goto error; | 987 | unsigned int left; |
| 999 | 988 | ||
| 1000 | if (!skb_can_coalesce(skb, i, pfrag->page, | 989 | if (page && (left = PAGE_SIZE - off) > 0) { |
| 1001 | pfrag->offset)) { | 990 | if (copy >= left) |
| 1002 | err = -EMSGSIZE; | 991 | copy = left; |
| 1003 | if (i == MAX_SKB_FRAGS) | 992 | if (page != frag->page) { |
| 993 | if (i == MAX_SKB_FRAGS) { | ||
| 994 | err = -EMSGSIZE; | ||
| 995 | goto error; | ||
| 996 | } | ||
| 997 | get_page(page); | ||
| 998 | skb_fill_page_desc(skb, i, page, off, 0); | ||
| 999 | frag = &skb_shinfo(skb)->frags[i]; | ||
| 1000 | } | ||
| 1001 | } else if (i < MAX_SKB_FRAGS) { | ||
| 1002 | if (copy > PAGE_SIZE) | ||
| 1003 | copy = PAGE_SIZE; | ||
| 1004 | page = alloc_pages(sk->sk_allocation, 0); | ||
| 1005 | if (page == NULL) { | ||
| 1006 | err = -ENOMEM; | ||
| 1004 | goto error; | 1007 | goto error; |
| 1008 | } | ||
| 1009 | cork->page = page; | ||
| 1010 | cork->off = 0; | ||
| 1005 | 1011 | ||
| 1006 | __skb_fill_page_desc(skb, i, pfrag->page, | 1012 | skb_fill_page_desc(skb, i, page, 0, 0); |
| 1007 | pfrag->offset, 0); | 1013 | frag = &skb_shinfo(skb)->frags[i]; |
| 1008 | skb_shinfo(skb)->nr_frags = ++i; | 1014 | } else { |
| 1009 | get_page(pfrag->page); | 1015 | err = -EMSGSIZE; |
| 1016 | goto error; | ||
| 1010 | } | 1017 | } |
| 1011 | copy = min_t(int, copy, pfrag->size - pfrag->offset); | 1018 | if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { |
| 1012 | if (getfrag(from, | 1019 | err = -EFAULT; |
| 1013 | page_address(pfrag->page) + pfrag->offset, | 1020 | goto error; |
| 1014 | offset, copy, skb->len, skb) < 0) | 1021 | } |
| 1015 | goto error_efault; | 1022 | cork->off += copy; |
| 1016 | 1023 | frag->size += copy; | |
| 1017 | pfrag->offset += copy; | ||
| 1018 | skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); | ||
| 1019 | skb->len += copy; | 1024 | skb->len += copy; |
| 1020 | skb->data_len += copy; | 1025 | skb->data_len += copy; |
| 1021 | skb->truesize += copy; | 1026 | skb->truesize += copy; |
| @@ -1027,8 +1032,6 @@ alloc_new_skb: | |||
| 1027 | 1032 | ||
| 1028 | return 0; | 1033 | return 0; |
| 1029 | 1034 | ||
| 1030 | error_efault: | ||
| 1031 | err = -EFAULT; | ||
| 1032 | error: | 1035 | error: |
| 1033 | cork->length -= length; | 1036 | cork->length -= length; |
| 1034 | IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); | 1037 | IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); |
| @@ -1069,6 +1072,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, | |||
| 1069 | cork->dst = &rt->dst; | 1072 | cork->dst = &rt->dst; |
| 1070 | cork->length = 0; | 1073 | cork->length = 0; |
| 1071 | cork->tx_flags = ipc->tx_flags; | 1074 | cork->tx_flags = ipc->tx_flags; |
| 1075 | cork->page = NULL; | ||
| 1076 | cork->off = 0; | ||
| 1072 | 1077 | ||
| 1073 | return 0; | 1078 | return 0; |
| 1074 | } | 1079 | } |
| @@ -1105,8 +1110,7 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4, | |||
| 1105 | transhdrlen = 0; | 1110 | transhdrlen = 0; |
| 1106 | } | 1111 | } |
| 1107 | 1112 | ||
| 1108 | return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, | 1113 | return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, |
| 1109 | sk_page_frag(sk), getfrag, | ||
| 1110 | from, length, transhdrlen, flags); | 1114 | from, length, transhdrlen, flags); |
| 1111 | } | 1115 | } |
| 1112 | 1116 | ||
| @@ -1225,7 +1229,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, | |||
| 1225 | if (len > size) | 1229 | if (len > size) |
| 1226 | len = size; | 1230 | len = size; |
| 1227 | if (skb_can_coalesce(skb, i, page, offset)) { | 1231 | if (skb_can_coalesce(skb, i, page, offset)) { |
| 1228 | skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); | 1232 | skb_shinfo(skb)->frags[i-1].size += len; |
| 1229 | } else if (i < MAX_SKB_FRAGS) { | 1233 | } else if (i < MAX_SKB_FRAGS) { |
| 1230 | get_page(page); | 1234 | get_page(page); |
| 1231 | skb_fill_page_desc(skb, i, page, offset, len); | 1235 | skb_fill_page_desc(skb, i, page, offset, len); |
| @@ -1329,10 +1333,11 @@ struct sk_buff *__ip_make_skb(struct sock *sk, | |||
| 1329 | iph->ihl = 5; | 1333 | iph->ihl = 5; |
| 1330 | iph->tos = inet->tos; | 1334 | iph->tos = inet->tos; |
| 1331 | iph->frag_off = df; | 1335 | iph->frag_off = df; |
| 1336 | ip_select_ident(iph, &rt->dst, sk); | ||
| 1332 | iph->ttl = ttl; | 1337 | iph->ttl = ttl; |
| 1333 | iph->protocol = sk->sk_protocol; | 1338 | iph->protocol = sk->sk_protocol; |
| 1334 | ip_copy_addrs(iph, fl4); | 1339 | iph->saddr = fl4->saddr; |
| 1335 | ip_select_ident(iph, &rt->dst, sk); | 1340 | iph->daddr = fl4->daddr; |
| 1336 | 1341 | ||
| 1337 | if (opt) { | 1342 | if (opt) { |
| 1338 | iph->ihl += opt->optlen>>2; | 1343 | iph->ihl += opt->optlen>>2; |
| @@ -1357,8 +1362,9 @@ out: | |||
| 1357 | return skb; | 1362 | return skb; |
| 1358 | } | 1363 | } |
| 1359 | 1364 | ||
| 1360 | int ip_send_skb(struct net *net, struct sk_buff *skb) | 1365 | int ip_send_skb(struct sk_buff *skb) |
| 1361 | { | 1366 | { |
| 1367 | struct net *net = sock_net(skb->sk); | ||
| 1362 | int err; | 1368 | int err; |
| 1363 | 1369 | ||
| 1364 | err = ip_local_out(skb); | 1370 | err = ip_local_out(skb); |
| @@ -1381,7 +1387,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) | |||
| 1381 | return 0; | 1387 | return 0; |
| 1382 | 1388 | ||
| 1383 | /* Netfilter gets whole the not fragmented skb. */ | 1389 | /* Netfilter gets whole the not fragmented skb. */ |
| 1384 | return ip_send_skb(sock_net(sk), skb); | 1390 | return ip_send_skb(skb); |
| 1385 | } | 1391 | } |
| 1386 | 1392 | ||
| 1387 | /* | 1393 | /* |
| @@ -1428,8 +1434,7 @@ struct sk_buff *ip_make_skb(struct sock *sk, | |||
| 1428 | if (err) | 1434 | if (err) |
| 1429 | return ERR_PTR(err); | 1435 | return ERR_PTR(err); |
| 1430 | 1436 | ||
| 1431 | err = __ip_append_data(sk, fl4, &queue, &cork, | 1437 | err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, |
| 1432 | &current->task_frag, getfrag, | ||
| 1433 | from, length, transhdrlen, flags); | 1438 | from, length, transhdrlen, flags); |
| 1434 | if (err) { | 1439 | if (err) { |
| 1435 | __ip_flush_pending_frames(sk, &queue, &cork); | 1440 | __ip_flush_pending_frames(sk, &queue, &cork); |
| @@ -1454,34 +1459,19 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, | |||
| 1454 | 1459 | ||
| 1455 | /* | 1460 | /* |
| 1456 | * Generic function to send a packet as reply to another packet. | 1461 | * Generic function to send a packet as reply to another packet. |
| 1457 | * Used to send some TCP resets/acks so far. | 1462 | * Used to send TCP resets so far. ICMP should use this function too. |
| 1458 | * | 1463 | * |
| 1459 | * Use a fake percpu inet socket to avoid false sharing and contention. | 1464 | * Should run single threaded per socket because it uses the sock |
| 1465 | * structure to pass arguments. | ||
| 1460 | */ | 1466 | */ |
| 1461 | static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { | 1467 | void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, |
| 1462 | .sk = { | 1468 | struct ip_reply_arg *arg, unsigned int len) |
| 1463 | .__sk_common = { | ||
| 1464 | .skc_refcnt = ATOMIC_INIT(1), | ||
| 1465 | }, | ||
| 1466 | .sk_wmem_alloc = ATOMIC_INIT(1), | ||
| 1467 | .sk_allocation = GFP_ATOMIC, | ||
| 1468 | .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), | ||
| 1469 | }, | ||
| 1470 | .pmtudisc = IP_PMTUDISC_WANT, | ||
| 1471 | .uc_ttl = -1, | ||
| 1472 | }; | ||
| 1473 | |||
| 1474 | void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, | ||
| 1475 | __be32 saddr, const struct ip_reply_arg *arg, | ||
| 1476 | unsigned int len) | ||
| 1477 | { | 1469 | { |
| 1470 | struct inet_sock *inet = inet_sk(sk); | ||
| 1478 | struct ip_options_data replyopts; | 1471 | struct ip_options_data replyopts; |
| 1479 | struct ipcm_cookie ipc; | 1472 | struct ipcm_cookie ipc; |
| 1480 | struct flowi4 fl4; | 1473 | struct flowi4 fl4; |
| 1481 | struct rtable *rt = skb_rtable(skb); | 1474 | struct rtable *rt = skb_rtable(skb); |
| 1482 | struct sk_buff *nskb; | ||
| 1483 | struct sock *sk; | ||
| 1484 | struct inet_sock *inet; | ||
| 1485 | 1475 | ||
| 1486 | if (ip_options_echo(&replyopts.opt.opt, skb)) | 1476 | if (ip_options_echo(&replyopts.opt.opt, skb)) |
| 1487 | return; | 1477 | return; |
| @@ -1498,41 +1488,39 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, | |||
| 1498 | } | 1488 | } |
| 1499 | 1489 | ||
| 1500 | flowi4_init_output(&fl4, arg->bound_dev_if, 0, | 1490 | flowi4_init_output(&fl4, arg->bound_dev_if, 0, |
| 1501 | RT_TOS(arg->tos), | 1491 | RT_TOS(ip_hdr(skb)->tos), |
| 1502 | RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, | 1492 | RT_SCOPE_UNIVERSE, sk->sk_protocol, |
| 1503 | ip_reply_arg_flowi_flags(arg), | 1493 | ip_reply_arg_flowi_flags(arg), |
| 1504 | daddr, saddr, | 1494 | daddr, rt->rt_spec_dst, |
| 1505 | tcp_hdr(skb)->source, tcp_hdr(skb)->dest); | 1495 | tcp_hdr(skb)->source, tcp_hdr(skb)->dest); |
| 1506 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); | 1496 | security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); |
| 1507 | rt = ip_route_output_key(net, &fl4); | 1497 | rt = ip_route_output_key(sock_net(sk), &fl4); |
| 1508 | if (IS_ERR(rt)) | 1498 | if (IS_ERR(rt)) |
| 1509 | return; | 1499 | return; |
| 1510 | 1500 | ||
| 1511 | inet = &get_cpu_var(unicast_sock); | 1501 | /* And let IP do all the hard work. |
| 1512 | 1502 | ||
| 1513 | inet->tos = arg->tos; | 1503 | This chunk is not reenterable, hence spinlock. |
| 1514 | sk = &inet->sk; | 1504 | Note that it uses the fact, that this function is called |
| 1505 | with locally disabled BH and that sk cannot be already spinlocked. | ||
| 1506 | */ | ||
| 1507 | bh_lock_sock(sk); | ||
| 1508 | inet->tos = ip_hdr(skb)->tos; | ||
| 1515 | sk->sk_priority = skb->priority; | 1509 | sk->sk_priority = skb->priority; |
| 1516 | sk->sk_protocol = ip_hdr(skb)->protocol; | 1510 | sk->sk_protocol = ip_hdr(skb)->protocol; |
| 1517 | sk->sk_bound_dev_if = arg->bound_dev_if; | 1511 | sk->sk_bound_dev_if = arg->bound_dev_if; |
| 1518 | sock_net_set(sk, net); | ||
| 1519 | __skb_queue_head_init(&sk->sk_write_queue); | ||
| 1520 | sk->sk_sndbuf = sysctl_wmem_default; | ||
| 1521 | ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, | 1512 | ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, |
| 1522 | &ipc, &rt, MSG_DONTWAIT); | 1513 | &ipc, &rt, MSG_DONTWAIT); |
| 1523 | nskb = skb_peek(&sk->sk_write_queue); | 1514 | if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { |
| 1524 | if (nskb) { | ||
| 1525 | if (arg->csumoffset >= 0) | 1515 | if (arg->csumoffset >= 0) |
| 1526 | *((__sum16 *)skb_transport_header(nskb) + | 1516 | *((__sum16 *)skb_transport_header(skb) + |
| 1527 | arg->csumoffset) = csum_fold(csum_add(nskb->csum, | 1517 | arg->csumoffset) = csum_fold(csum_add(skb->csum, |
| 1528 | arg->csum)); | 1518 | arg->csum)); |
| 1529 | nskb->ip_summed = CHECKSUM_NONE; | 1519 | skb->ip_summed = CHECKSUM_NONE; |
| 1530 | skb_orphan(nskb); | ||
| 1531 | skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); | ||
| 1532 | ip_push_pending_frames(sk, &fl4); | 1520 | ip_push_pending_frames(sk, &fl4); |
| 1533 | } | 1521 | } |
| 1534 | 1522 | ||
| 1535 | put_cpu_var(unicast_sock); | 1523 | bh_unlock_sock(sk); |
| 1536 | 1524 | ||
| 1537 | ip_rt_put(rt); | 1525 | ip_rt_put(rt); |
| 1538 | } | 1526 | } |
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d9c4f113d70..8905e92f896 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
| @@ -33,14 +33,12 @@ | |||
| 33 | #include <linux/netfilter.h> | 33 | #include <linux/netfilter.h> |
| 34 | #include <linux/route.h> | 34 | #include <linux/route.h> |
| 35 | #include <linux/mroute.h> | 35 | #include <linux/mroute.h> |
| 36 | #include <net/inet_ecn.h> | ||
| 37 | #include <net/route.h> | 36 | #include <net/route.h> |
| 38 | #include <net/xfrm.h> | 37 | #include <net/xfrm.h> |
| 39 | #include <net/compat.h> | 38 | #include <net/compat.h> |
| 40 | #if IS_ENABLED(CONFIG_IPV6) | 39 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 41 | #include <net/transp_v6.h> | 40 | #include <net/transp_v6.h> |
| 42 | #endif | 41 | #endif |
| 43 | #include <net/ip_fib.h> | ||
| 44 | 42 | ||
| 45 | #include <linux/errqueue.h> | 43 | #include <linux/errqueue.h> |
| 46 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
| @@ -56,13 +54,20 @@ | |||
| 56 | /* | 54 | /* |
| 57 | * SOL_IP control messages. | 55 | * SOL_IP control messages. |
| 58 | */ | 56 | */ |
| 59 | #define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb)) | ||
| 60 | 57 | ||
| 61 | static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) | 58 | static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) |
| 62 | { | 59 | { |
| 63 | struct in_pktinfo info = *PKTINFO_SKB_CB(skb); | 60 | struct in_pktinfo info; |
| 61 | struct rtable *rt = skb_rtable(skb); | ||
| 64 | 62 | ||
| 65 | info.ipi_addr.s_addr = ip_hdr(skb)->daddr; | 63 | info.ipi_addr.s_addr = ip_hdr(skb)->daddr; |
| 64 | if (rt) { | ||
| 65 | info.ipi_ifindex = rt->rt_iif; | ||
| 66 | info.ipi_spec_dst.s_addr = rt->rt_spec_dst; | ||
| 67 | } else { | ||
| 68 | info.ipi_ifindex = 0; | ||
| 69 | info.ipi_spec_dst.s_addr = 0; | ||
| 70 | } | ||
| 66 | 71 | ||
| 67 | put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); | 72 | put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); |
| 68 | } | 73 | } |
| @@ -91,7 +96,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) | |||
| 91 | static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) | 96 | static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) |
| 92 | { | 97 | { |
| 93 | unsigned char optbuf[sizeof(struct ip_options) + 40]; | 98 | unsigned char optbuf[sizeof(struct ip_options) + 40]; |
| 94 | struct ip_options *opt = (struct ip_options *)optbuf; | 99 | struct ip_options * opt = (struct ip_options *)optbuf; |
| 95 | 100 | ||
| 96 | if (IPCB(skb)->opt.optlen == 0) | 101 | if (IPCB(skb)->opt.optlen == 0) |
| 97 | return; | 102 | return; |
| @@ -148,7 +153,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) | |||
| 148 | void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) | 153 | void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) |
| 149 | { | 154 | { |
| 150 | struct inet_sock *inet = inet_sk(skb->sk); | 155 | struct inet_sock *inet = inet_sk(skb->sk); |
| 151 | unsigned int flags = inet->cmsg_flags; | 156 | unsigned flags = inet->cmsg_flags; |
| 152 | 157 | ||
| 153 | /* Ordered by supposed usage frequency */ | 158 | /* Ordered by supposed usage frequency */ |
| 154 | if (flags & 1) | 159 | if (flags & 1) |
| @@ -446,6 +451,11 @@ out: | |||
| 446 | } | 451 | } |
| 447 | 452 | ||
| 448 | 453 | ||
| 454 | static void opt_kfree_rcu(struct rcu_head *head) | ||
| 455 | { | ||
| 456 | kfree(container_of(head, struct ip_options_rcu, rcu)); | ||
| 457 | } | ||
| 458 | |||
| 449 | /* | 459 | /* |
| 450 | * Socket option code for IP. This is the end of the line after any | 460 | * Socket option code for IP. This is the end of the line after any |
| 451 | * TCP,UDP etc options on an IP socket. | 461 | * TCP,UDP etc options on an IP socket. |
| @@ -457,28 +467,18 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
| 457 | struct inet_sock *inet = inet_sk(sk); | 467 | struct inet_sock *inet = inet_sk(sk); |
| 458 | int val = 0, err; | 468 | int val = 0, err; |
| 459 | 469 | ||
| 460 | switch (optname) { | 470 | if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | |
| 461 | case IP_PKTINFO: | 471 | (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | |
| 462 | case IP_RECVTTL: | 472 | (1<<IP_RETOPTS) | (1<<IP_TOS) | |
| 463 | case IP_RECVOPTS: | 473 | (1<<IP_TTL) | (1<<IP_HDRINCL) | |
| 464 | case IP_RECVTOS: | 474 | (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | |
| 465 | case IP_RETOPTS: | 475 | (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | |
| 466 | case IP_TOS: | 476 | (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | |
| 467 | case IP_TTL: | 477 | (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) || |
| 468 | case IP_HDRINCL: | 478 | optname == IP_MULTICAST_TTL || |
| 469 | case IP_MTU_DISCOVER: | 479 | optname == IP_MULTICAST_ALL || |
| 470 | case IP_RECVERR: | 480 | optname == IP_MULTICAST_LOOP || |
| 471 | case IP_ROUTER_ALERT: | 481 | optname == IP_RECVORIGDSTADDR) { |
| 472 | case IP_FREEBIND: | ||
| 473 | case IP_PASSSEC: | ||
| 474 | case IP_TRANSPARENT: | ||
| 475 | case IP_MINTTL: | ||
| 476 | case IP_NODEFRAG: | ||
| 477 | case IP_UNICAST_IF: | ||
| 478 | case IP_MULTICAST_TTL: | ||
| 479 | case IP_MULTICAST_ALL: | ||
| 480 | case IP_MULTICAST_LOOP: | ||
| 481 | case IP_RECVORIGDSTADDR: | ||
| 482 | if (optlen >= sizeof(int)) { | 482 | if (optlen >= sizeof(int)) { |
| 483 | if (get_user(val, (int __user *) optval)) | 483 | if (get_user(val, (int __user *) optval)) |
| 484 | return -EFAULT; | 484 | return -EFAULT; |
| @@ -514,7 +514,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
| 514 | sock_owned_by_user(sk)); | 514 | sock_owned_by_user(sk)); |
| 515 | if (inet->is_icsk) { | 515 | if (inet->is_icsk) { |
| 516 | struct inet_connection_sock *icsk = inet_csk(sk); | 516 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 517 | #if IS_ENABLED(CONFIG_IPV6) | 517 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 518 | if (sk->sk_family == PF_INET || | 518 | if (sk->sk_family == PF_INET || |
| 519 | (!((1 << sk->sk_state) & | 519 | (!((1 << sk->sk_state) & |
| 520 | (TCPF_LISTEN | TCPF_CLOSE)) && | 520 | (TCPF_LISTEN | TCPF_CLOSE)) && |
| @@ -525,13 +525,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
| 525 | if (opt) | 525 | if (opt) |
| 526 | icsk->icsk_ext_hdr_len += opt->opt.optlen; | 526 | icsk->icsk_ext_hdr_len += opt->opt.optlen; |
| 527 | icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); | 527 | icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); |
| 528 | #if IS_ENABLED(CONFIG_IPV6) | 528 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 529 | } | 529 | } |
| 530 | #endif | 530 | #endif |
| 531 | } | 531 | } |
| 532 | rcu_assign_pointer(inet->inet_opt, opt); | 532 | rcu_assign_pointer(inet->inet_opt, opt); |
| 533 | if (old) | 533 | if (old) |
| 534 | kfree_rcu(old, rcu); | 534 | call_rcu(&old->rcu, opt_kfree_rcu); |
| 535 | break; | 535 | break; |
| 536 | } | 536 | } |
| 537 | case IP_PKTINFO: | 537 | case IP_PKTINFO: |
| @@ -578,8 +578,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
| 578 | break; | 578 | break; |
| 579 | case IP_TOS: /* This sets both TOS and Precedence */ | 579 | case IP_TOS: /* This sets both TOS and Precedence */ |
| 580 | if (sk->sk_type == SOCK_STREAM) { | 580 | if (sk->sk_type == SOCK_STREAM) { |
| 581 | val &= ~INET_ECN_MASK; | 581 | val &= ~3; |
| 582 | val |= inet->tos & INET_ECN_MASK; | 582 | val |= inet->tos & 3; |
| 583 | } | 583 | } |
| 584 | if (inet->tos != val) { | 584 | if (inet->tos != val) { |
| 585 | inet->tos = val; | 585 | inet->tos = val; |
| @@ -590,7 +590,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
| 590 | case IP_TTL: | 590 | case IP_TTL: |
| 591 | if (optlen < 1) | 591 | if (optlen < 1) |
| 592 | goto e_inval; | 592 | goto e_inval; |
| 593 | if (val != -1 && (val < 1 || val > 255)) | 593 | if (val != -1 && (val < 0 || val > 255)) |
| 594 | goto e_inval; | 594 | goto e_inval; |
| 595 | inet->uc_ttl = val; | 595 | inet->uc_ttl = val; |
| 596 | break; | 596 | break; |
| @@ -634,35 +634,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
| 634 | goto e_inval; | 634 | goto e_inval; |
| 635 | inet->mc_loop = !!val; | 635 | inet->mc_loop = !!val; |
| 636 | break; | 636 | break; |
| 637 | case IP_UNICAST_IF: | ||
| 638 | { | ||
| 639 | struct net_device *dev = NULL; | ||
| 640 | int ifindex; | ||
| 641 | |||
| 642 | if (optlen != sizeof(int)) | ||
| 643 | goto e_inval; | ||
| 644 | |||
| 645 | ifindex = (__force int)ntohl((__force __be32)val); | ||
| 646 | if (ifindex == 0) { | ||
| 647 | inet->uc_index = 0; | ||
| 648 | err = 0; | ||
| 649 | break; | ||
| 650 | } | ||
| 651 | |||
| 652 | dev = dev_get_by_index(sock_net(sk), ifindex); | ||
| 653 | err = -EADDRNOTAVAIL; | ||
| 654 | if (!dev) | ||
| 655 | break; | ||
| 656 | dev_put(dev); | ||
| 657 | |||
| 658 | err = -EINVAL; | ||
| 659 | if (sk->sk_bound_dev_if) | ||
| 660 | break; | ||
| 661 | |||
| 662 | inet->uc_index = ifindex; | ||
| 663 | err = 0; | ||
| 664 | break; | ||
| 665 | } | ||
| 666 | case IP_MULTICAST_IF: | 637 | case IP_MULTICAST_IF: |
| 667 | { | 638 | { |
| 668 | struct ip_mreqn mreq; | 639 | struct ip_mreqn mreq; |
| @@ -683,15 +654,10 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
| 683 | break; | 654 | break; |
| 684 | } else { | 655 | } else { |
| 685 | memset(&mreq, 0, sizeof(mreq)); | 656 | memset(&mreq, 0, sizeof(mreq)); |
| 686 | if (optlen >= sizeof(struct ip_mreq)) { | 657 | if (optlen >= sizeof(struct in_addr) && |
| 687 | if (copy_from_user(&mreq, optval, | 658 | copy_from_user(&mreq.imr_address, optval, |
| 688 | sizeof(struct ip_mreq))) | 659 | sizeof(struct in_addr))) |
| 689 | break; | 660 | break; |
| 690 | } else if (optlen >= sizeof(struct in_addr)) { | ||
| 691 | if (copy_from_user(&mreq.imr_address, optval, | ||
| 692 | sizeof(struct in_addr))) | ||
| 693 | break; | ||
| 694 | } | ||
| 695 | } | 661 | } |
| 696 | 662 | ||
| 697 | if (!mreq.imr_ifindex) { | 663 | if (!mreq.imr_ifindex) { |
| @@ -989,14 +955,13 @@ mc_msf_out: | |||
| 989 | case IP_IPSEC_POLICY: | 955 | case IP_IPSEC_POLICY: |
| 990 | case IP_XFRM_POLICY: | 956 | case IP_XFRM_POLICY: |
| 991 | err = -EPERM; | 957 | err = -EPERM; |
| 992 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 958 | if (!capable(CAP_NET_ADMIN)) |
| 993 | break; | 959 | break; |
| 994 | err = xfrm_user_policy(sk, optname, optval, optlen); | 960 | err = xfrm_user_policy(sk, optname, optval, optlen); |
| 995 | break; | 961 | break; |
| 996 | 962 | ||
| 997 | case IP_TRANSPARENT: | 963 | case IP_TRANSPARENT: |
| 998 | if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && | 964 | if (!capable(CAP_NET_ADMIN)) { |
| 999 | !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { | ||
| 1000 | err = -EPERM; | 965 | err = -EPERM; |
| 1001 | break; | 966 | break; |
| 1002 | } | 967 | } |
| @@ -1026,27 +991,20 @@ e_inval: | |||
| 1026 | } | 991 | } |
| 1027 | 992 | ||
| 1028 | /** | 993 | /** |
| 1029 | * ipv4_pktinfo_prepare - transfert some info from rtable to skb | 994 | * ip_queue_rcv_skb - Queue an skb into sock receive queue |
| 1030 | * @sk: socket | 995 | * @sk: socket |
| 1031 | * @skb: buffer | 996 | * @skb: buffer |
| 1032 | * | 997 | * |
| 1033 | * To support IP_CMSG_PKTINFO option, we store rt_iif and specific | 998 | * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option |
| 1034 | * destination in skb->cb[] before dst drop. | 999 | * is not set, we drop skb dst entry now, while dst cache line is hot. |
| 1035 | * This way, receiver doesnt make cache line misses to read rtable. | ||
| 1036 | */ | 1000 | */ |
| 1037 | void ipv4_pktinfo_prepare(struct sk_buff *skb) | 1001 | int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) |
| 1038 | { | 1002 | { |
| 1039 | struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); | 1003 | if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)) |
| 1040 | 1004 | skb_dst_drop(skb); | |
| 1041 | if (skb_rtable(skb)) { | 1005 | return sock_queue_rcv_skb(sk, skb); |
| 1042 | pktinfo->ipi_ifindex = inet_iif(skb); | ||
| 1043 | pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); | ||
| 1044 | } else { | ||
| 1045 | pktinfo->ipi_ifindex = 0; | ||
| 1046 | pktinfo->ipi_spec_dst.s_addr = 0; | ||
| 1047 | } | ||
| 1048 | skb_dst_drop(skb); | ||
| 1049 | } | 1006 | } |
| 1007 | EXPORT_SYMBOL(ip_queue_rcv_skb); | ||
| 1050 | 1008 | ||
| 1051 | int ip_setsockopt(struct sock *sk, int level, | 1009 | int ip_setsockopt(struct sock *sk, int level, |
| 1052 | int optname, char __user *optval, unsigned int optlen) | 1010 | int optname, char __user *optval, unsigned int optlen) |
| @@ -1109,7 +1067,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt); | |||
| 1109 | */ | 1067 | */ |
| 1110 | 1068 | ||
| 1111 | static int do_ip_getsockopt(struct sock *sk, int level, int optname, | 1069 | static int do_ip_getsockopt(struct sock *sk, int level, int optname, |
| 1112 | char __user *optval, int __user *optlen, unsigned int flags) | 1070 | char __user *optval, int __user *optlen, unsigned flags) |
| 1113 | { | 1071 | { |
| 1114 | struct inet_sock *inet = inet_sk(sk); | 1072 | struct inet_sock *inet = inet_sk(sk); |
| 1115 | int val; | 1073 | int val; |
| @@ -1218,9 +1176,6 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, | |||
| 1218 | case IP_MULTICAST_LOOP: | 1176 | case IP_MULTICAST_LOOP: |
| 1219 | val = inet->mc_loop; | 1177 | val = inet->mc_loop; |
| 1220 | break; | 1178 | break; |
| 1221 | case IP_UNICAST_IF: | ||
| 1222 | val = (__force int)htonl((__u32) inet->uc_index); | ||
| 1223 | break; | ||
| 1224 | case IP_MULTICAST_IF: | 1179 | case IP_MULTICAST_IF: |
| 1225 | { | 1180 | { |
| 1226 | struct in_addr addr; | 1181 | struct in_addr addr; |
| @@ -1299,10 +1254,6 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, | |||
| 1299 | int hlim = inet->mc_ttl; | 1254 | int hlim = inet->mc_ttl; |
| 1300 | put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); | 1255 | put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); |
| 1301 | } | 1256 | } |
| 1302 | if (inet->cmsg_flags & IP_CMSG_TOS) { | ||
| 1303 | int tos = inet->rcv_tos; | ||
| 1304 | put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); | ||
| 1305 | } | ||
| 1306 | len -= msg.msg_controllen; | 1257 | len -= msg.msg_controllen; |
| 1307 | return put_user(len, optlen); | 1258 | return put_user(len, optlen); |
| 1308 | } | 1259 | } |
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c deleted file mode 100644 index c3a4233c0ac..00000000000 --- a/net/ipv4/ip_vti.c +++ /dev/null | |||
| @@ -1,942 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Linux NET3: IP/IP protocol decoder modified to support | ||
| 3 | * virtual tunnel interface | ||
| 4 | * | ||
| 5 | * Authors: | ||
| 6 | * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 | ||
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public License | ||
| 10 | * as published by the Free Software Foundation; either version | ||
| 11 | * 2 of the License, or (at your option) any later version. | ||
| 12 | * | ||
| 13 | */ | ||
| 14 | |||
| 15 | /* | ||
| 16 | This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c | ||
| 17 | |||
| 18 | For comments look at net/ipv4/ip_gre.c --ANK | ||
| 19 | */ | ||
| 20 | |||
| 21 | |||
| 22 | #include <linux/capability.h> | ||
| 23 | #include <linux/module.h> | ||
| 24 | #include <linux/types.h> | ||
| 25 | #include <linux/kernel.h> | ||
| 26 | #include <linux/uaccess.h> | ||
| 27 | #include <linux/skbuff.h> | ||
| 28 | #include <linux/netdevice.h> | ||
| 29 | #include <linux/in.h> | ||
| 30 | #include <linux/tcp.h> | ||
| 31 | #include <linux/udp.h> | ||
| 32 | #include <linux/if_arp.h> | ||
| 33 | #include <linux/mroute.h> | ||
| 34 | #include <linux/init.h> | ||
| 35 | #include <linux/netfilter_ipv4.h> | ||
| 36 | #include <linux/if_ether.h> | ||
| 37 | |||
| 38 | #include <net/sock.h> | ||
| 39 | #include <net/ip.h> | ||
| 40 | #include <net/icmp.h> | ||
| 41 | #include <net/ipip.h> | ||
| 42 | #include <net/inet_ecn.h> | ||
| 43 | #include <net/xfrm.h> | ||
| 44 | #include <net/net_namespace.h> | ||
| 45 | #include <net/netns/generic.h> | ||
| 46 | |||
/*
 * Number of buckets in each hashed tunnel table.  HASH() masks with
 * (HASH_SIZE - 1), so this must stay a power of two.
 */
#define HASH_SIZE  16

/*
 * Fold an IPv4 address (__be32) into a bucket index in [0, HASH_SIZE).
 * The argument is fully parenthesized so that an expression argument
 * binds correctly against the cast and the shift.  Note that it is
 * evaluated twice: do not pass an expression with side effects.
 */
#define HASH(addr) \
	((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))
| 49 | |||
/* Forward declaration; the initialized definition is at the end of the file. */
static struct rtnl_link_ops vti_link_ops __read_mostly;

/* Key handed to net_generic() to fetch this module's per-namespace state. */
static int vti_net_id __read_mostly;

/*
 * Per network namespace tunnel tables.
 *
 * Tunnels are chained through ip_tunnel::next in one of four tables,
 * chosen by which endpoint addresses are configured (see __vti_bucket()).
 */
struct vti_net {
	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];	/* remote and local set */
	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];	/* only remote set */
	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];	/* only local set */
	struct ip_tunnel __rcu *tunnels_wc[1];		/* neither set (wildcard) */
	/* Index table over the four arrays above; filled in by vti_init_net(). */
	struct ip_tunnel __rcu **tunnels[4];

	/* Fallback device ("ip_vti0") created by vti_init_net(). */
	struct net_device *fb_tunnel_dev;
};
| 62 | |||
/* Forward declarations for functions referenced before their definition. */
static int vti_fb_tunnel_init(struct net_device *dev);
static int vti_tunnel_init(struct net_device *dev);
static void vti_tunnel_setup(struct net_device *dev);
static void vti_dev_free(struct net_device *dev);
static int vti_tunnel_bind_dev(struct net_device *dev);
| 68 | |||
/*
 * Hand the packet to dst_output() and account for the result.
 *
 * NOTE: this macro is not hygienic - it uses a variable named `skb`
 * from the caller's scope.
 *
 * @stats1: per-cpu counters (struct with syncp/tx_bytes/tx_packets);
 *          updated on success, under the u64_stats sequence.
 * @stats2: error counters (tx_errors/tx_aborted_errors); updated when
 *          net_xmit_eval() reports a failure.
 *
 * The packet length is sampled before dst_output() because the skb must
 * not be touched after it has been handed off.
 */
#define VTI_XMIT(stats1, stats2) do {				\
	int err;						\
	int pkt_len = skb->len;					\
	err = dst_output(skb);					\
	if (net_xmit_eval(err) == 0) {				\
		u64_stats_update_begin(&(stats1)->syncp);	\
		(stats1)->tx_bytes += pkt_len;			\
		(stats1)->tx_packets++;				\
		u64_stats_update_end(&(stats1)->syncp);		\
	} else {						\
		(stats2)->tx_errors++;				\
		(stats2)->tx_aborted_errors++;			\
	}							\
} while (0)
| 83 | |||
| 84 | |||
| 85 | static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev, | ||
| 86 | struct rtnl_link_stats64 *tot) | ||
| 87 | { | ||
| 88 | int i; | ||
| 89 | |||
| 90 | for_each_possible_cpu(i) { | ||
| 91 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | ||
| 92 | u64 rx_packets, rx_bytes, tx_packets, tx_bytes; | ||
| 93 | unsigned int start; | ||
| 94 | |||
| 95 | do { | ||
| 96 | start = u64_stats_fetch_begin_bh(&tstats->syncp); | ||
| 97 | rx_packets = tstats->rx_packets; | ||
| 98 | tx_packets = tstats->tx_packets; | ||
| 99 | rx_bytes = tstats->rx_bytes; | ||
| 100 | tx_bytes = tstats->tx_bytes; | ||
| 101 | } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); | ||
| 102 | |||
| 103 | tot->rx_packets += rx_packets; | ||
| 104 | tot->tx_packets += tx_packets; | ||
| 105 | tot->rx_bytes += rx_bytes; | ||
| 106 | tot->tx_bytes += tx_bytes; | ||
| 107 | } | ||
| 108 | |||
| 109 | tot->multicast = dev->stats.multicast; | ||
| 110 | tot->rx_crc_errors = dev->stats.rx_crc_errors; | ||
| 111 | tot->rx_fifo_errors = dev->stats.rx_fifo_errors; | ||
| 112 | tot->rx_length_errors = dev->stats.rx_length_errors; | ||
| 113 | tot->rx_errors = dev->stats.rx_errors; | ||
| 114 | tot->tx_fifo_errors = dev->stats.tx_fifo_errors; | ||
| 115 | tot->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
| 116 | tot->tx_dropped = dev->stats.tx_dropped; | ||
| 117 | tot->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
| 118 | tot->tx_errors = dev->stats.tx_errors; | ||
| 119 | |||
| 120 | return tot; | ||
| 121 | } | ||
| 122 | |||
| 123 | static struct ip_tunnel *vti_tunnel_lookup(struct net *net, | ||
| 124 | __be32 remote, __be32 local) | ||
| 125 | { | ||
| 126 | unsigned h0 = HASH(remote); | ||
| 127 | unsigned h1 = HASH(local); | ||
| 128 | struct ip_tunnel *t; | ||
| 129 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 130 | |||
| 131 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) | ||
| 132 | if (local == t->parms.iph.saddr && | ||
| 133 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
| 134 | return t; | ||
| 135 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) | ||
| 136 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | ||
| 137 | return t; | ||
| 138 | |||
| 139 | for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) | ||
| 140 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) | ||
| 141 | return t; | ||
| 142 | |||
| 143 | for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0]) | ||
| 144 | if (t && (t->dev->flags&IFF_UP)) | ||
| 145 | return t; | ||
| 146 | return NULL; | ||
| 147 | } | ||
| 148 | |||
| 149 | static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn, | ||
| 150 | struct ip_tunnel_parm *parms) | ||
| 151 | { | ||
| 152 | __be32 remote = parms->iph.daddr; | ||
| 153 | __be32 local = parms->iph.saddr; | ||
| 154 | unsigned h = 0; | ||
| 155 | int prio = 0; | ||
| 156 | |||
| 157 | if (remote) { | ||
| 158 | prio |= 2; | ||
| 159 | h ^= HASH(remote); | ||
| 160 | } | ||
| 161 | if (local) { | ||
| 162 | prio |= 1; | ||
| 163 | h ^= HASH(local); | ||
| 164 | } | ||
| 165 | return &ipn->tunnels[prio][h]; | ||
| 166 | } | ||
| 167 | |||
| 168 | static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn, | ||
| 169 | struct ip_tunnel *t) | ||
| 170 | { | ||
| 171 | return __vti_bucket(ipn, &t->parms); | ||
| 172 | } | ||
| 173 | |||
| 174 | static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t) | ||
| 175 | { | ||
| 176 | struct ip_tunnel __rcu **tp; | ||
| 177 | struct ip_tunnel *iter; | ||
| 178 | |||
| 179 | for (tp = vti_bucket(ipn, t); | ||
| 180 | (iter = rtnl_dereference(*tp)) != NULL; | ||
| 181 | tp = &iter->next) { | ||
| 182 | if (t == iter) { | ||
| 183 | rcu_assign_pointer(*tp, t->next); | ||
| 184 | break; | ||
| 185 | } | ||
| 186 | } | ||
| 187 | } | ||
| 188 | |||
| 189 | static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t) | ||
| 190 | { | ||
| 191 | struct ip_tunnel __rcu **tp = vti_bucket(ipn, t); | ||
| 192 | |||
| 193 | rcu_assign_pointer(t->next, rtnl_dereference(*tp)); | ||
| 194 | rcu_assign_pointer(*tp, t); | ||
| 195 | } | ||
| 196 | |||
| 197 | static struct ip_tunnel *vti_tunnel_locate(struct net *net, | ||
| 198 | struct ip_tunnel_parm *parms, | ||
| 199 | int create) | ||
| 200 | { | ||
| 201 | __be32 remote = parms->iph.daddr; | ||
| 202 | __be32 local = parms->iph.saddr; | ||
| 203 | struct ip_tunnel *t, *nt; | ||
| 204 | struct ip_tunnel __rcu **tp; | ||
| 205 | struct net_device *dev; | ||
| 206 | char name[IFNAMSIZ]; | ||
| 207 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 208 | |||
| 209 | for (tp = __vti_bucket(ipn, parms); | ||
| 210 | (t = rtnl_dereference(*tp)) != NULL; | ||
| 211 | tp = &t->next) { | ||
| 212 | if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) | ||
| 213 | return t; | ||
| 214 | } | ||
| 215 | if (!create) | ||
| 216 | return NULL; | ||
| 217 | |||
| 218 | if (parms->name[0]) | ||
| 219 | strlcpy(name, parms->name, IFNAMSIZ); | ||
| 220 | else | ||
| 221 | strcpy(name, "vti%d"); | ||
| 222 | |||
| 223 | dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup); | ||
| 224 | if (dev == NULL) | ||
| 225 | return NULL; | ||
| 226 | |||
| 227 | dev_net_set(dev, net); | ||
| 228 | |||
| 229 | nt = netdev_priv(dev); | ||
| 230 | nt->parms = *parms; | ||
| 231 | dev->rtnl_link_ops = &vti_link_ops; | ||
| 232 | |||
| 233 | vti_tunnel_bind_dev(dev); | ||
| 234 | |||
| 235 | if (register_netdevice(dev) < 0) | ||
| 236 | goto failed_free; | ||
| 237 | |||
| 238 | dev_hold(dev); | ||
| 239 | vti_tunnel_link(ipn, nt); | ||
| 240 | return nt; | ||
| 241 | |||
| 242 | failed_free: | ||
| 243 | free_netdev(dev); | ||
| 244 | return NULL; | ||
| 245 | } | ||
| 246 | |||
| 247 | static void vti_tunnel_uninit(struct net_device *dev) | ||
| 248 | { | ||
| 249 | struct net *net = dev_net(dev); | ||
| 250 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 251 | |||
| 252 | vti_tunnel_unlink(ipn, netdev_priv(dev)); | ||
| 253 | dev_put(dev); | ||
| 254 | } | ||
| 255 | |||
| 256 | static int vti_err(struct sk_buff *skb, u32 info) | ||
| 257 | { | ||
| 258 | |||
| 259 | /* All the routers (except for Linux) return only | ||
| 260 | * 8 bytes of packet payload. It means, that precise relaying of | ||
| 261 | * ICMP in the real Internet is absolutely infeasible. | ||
| 262 | */ | ||
| 263 | struct iphdr *iph = (struct iphdr *)skb->data; | ||
| 264 | const int type = icmp_hdr(skb)->type; | ||
| 265 | const int code = icmp_hdr(skb)->code; | ||
| 266 | struct ip_tunnel *t; | ||
| 267 | int err; | ||
| 268 | |||
| 269 | switch (type) { | ||
| 270 | default: | ||
| 271 | case ICMP_PARAMETERPROB: | ||
| 272 | return 0; | ||
| 273 | |||
| 274 | case ICMP_DEST_UNREACH: | ||
| 275 | switch (code) { | ||
| 276 | case ICMP_SR_FAILED: | ||
| 277 | case ICMP_PORT_UNREACH: | ||
| 278 | /* Impossible event. */ | ||
| 279 | return 0; | ||
| 280 | default: | ||
| 281 | /* All others are translated to HOST_UNREACH. */ | ||
| 282 | break; | ||
| 283 | } | ||
| 284 | break; | ||
| 285 | case ICMP_TIME_EXCEEDED: | ||
| 286 | if (code != ICMP_EXC_TTL) | ||
| 287 | return 0; | ||
| 288 | break; | ||
| 289 | } | ||
| 290 | |||
| 291 | err = -ENOENT; | ||
| 292 | |||
| 293 | t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | ||
| 294 | if (t == NULL) | ||
| 295 | goto out; | ||
| 296 | |||
| 297 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { | ||
| 298 | ipv4_update_pmtu(skb, dev_net(skb->dev), info, | ||
| 299 | t->parms.link, 0, IPPROTO_IPIP, 0); | ||
| 300 | err = 0; | ||
| 301 | goto out; | ||
| 302 | } | ||
| 303 | |||
| 304 | err = 0; | ||
| 305 | if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) | ||
| 306 | goto out; | ||
| 307 | |||
| 308 | if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) | ||
| 309 | t->err_count++; | ||
| 310 | else | ||
| 311 | t->err_count = 1; | ||
| 312 | t->err_time = jiffies; | ||
| 313 | out: | ||
| 314 | return err; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* We dont digest the packet therefore let the packet pass */ | ||
| 318 | static int vti_rcv(struct sk_buff *skb) | ||
| 319 | { | ||
| 320 | struct ip_tunnel *tunnel; | ||
| 321 | const struct iphdr *iph = ip_hdr(skb); | ||
| 322 | |||
| 323 | tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); | ||
| 324 | if (tunnel != NULL) { | ||
| 325 | struct pcpu_tstats *tstats; | ||
| 326 | |||
| 327 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) | ||
| 328 | return -1; | ||
| 329 | |||
| 330 | tstats = this_cpu_ptr(tunnel->dev->tstats); | ||
| 331 | u64_stats_update_begin(&tstats->syncp); | ||
| 332 | tstats->rx_packets++; | ||
| 333 | tstats->rx_bytes += skb->len; | ||
| 334 | u64_stats_update_end(&tstats->syncp); | ||
| 335 | |||
| 336 | skb->mark = 0; | ||
| 337 | secpath_reset(skb); | ||
| 338 | skb->dev = tunnel->dev; | ||
| 339 | return 1; | ||
| 340 | } | ||
| 341 | |||
| 342 | return -1; | ||
| 343 | } | ||
| 344 | |||
| 345 | /* This function assumes it is being called from dev_queue_xmit() | ||
| 346 | * and that skb is filled properly by that function. | ||
| 347 | */ | ||
| 348 | |||
| 349 | static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | ||
| 350 | { | ||
| 351 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
| 352 | struct pcpu_tstats *tstats; | ||
| 353 | struct iphdr *tiph = &tunnel->parms.iph; | ||
| 354 | u8 tos; | ||
| 355 | struct rtable *rt; /* Route to the other host */ | ||
| 356 | struct net_device *tdev; /* Device to other host */ | ||
| 357 | struct iphdr *old_iph = ip_hdr(skb); | ||
| 358 | __be32 dst = tiph->daddr; | ||
| 359 | struct flowi4 fl4; | ||
| 360 | |||
| 361 | if (skb->protocol != htons(ETH_P_IP)) | ||
| 362 | goto tx_error; | ||
| 363 | |||
| 364 | tos = old_iph->tos; | ||
| 365 | |||
| 366 | memset(&fl4, 0, sizeof(fl4)); | ||
| 367 | flowi4_init_output(&fl4, tunnel->parms.link, | ||
| 368 | be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos), | ||
| 369 | RT_SCOPE_UNIVERSE, | ||
| 370 | IPPROTO_IPIP, 0, | ||
| 371 | dst, tiph->saddr, 0, 0); | ||
| 372 | rt = ip_route_output_key(dev_net(dev), &fl4); | ||
| 373 | if (IS_ERR(rt)) { | ||
| 374 | dev->stats.tx_carrier_errors++; | ||
| 375 | goto tx_error_icmp; | ||
| 376 | } | ||
| 377 | /* if there is no transform then this tunnel is not functional. | ||
| 378 | * Or if the xfrm is not mode tunnel. | ||
| 379 | */ | ||
| 380 | if (!rt->dst.xfrm || | ||
| 381 | rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) { | ||
| 382 | dev->stats.tx_carrier_errors++; | ||
| 383 | goto tx_error_icmp; | ||
| 384 | } | ||
| 385 | tdev = rt->dst.dev; | ||
| 386 | |||
| 387 | if (tdev == dev) { | ||
| 388 | ip_rt_put(rt); | ||
| 389 | dev->stats.collisions++; | ||
| 390 | goto tx_error; | ||
| 391 | } | ||
| 392 | |||
| 393 | if (tunnel->err_count > 0) { | ||
| 394 | if (time_before(jiffies, | ||
| 395 | tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { | ||
| 396 | tunnel->err_count--; | ||
| 397 | dst_link_failure(skb); | ||
| 398 | } else | ||
| 399 | tunnel->err_count = 0; | ||
| 400 | } | ||
| 401 | |||
| 402 | IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | | ||
| 403 | IPSKB_REROUTED); | ||
| 404 | skb_dst_drop(skb); | ||
| 405 | skb_dst_set(skb, &rt->dst); | ||
| 406 | nf_reset(skb); | ||
| 407 | skb->dev = skb_dst(skb)->dev; | ||
| 408 | |||
| 409 | tstats = this_cpu_ptr(dev->tstats); | ||
| 410 | VTI_XMIT(tstats, &dev->stats); | ||
| 411 | return NETDEV_TX_OK; | ||
| 412 | |||
| 413 | tx_error_icmp: | ||
| 414 | dst_link_failure(skb); | ||
| 415 | tx_error: | ||
| 416 | dev->stats.tx_errors++; | ||
| 417 | dev_kfree_skb(skb); | ||
| 418 | return NETDEV_TX_OK; | ||
| 419 | } | ||
| 420 | |||
| 421 | static int vti_tunnel_bind_dev(struct net_device *dev) | ||
| 422 | { | ||
| 423 | struct net_device *tdev = NULL; | ||
| 424 | struct ip_tunnel *tunnel; | ||
| 425 | struct iphdr *iph; | ||
| 426 | |||
| 427 | tunnel = netdev_priv(dev); | ||
| 428 | iph = &tunnel->parms.iph; | ||
| 429 | |||
| 430 | if (iph->daddr) { | ||
| 431 | struct rtable *rt; | ||
| 432 | struct flowi4 fl4; | ||
| 433 | memset(&fl4, 0, sizeof(fl4)); | ||
| 434 | flowi4_init_output(&fl4, tunnel->parms.link, | ||
| 435 | be32_to_cpu(tunnel->parms.i_key), | ||
| 436 | RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, | ||
| 437 | IPPROTO_IPIP, 0, | ||
| 438 | iph->daddr, iph->saddr, 0, 0); | ||
| 439 | rt = ip_route_output_key(dev_net(dev), &fl4); | ||
| 440 | if (!IS_ERR(rt)) { | ||
| 441 | tdev = rt->dst.dev; | ||
| 442 | ip_rt_put(rt); | ||
| 443 | } | ||
| 444 | dev->flags |= IFF_POINTOPOINT; | ||
| 445 | } | ||
| 446 | |||
| 447 | if (!tdev && tunnel->parms.link) | ||
| 448 | tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); | ||
| 449 | |||
| 450 | if (tdev) { | ||
| 451 | dev->hard_header_len = tdev->hard_header_len + | ||
| 452 | sizeof(struct iphdr); | ||
| 453 | dev->mtu = tdev->mtu; | ||
| 454 | } | ||
| 455 | dev->iflink = tunnel->parms.link; | ||
| 456 | return dev->mtu; | ||
| 457 | } | ||
| 458 | |||
| 459 | static int | ||
| 460 | vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) | ||
| 461 | { | ||
| 462 | int err = 0; | ||
| 463 | struct ip_tunnel_parm p; | ||
| 464 | struct ip_tunnel *t; | ||
| 465 | struct net *net = dev_net(dev); | ||
| 466 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 467 | |||
| 468 | switch (cmd) { | ||
| 469 | case SIOCGETTUNNEL: | ||
| 470 | t = NULL; | ||
| 471 | if (dev == ipn->fb_tunnel_dev) { | ||
| 472 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, | ||
| 473 | sizeof(p))) { | ||
| 474 | err = -EFAULT; | ||
| 475 | break; | ||
| 476 | } | ||
| 477 | t = vti_tunnel_locate(net, &p, 0); | ||
| 478 | } | ||
| 479 | if (t == NULL) | ||
| 480 | t = netdev_priv(dev); | ||
| 481 | memcpy(&p, &t->parms, sizeof(p)); | ||
| 482 | p.i_flags |= GRE_KEY | VTI_ISVTI; | ||
| 483 | p.o_flags |= GRE_KEY; | ||
| 484 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) | ||
| 485 | err = -EFAULT; | ||
| 486 | break; | ||
| 487 | |||
| 488 | case SIOCADDTUNNEL: | ||
| 489 | case SIOCCHGTUNNEL: | ||
| 490 | err = -EPERM; | ||
| 491 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
| 492 | goto done; | ||
| 493 | |||
| 494 | err = -EFAULT; | ||
| 495 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) | ||
| 496 | goto done; | ||
| 497 | |||
| 498 | err = -EINVAL; | ||
| 499 | if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || | ||
| 500 | p.iph.ihl != 5) | ||
| 501 | goto done; | ||
| 502 | |||
| 503 | t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); | ||
| 504 | |||
| 505 | if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { | ||
| 506 | if (t != NULL) { | ||
| 507 | if (t->dev != dev) { | ||
| 508 | err = -EEXIST; | ||
| 509 | break; | ||
| 510 | } | ||
| 511 | } else { | ||
| 512 | if (((dev->flags&IFF_POINTOPOINT) && | ||
| 513 | !p.iph.daddr) || | ||
| 514 | (!(dev->flags&IFF_POINTOPOINT) && | ||
| 515 | p.iph.daddr)) { | ||
| 516 | err = -EINVAL; | ||
| 517 | break; | ||
| 518 | } | ||
| 519 | t = netdev_priv(dev); | ||
| 520 | vti_tunnel_unlink(ipn, t); | ||
| 521 | synchronize_net(); | ||
| 522 | t->parms.iph.saddr = p.iph.saddr; | ||
| 523 | t->parms.iph.daddr = p.iph.daddr; | ||
| 524 | t->parms.i_key = p.i_key; | ||
| 525 | t->parms.o_key = p.o_key; | ||
| 526 | t->parms.iph.protocol = IPPROTO_IPIP; | ||
| 527 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
| 528 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
| 529 | vti_tunnel_link(ipn, t); | ||
| 530 | netdev_state_change(dev); | ||
| 531 | } | ||
| 532 | } | ||
| 533 | |||
| 534 | if (t) { | ||
| 535 | err = 0; | ||
| 536 | if (cmd == SIOCCHGTUNNEL) { | ||
| 537 | t->parms.i_key = p.i_key; | ||
| 538 | t->parms.o_key = p.o_key; | ||
| 539 | if (t->parms.link != p.link) { | ||
| 540 | t->parms.link = p.link; | ||
| 541 | vti_tunnel_bind_dev(dev); | ||
| 542 | netdev_state_change(dev); | ||
| 543 | } | ||
| 544 | } | ||
| 545 | p.i_flags |= GRE_KEY | VTI_ISVTI; | ||
| 546 | p.o_flags |= GRE_KEY; | ||
| 547 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, | ||
| 548 | sizeof(p))) | ||
| 549 | err = -EFAULT; | ||
| 550 | } else | ||
| 551 | err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); | ||
| 552 | break; | ||
| 553 | |||
| 554 | case SIOCDELTUNNEL: | ||
| 555 | err = -EPERM; | ||
| 556 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | ||
| 557 | goto done; | ||
| 558 | |||
| 559 | if (dev == ipn->fb_tunnel_dev) { | ||
| 560 | err = -EFAULT; | ||
| 561 | if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, | ||
| 562 | sizeof(p))) | ||
| 563 | goto done; | ||
| 564 | err = -ENOENT; | ||
| 565 | |||
| 566 | t = vti_tunnel_locate(net, &p, 0); | ||
| 567 | if (t == NULL) | ||
| 568 | goto done; | ||
| 569 | err = -EPERM; | ||
| 570 | if (t->dev == ipn->fb_tunnel_dev) | ||
| 571 | goto done; | ||
| 572 | dev = t->dev; | ||
| 573 | } | ||
| 574 | unregister_netdevice(dev); | ||
| 575 | err = 0; | ||
| 576 | break; | ||
| 577 | |||
| 578 | default: | ||
| 579 | err = -EINVAL; | ||
| 580 | } | ||
| 581 | |||
| 582 | done: | ||
| 583 | return err; | ||
| 584 | } | ||
| 585 | |||
| 586 | static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu) | ||
| 587 | { | ||
| 588 | if (new_mtu < 68 || new_mtu > 0xFFF8) | ||
| 589 | return -EINVAL; | ||
| 590 | dev->mtu = new_mtu; | ||
| 591 | return 0; | ||
| 592 | } | ||
| 593 | |||
/* Device operations installed on every vti device by vti_tunnel_setup(). */
static const struct net_device_ops vti_netdev_ops = {
	.ndo_init	= vti_tunnel_init,
	.ndo_uninit	= vti_tunnel_uninit,
	.ndo_start_xmit	= vti_tunnel_xmit,
	.ndo_do_ioctl	= vti_tunnel_ioctl,
	.ndo_change_mtu	= vti_tunnel_change_mtu,
	.ndo_get_stats64 = vti_get_stats64,
};
| 602 | |||
/*
 * Device destructor (installed as dev->destructor by vti_tunnel_setup()):
 * release the per-cpu statistics, then the net_device itself.
 */
static void vti_dev_free(struct net_device *dev)
{
	/* Must come first: dev->tstats lives inside the device being freed. */
	free_percpu(dev->tstats);
	free_netdev(dev);
}
| 608 | |||
| 609 | static void vti_tunnel_setup(struct net_device *dev) | ||
| 610 | { | ||
| 611 | dev->netdev_ops = &vti_netdev_ops; | ||
| 612 | dev->destructor = vti_dev_free; | ||
| 613 | |||
| 614 | dev->type = ARPHRD_TUNNEL; | ||
| 615 | dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); | ||
| 616 | dev->mtu = ETH_DATA_LEN; | ||
| 617 | dev->flags = IFF_NOARP; | ||
| 618 | dev->iflink = 0; | ||
| 619 | dev->addr_len = 4; | ||
| 620 | dev->features |= NETIF_F_NETNS_LOCAL; | ||
| 621 | dev->features |= NETIF_F_LLTX; | ||
| 622 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | ||
| 623 | } | ||
| 624 | |||
| 625 | static int vti_tunnel_init(struct net_device *dev) | ||
| 626 | { | ||
| 627 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
| 628 | |||
| 629 | tunnel->dev = dev; | ||
| 630 | strcpy(tunnel->parms.name, dev->name); | ||
| 631 | |||
| 632 | memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); | ||
| 633 | memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); | ||
| 634 | |||
| 635 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
| 636 | if (!dev->tstats) | ||
| 637 | return -ENOMEM; | ||
| 638 | |||
| 639 | return 0; | ||
| 640 | } | ||
| 641 | |||
| 642 | static int __net_init vti_fb_tunnel_init(struct net_device *dev) | ||
| 643 | { | ||
| 644 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
| 645 | struct iphdr *iph = &tunnel->parms.iph; | ||
| 646 | struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id); | ||
| 647 | |||
| 648 | tunnel->dev = dev; | ||
| 649 | strcpy(tunnel->parms.name, dev->name); | ||
| 650 | |||
| 651 | iph->version = 4; | ||
| 652 | iph->protocol = IPPROTO_IPIP; | ||
| 653 | iph->ihl = 5; | ||
| 654 | |||
| 655 | dev->tstats = alloc_percpu(struct pcpu_tstats); | ||
| 656 | if (!dev->tstats) | ||
| 657 | return -ENOMEM; | ||
| 658 | |||
| 659 | dev_hold(dev); | ||
| 660 | rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); | ||
| 661 | return 0; | ||
| 662 | } | ||
| 663 | |||
/*
 * Receive/error hooks; registered in vti_init() with
 * xfrm4_mode_tunnel_input_register().
 */
static struct xfrm_tunnel vti_handler __read_mostly = {
	.handler	=	vti_rcv,
	.err_handler	=	vti_err,
	.priority	=	1,
};
| 669 | |||
| 670 | static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head) | ||
| 671 | { | ||
| 672 | int prio; | ||
| 673 | |||
| 674 | for (prio = 1; prio < 4; prio++) { | ||
| 675 | int h; | ||
| 676 | for (h = 0; h < HASH_SIZE; h++) { | ||
| 677 | struct ip_tunnel *t; | ||
| 678 | |||
| 679 | t = rtnl_dereference(ipn->tunnels[prio][h]); | ||
| 680 | while (t != NULL) { | ||
| 681 | unregister_netdevice_queue(t->dev, head); | ||
| 682 | t = rtnl_dereference(t->next); | ||
| 683 | } | ||
| 684 | } | ||
| 685 | } | ||
| 686 | } | ||
| 687 | |||
| 688 | static int __net_init vti_init_net(struct net *net) | ||
| 689 | { | ||
| 690 | int err; | ||
| 691 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 692 | |||
| 693 | ipn->tunnels[0] = ipn->tunnels_wc; | ||
| 694 | ipn->tunnels[1] = ipn->tunnels_l; | ||
| 695 | ipn->tunnels[2] = ipn->tunnels_r; | ||
| 696 | ipn->tunnels[3] = ipn->tunnels_r_l; | ||
| 697 | |||
| 698 | ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), | ||
| 699 | "ip_vti0", | ||
| 700 | vti_tunnel_setup); | ||
| 701 | if (!ipn->fb_tunnel_dev) { | ||
| 702 | err = -ENOMEM; | ||
| 703 | goto err_alloc_dev; | ||
| 704 | } | ||
| 705 | dev_net_set(ipn->fb_tunnel_dev, net); | ||
| 706 | |||
| 707 | err = vti_fb_tunnel_init(ipn->fb_tunnel_dev); | ||
| 708 | if (err) | ||
| 709 | goto err_reg_dev; | ||
| 710 | ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops; | ||
| 711 | |||
| 712 | err = register_netdev(ipn->fb_tunnel_dev); | ||
| 713 | if (err) | ||
| 714 | goto err_reg_dev; | ||
| 715 | return 0; | ||
| 716 | |||
| 717 | err_reg_dev: | ||
| 718 | vti_dev_free(ipn->fb_tunnel_dev); | ||
| 719 | err_alloc_dev: | ||
| 720 | /* nothing */ | ||
| 721 | return err; | ||
| 722 | } | ||
| 723 | |||
| 724 | static void __net_exit vti_exit_net(struct net *net) | ||
| 725 | { | ||
| 726 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 727 | LIST_HEAD(list); | ||
| 728 | |||
| 729 | rtnl_lock(); | ||
| 730 | vti_destroy_tunnels(ipn, &list); | ||
| 731 | unregister_netdevice_many(&list); | ||
| 732 | rtnl_unlock(); | ||
| 733 | } | ||
| 734 | |||
/*
 * Per-namespace hooks, registered in vti_init().  .id and .size describe
 * the struct vti_net that net_generic(net, vti_net_id) returns.
 */
static struct pernet_operations vti_net_ops = {
	.init = vti_init_net,
	.exit = vti_exit_net,
	.id   = &vti_net_id,
	.size = sizeof(struct vti_net),
};
| 741 | |||
/*
 * rtnl_link_ops.validate hook.  Performs no checks of its own and
 * accepts every request by returning 0.
 */
static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	return 0;
}
| 746 | |||
| 747 | static void vti_netlink_parms(struct nlattr *data[], | ||
| 748 | struct ip_tunnel_parm *parms) | ||
| 749 | { | ||
| 750 | memset(parms, 0, sizeof(*parms)); | ||
| 751 | |||
| 752 | parms->iph.protocol = IPPROTO_IPIP; | ||
| 753 | |||
| 754 | if (!data) | ||
| 755 | return; | ||
| 756 | |||
| 757 | if (data[IFLA_VTI_LINK]) | ||
| 758 | parms->link = nla_get_u32(data[IFLA_VTI_LINK]); | ||
| 759 | |||
| 760 | if (data[IFLA_VTI_IKEY]) | ||
| 761 | parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); | ||
| 762 | |||
| 763 | if (data[IFLA_VTI_OKEY]) | ||
| 764 | parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); | ||
| 765 | |||
| 766 | if (data[IFLA_VTI_LOCAL]) | ||
| 767 | parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]); | ||
| 768 | |||
| 769 | if (data[IFLA_VTI_REMOTE]) | ||
| 770 | parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]); | ||
| 771 | |||
| 772 | } | ||
| 773 | |||
| 774 | static int vti_newlink(struct net *src_net, struct net_device *dev, | ||
| 775 | struct nlattr *tb[], struct nlattr *data[]) | ||
| 776 | { | ||
| 777 | struct ip_tunnel *nt; | ||
| 778 | struct net *net = dev_net(dev); | ||
| 779 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 780 | int mtu; | ||
| 781 | int err; | ||
| 782 | |||
| 783 | nt = netdev_priv(dev); | ||
| 784 | vti_netlink_parms(data, &nt->parms); | ||
| 785 | |||
| 786 | if (vti_tunnel_locate(net, &nt->parms, 0)) | ||
| 787 | return -EEXIST; | ||
| 788 | |||
| 789 | mtu = vti_tunnel_bind_dev(dev); | ||
| 790 | if (!tb[IFLA_MTU]) | ||
| 791 | dev->mtu = mtu; | ||
| 792 | |||
| 793 | err = register_netdevice(dev); | ||
| 794 | if (err) | ||
| 795 | goto out; | ||
| 796 | |||
| 797 | dev_hold(dev); | ||
| 798 | vti_tunnel_link(ipn, nt); | ||
| 799 | |||
| 800 | out: | ||
| 801 | return err; | ||
| 802 | } | ||
| 803 | |||
| 804 | static int vti_changelink(struct net_device *dev, struct nlattr *tb[], | ||
| 805 | struct nlattr *data[]) | ||
| 806 | { | ||
| 807 | struct ip_tunnel *t, *nt; | ||
| 808 | struct net *net = dev_net(dev); | ||
| 809 | struct vti_net *ipn = net_generic(net, vti_net_id); | ||
| 810 | struct ip_tunnel_parm p; | ||
| 811 | int mtu; | ||
| 812 | |||
| 813 | if (dev == ipn->fb_tunnel_dev) | ||
| 814 | return -EINVAL; | ||
| 815 | |||
| 816 | nt = netdev_priv(dev); | ||
| 817 | vti_netlink_parms(data, &p); | ||
| 818 | |||
| 819 | t = vti_tunnel_locate(net, &p, 0); | ||
| 820 | |||
| 821 | if (t) { | ||
| 822 | if (t->dev != dev) | ||
| 823 | return -EEXIST; | ||
| 824 | } else { | ||
| 825 | t = nt; | ||
| 826 | |||
| 827 | vti_tunnel_unlink(ipn, t); | ||
| 828 | t->parms.iph.saddr = p.iph.saddr; | ||
| 829 | t->parms.iph.daddr = p.iph.daddr; | ||
| 830 | t->parms.i_key = p.i_key; | ||
| 831 | t->parms.o_key = p.o_key; | ||
| 832 | if (dev->type != ARPHRD_ETHER) { | ||
| 833 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
| 834 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
| 835 | } | ||
| 836 | vti_tunnel_link(ipn, t); | ||
| 837 | netdev_state_change(dev); | ||
| 838 | } | ||
| 839 | |||
| 840 | if (t->parms.link != p.link) { | ||
| 841 | t->parms.link = p.link; | ||
| 842 | mtu = vti_tunnel_bind_dev(dev); | ||
| 843 | if (!tb[IFLA_MTU]) | ||
| 844 | dev->mtu = mtu; | ||
| 845 | netdev_state_change(dev); | ||
| 846 | } | ||
| 847 | |||
| 848 | return 0; | ||
| 849 | } | ||
| 850 | |||
| 851 | static size_t vti_get_size(const struct net_device *dev) | ||
| 852 | { | ||
| 853 | return | ||
| 854 | /* IFLA_VTI_LINK */ | ||
| 855 | nla_total_size(4) + | ||
| 856 | /* IFLA_VTI_IKEY */ | ||
| 857 | nla_total_size(4) + | ||
| 858 | /* IFLA_VTI_OKEY */ | ||
| 859 | nla_total_size(4) + | ||
| 860 | /* IFLA_VTI_LOCAL */ | ||
| 861 | nla_total_size(4) + | ||
| 862 | /* IFLA_VTI_REMOTE */ | ||
| 863 | nla_total_size(4) + | ||
| 864 | 0; | ||
| 865 | } | ||
| 866 | |||
| 867 | static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) | ||
| 868 | { | ||
| 869 | struct ip_tunnel *t = netdev_priv(dev); | ||
| 870 | struct ip_tunnel_parm *p = &t->parms; | ||
| 871 | |||
| 872 | nla_put_u32(skb, IFLA_VTI_LINK, p->link); | ||
| 873 | nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); | ||
| 874 | nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); | ||
| 875 | nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr); | ||
| 876 | nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr); | ||
| 877 | |||
| 878 | return 0; | ||
| 879 | } | ||
| 880 | |||
/*
 * Netlink attribute policy for IFLA_VTI_*: link and keys are 32-bit
 * values; local/remote are sized like the IPv4 header address fields.
 */
static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
	[IFLA_VTI_LINK]		= { .type = NLA_U32 },
	[IFLA_VTI_IKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_OKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_VTI_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
};
| 888 | |||
/* rtnetlink link operations for devices of kind "vti"; registered in vti_init(). */
static struct rtnl_link_ops vti_link_ops __read_mostly = {
	.kind		= "vti",
	.maxtype	= IFLA_VTI_MAX,
	.policy		= vti_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= vti_tunnel_setup,
	.validate	= vti_tunnel_validate,
	.newlink	= vti_newlink,
	.changelink	= vti_changelink,
	.get_size	= vti_get_size,
	.fill_info	= vti_fill_info,
};
| 901 | |||
| 902 | static int __init vti_init(void) | ||
| 903 | { | ||
| 904 | int err; | ||
| 905 | |||
| 906 | pr_info("IPv4 over IPSec tunneling driver\n"); | ||
| 907 | |||
| 908 | err = register_pernet_device(&vti_net_ops); | ||
| 909 | if (err < 0) | ||
| 910 | return err; | ||
| 911 | err = xfrm4_mode_tunnel_input_register(&vti_handler); | ||
| 912 | if (err < 0) { | ||
| 913 | unregister_pernet_device(&vti_net_ops); | ||
| 914 | pr_info(KERN_INFO "vti init: can't register tunnel\n"); | ||
| 915 | } | ||
| 916 | |||
| 917 | err = rtnl_link_register(&vti_link_ops); | ||
| 918 | if (err < 0) | ||
| 919 | goto rtnl_link_failed; | ||
| 920 | |||
| 921 | return err; | ||
| 922 | |||
| 923 | rtnl_link_failed: | ||
| 924 | xfrm4_mode_tunnel_input_deregister(&vti_handler); | ||
| 925 | unregister_pernet_device(&vti_net_ops); | ||
| 926 | return err; | ||
| 927 | } | ||
| 928 | |||
| 929 | static void __exit vti_fini(void) | ||
| 930 | { | ||
| 931 | rtnl_link_unregister(&vti_link_ops); | ||
| 932 | if (xfrm4_mode_tunnel_input_deregister(&vti_handler)) | ||
| 933 | pr_info("vti close: can't deregister tunnel\n"); | ||
| 934 | |||
| 935 | unregister_pernet_device(&vti_net_ops); | ||
| 936 | } | ||
| 937 | |||
/* Module entry/exit points, license and module aliases. */
module_init(vti_init);
module_exit(vti_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("vti");
MODULE_ALIAS_NETDEV("ip_vti0");
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index d3ab47e19a8..c857f6f49b0 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
| @@ -31,26 +31,17 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) | |||
| 31 | struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); | 31 | struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); |
| 32 | struct xfrm_state *x; | 32 | struct xfrm_state *x; |
| 33 | 33 | ||
| 34 | switch (icmp_hdr(skb)->type) { | 34 | if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || |
| 35 | case ICMP_DEST_UNREACH: | 35 | icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) |
| 36 | if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) | ||
| 37 | return; | ||
| 38 | case ICMP_REDIRECT: | ||
| 39 | break; | ||
| 40 | default: | ||
| 41 | return; | 36 | return; |
| 42 | } | ||
| 43 | 37 | ||
| 44 | spi = htonl(ntohs(ipch->cpi)); | 38 | spi = htonl(ntohs(ipch->cpi)); |
| 45 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, | 39 | x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, |
| 46 | spi, IPPROTO_COMP, AF_INET); | 40 | spi, IPPROTO_COMP, AF_INET); |
| 47 | if (!x) | 41 | if (!x) |
| 48 | return; | 42 | return; |
| 49 | 43 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", | |
| 50 | if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) | 44 | spi, &iph->daddr); |
| 51 | ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); | ||
| 52 | else | ||
| 53 | ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); | ||
| 54 | xfrm_state_put(x); | 45 | xfrm_state_put(x); |
| 55 | } | 46 | } |
| 56 | 47 | ||
| @@ -165,11 +156,11 @@ static const struct net_protocol ipcomp4_protocol = { | |||
| 165 | static int __init ipcomp4_init(void) | 156 | static int __init ipcomp4_init(void) |
| 166 | { | 157 | { |
| 167 | if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { | 158 | if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { |
| 168 | pr_info("%s: can't add xfrm type\n", __func__); | 159 | printk(KERN_INFO "ipcomp init: can't add xfrm type\n"); |
| 169 | return -EAGAIN; | 160 | return -EAGAIN; |
| 170 | } | 161 | } |
| 171 | if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { | 162 | if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { |
| 172 | pr_info("%s: can't add protocol\n", __func__); | 163 | printk(KERN_INFO "ipcomp init: can't add protocol\n"); |
| 173 | xfrm_unregister_type(&ipcomp_type, AF_INET); | 164 | xfrm_unregister_type(&ipcomp_type, AF_INET); |
| 174 | return -EAGAIN; | 165 | return -EAGAIN; |
| 175 | } | 166 | } |
| @@ -179,9 +170,9 @@ static int __init ipcomp4_init(void) | |||
| 179 | static void __exit ipcomp4_fini(void) | 170 | static void __exit ipcomp4_fini(void) |
| 180 | { | 171 | { |
| 181 | if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) | 172 | if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) |
| 182 | pr_info("%s: can't remove protocol\n", __func__); | 173 | printk(KERN_INFO "ip ipcomp close: can't remove protocol\n"); |
| 183 | if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) | 174 | if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) |
| 184 | pr_info("%s: can't remove xfrm type\n", __func__); | 175 | printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n"); |
| 185 | } | 176 | } |
| 186 | 177 | ||
| 187 | module_init(ipcomp4_init); | 178 | module_init(ipcomp4_init); |
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index a2e50ae80b5..004bb74b41c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c | |||
| @@ -54,7 +54,6 @@ | |||
| 54 | #include <linux/delay.h> | 54 | #include <linux/delay.h> |
| 55 | #include <linux/nfs_fs.h> | 55 | #include <linux/nfs_fs.h> |
| 56 | #include <linux/slab.h> | 56 | #include <linux/slab.h> |
| 57 | #include <linux/export.h> | ||
| 58 | #include <net/net_namespace.h> | 57 | #include <net/net_namespace.h> |
| 59 | #include <net/arp.h> | 58 | #include <net/arp.h> |
| 60 | #include <net/ip.h> | 59 | #include <net/ip.h> |
| @@ -136,14 +135,12 @@ __be32 ic_myaddr = NONE; /* My IP address */ | |||
| 136 | static __be32 ic_netmask = NONE; /* Netmask for local subnet */ | 135 | static __be32 ic_netmask = NONE; /* Netmask for local subnet */ |
| 137 | __be32 ic_gateway = NONE; /* Gateway IP address */ | 136 | __be32 ic_gateway = NONE; /* Gateway IP address */ |
| 138 | 137 | ||
| 139 | __be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */ | ||
| 140 | |||
| 141 | __be32 ic_servaddr = NONE; /* Boot server IP address */ | 138 | __be32 ic_servaddr = NONE; /* Boot server IP address */ |
| 142 | 139 | ||
| 143 | __be32 root_server_addr = NONE; /* Address of NFS server */ | 140 | __be32 root_server_addr = NONE; /* Address of NFS server */ |
| 144 | u8 root_server_path[256] = { 0, }; /* Path to mount as root */ | 141 | u8 root_server_path[256] = { 0, }; /* Path to mount as root */ |
| 145 | 142 | ||
| 146 | __be32 ic_dev_xid; /* Device under configuration */ | 143 | u32 ic_dev_xid; /* Device under configuration */ |
| 147 | 144 | ||
| 148 | /* vendor class identifier */ | 145 | /* vendor class identifier */ |
| 149 | static char vendor_class_identifier[253] __initdata; | 146 | static char vendor_class_identifier[253] __initdata; |
| @@ -216,7 +213,7 @@ static int __init ic_open_devs(void) | |||
| 216 | if (!(dev->flags & IFF_LOOPBACK)) | 213 | if (!(dev->flags & IFF_LOOPBACK)) |
| 217 | continue; | 214 | continue; |
| 218 | if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) | 215 | if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) |
| 219 | pr_err("IP-Config: Failed to open %s\n", dev->name); | 216 | printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); |
| 220 | } | 217 | } |
| 221 | 218 | ||
| 222 | for_each_netdev(&init_net, dev) { | 219 | for_each_netdev(&init_net, dev) { |
| @@ -225,8 +222,7 @@ static int __init ic_open_devs(void) | |||
| 225 | if (dev->mtu >= 364) | 222 | if (dev->mtu >= 364) |
| 226 | able |= IC_BOOTP; | 223 | able |= IC_BOOTP; |
| 227 | else | 224 | else |
| 228 | pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small", | 225 | printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu); |
| 229 | dev->name, dev->mtu); | ||
| 230 | if (!(dev->flags & IFF_NOARP)) | 226 | if (!(dev->flags & IFF_NOARP)) |
| 231 | able |= IC_RARP; | 227 | able |= IC_RARP; |
| 232 | able &= ic_proto_enabled; | 228 | able &= ic_proto_enabled; |
| @@ -234,8 +230,7 @@ static int __init ic_open_devs(void) | |||
| 234 | continue; | 230 | continue; |
| 235 | oflags = dev->flags; | 231 | oflags = dev->flags; |
| 236 | if (dev_change_flags(dev, oflags | IFF_UP) < 0) { | 232 | if (dev_change_flags(dev, oflags | IFF_UP) < 0) { |
| 237 | pr_err("IP-Config: Failed to open %s\n", | 233 | printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); |
| 238 | dev->name); | ||
| 239 | continue; | 234 | continue; |
| 240 | } | 235 | } |
| 241 | if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { | 236 | if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { |
| @@ -277,10 +272,9 @@ have_carrier: | |||
| 277 | 272 | ||
| 278 | if (!ic_first_dev) { | 273 | if (!ic_first_dev) { |
| 279 | if (user_dev_name[0]) | 274 | if (user_dev_name[0]) |
| 280 | pr_err("IP-Config: Device `%s' not found\n", | 275 | printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); |
| 281 | user_dev_name); | ||
| 282 | else | 276 | else |
| 283 | pr_err("IP-Config: No network devices available\n"); | 277 | printk(KERN_ERR "IP-Config: No network devices available.\n"); |
| 284 | return -ENODEV; | 278 | return -ENODEV; |
| 285 | } | 279 | } |
| 286 | return 0; | 280 | return 0; |
| @@ -364,20 +358,17 @@ static int __init ic_setup_if(void) | |||
| 364 | strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); | 358 | strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); |
| 365 | set_sockaddr(sin, ic_myaddr, 0); | 359 | set_sockaddr(sin, ic_myaddr, 0); |
| 366 | if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { | 360 | if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { |
| 367 | pr_err("IP-Config: Unable to set interface address (%d)\n", | 361 | printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); |
| 368 | err); | ||
| 369 | return -1; | 362 | return -1; |
| 370 | } | 363 | } |
| 371 | set_sockaddr(sin, ic_netmask, 0); | 364 | set_sockaddr(sin, ic_netmask, 0); |
| 372 | if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { | 365 | if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { |
| 373 | pr_err("IP-Config: Unable to set interface netmask (%d)\n", | 366 | printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); |
| 374 | err); | ||
| 375 | return -1; | 367 | return -1; |
| 376 | } | 368 | } |
| 377 | set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); | 369 | set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); |
| 378 | if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { | 370 | if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { |
| 379 | pr_err("IP-Config: Unable to set interface broadcast address (%d)\n", | 371 | printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); |
| 380 | err); | ||
| 381 | return -1; | 372 | return -1; |
| 382 | } | 373 | } |
| 383 | /* Handle the case where we need non-standard MTU on the boot link (a network | 374 | /* Handle the case where we need non-standard MTU on the boot link (a network |
| @@ -388,8 +379,8 @@ static int __init ic_setup_if(void) | |||
| 388 | strcpy(ir.ifr_name, ic_dev->name); | 379 | strcpy(ir.ifr_name, ic_dev->name); |
| 389 | ir.ifr_mtu = ic_dev_mtu; | 380 | ir.ifr_mtu = ic_dev_mtu; |
| 390 | if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) | 381 | if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) |
| 391 | pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n", | 382 | printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n", |
| 392 | ic_dev_mtu, err); | 383 | ic_dev_mtu, err); |
| 393 | } | 384 | } |
| 394 | return 0; | 385 | return 0; |
| 395 | } | 386 | } |
| @@ -404,7 +395,7 @@ static int __init ic_setup_routes(void) | |||
| 404 | 395 | ||
| 405 | memset(&rm, 0, sizeof(rm)); | 396 | memset(&rm, 0, sizeof(rm)); |
| 406 | if ((ic_gateway ^ ic_myaddr) & ic_netmask) { | 397 | if ((ic_gateway ^ ic_myaddr) & ic_netmask) { |
| 407 | pr_err("IP-Config: Gateway not on directly connected network\n"); | 398 | printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n"); |
| 408 | return -1; | 399 | return -1; |
| 409 | } | 400 | } |
| 410 | set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); | 401 | set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); |
| @@ -412,8 +403,7 @@ static int __init ic_setup_routes(void) | |||
| 412 | set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); | 403 | set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); |
| 413 | rm.rt_flags = RTF_UP | RTF_GATEWAY; | 404 | rm.rt_flags = RTF_UP | RTF_GATEWAY; |
| 414 | if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { | 405 | if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { |
| 415 | pr_err("IP-Config: Cannot add default route (%d)\n", | 406 | printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err); |
| 416 | err); | ||
| 417 | return -1; | 407 | return -1; |
| 418 | } | 408 | } |
| 419 | } | 409 | } |
| @@ -446,8 +436,8 @@ static int __init ic_defaults(void) | |||
| 446 | else if (IN_CLASSC(ntohl(ic_myaddr))) | 436 | else if (IN_CLASSC(ntohl(ic_myaddr))) |
| 447 | ic_netmask = htonl(IN_CLASSC_NET); | 437 | ic_netmask = htonl(IN_CLASSC_NET); |
| 448 | else { | 438 | else { |
| 449 | pr_err("IP-Config: Unable to guess netmask for address %pI4\n", | 439 | printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n", |
| 450 | &ic_myaddr); | 440 | &ic_myaddr); |
| 451 | return -1; | 441 | return -1; |
| 452 | } | 442 | } |
| 453 | printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); | 443 | printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); |
| @@ -560,7 +550,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt | |||
| 560 | if (ic_myaddr == NONE) | 550 | if (ic_myaddr == NONE) |
| 561 | ic_myaddr = tip; | 551 | ic_myaddr = tip; |
| 562 | ic_servaddr = sip; | 552 | ic_servaddr = sip; |
| 563 | ic_addrservaddr = sip; | ||
| 564 | ic_got_reply = IC_RARP; | 553 | ic_got_reply = IC_RARP; |
| 565 | 554 | ||
| 566 | drop_unlock: | 555 | drop_unlock: |
| @@ -586,17 +575,6 @@ static void __init ic_rarp_send_if(struct ic_device *d) | |||
| 586 | #endif | 575 | #endif |
| 587 | 576 | ||
| 588 | /* | 577 | /* |
| 589 | * Predefine Nameservers | ||
| 590 | */ | ||
| 591 | static inline void __init ic_nameservers_predef(void) | ||
| 592 | { | ||
| 593 | int i; | ||
| 594 | |||
| 595 | for (i = 0; i < CONF_NAMESERVERS_MAX; i++) | ||
| 596 | ic_nameservers[i] = NONE; | ||
| 597 | } | ||
| 598 | |||
| 599 | /* | ||
| 600 | * DHCP/BOOTP support. | 578 | * DHCP/BOOTP support. |
| 601 | */ | 579 | */ |
| 602 | 580 | ||
| @@ -709,8 +687,8 @@ ic_dhcp_init_options(u8 *options) | |||
| 709 | e += len; | 687 | e += len; |
| 710 | } | 688 | } |
| 711 | if (*vendor_class_identifier) { | 689 | if (*vendor_class_identifier) { |
| 712 | pr_info("DHCP: sending class identifier \"%s\"\n", | 690 | printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", |
| 713 | vendor_class_identifier); | 691 | vendor_class_identifier); |
| 714 | *e++ = 60; /* Class-identifier */ | 692 | *e++ = 60; /* Class-identifier */ |
| 715 | len = strlen(vendor_class_identifier); | 693 | len = strlen(vendor_class_identifier); |
| 716 | *e++ = len; | 694 | *e++ = len; |
| @@ -761,7 +739,10 @@ static void __init ic_bootp_init_ext(u8 *e) | |||
| 761 | */ | 739 | */ |
| 762 | static inline void __init ic_bootp_init(void) | 740 | static inline void __init ic_bootp_init(void) |
| 763 | { | 741 | { |
| 764 | ic_nameservers_predef(); | 742 | int i; |
| 743 | |||
| 744 | for (i = 0; i < CONF_NAMESERVERS_MAX; i++) | ||
| 745 | ic_nameservers[i] = NONE; | ||
| 765 | 746 | ||
| 766 | dev_add_pack(&bootp_packet_type); | 747 | dev_add_pack(&bootp_packet_type); |
| 767 | } | 748 | } |
| @@ -785,15 +766,13 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d | |||
| 785 | struct sk_buff *skb; | 766 | struct sk_buff *skb; |
| 786 | struct bootp_pkt *b; | 767 | struct bootp_pkt *b; |
| 787 | struct iphdr *h; | 768 | struct iphdr *h; |
| 788 | int hlen = LL_RESERVED_SPACE(dev); | ||
| 789 | int tlen = dev->needed_tailroom; | ||
| 790 | 769 | ||
| 791 | /* Allocate packet */ | 770 | /* Allocate packet */ |
| 792 | skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15, | 771 | skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15, |
| 793 | GFP_KERNEL); | 772 | GFP_KERNEL); |
| 794 | if (!skb) | 773 | if (!skb) |
| 795 | return; | 774 | return; |
| 796 | skb_reserve(skb, hlen); | 775 | skb_reserve(skb, LL_RESERVED_SPACE(dev)); |
| 797 | b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); | 776 | b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); |
| 798 | memset(b, 0, sizeof(struct bootp_pkt)); | 777 | memset(b, 0, sizeof(struct bootp_pkt)); |
| 799 | 778 | ||
| @@ -819,6 +798,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d | |||
| 819 | b->op = BOOTP_REQUEST; | 798 | b->op = BOOTP_REQUEST; |
| 820 | if (dev->type < 256) /* check for false types */ | 799 | if (dev->type < 256) /* check for false types */ |
| 821 | b->htype = dev->type; | 800 | b->htype = dev->type; |
| 801 | else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ | ||
| 802 | b->htype = ARPHRD_IEEE802; | ||
| 822 | else if (dev->type == ARPHRD_FDDI) | 803 | else if (dev->type == ARPHRD_FDDI) |
| 823 | b->htype = ARPHRD_ETHER; | 804 | b->htype = ARPHRD_ETHER; |
| 824 | else { | 805 | else { |
| @@ -844,13 +825,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d | |||
| 844 | skb->dev = dev; | 825 | skb->dev = dev; |
| 845 | skb->protocol = htons(ETH_P_IP); | 826 | skb->protocol = htons(ETH_P_IP); |
| 846 | if (dev_hard_header(skb, dev, ntohs(skb->protocol), | 827 | if (dev_hard_header(skb, dev, ntohs(skb->protocol), |
| 847 | dev->broadcast, dev->dev_addr, skb->len) < 0) { | 828 | dev->broadcast, dev->dev_addr, skb->len) < 0 || |
| 848 | kfree_skb(skb); | 829 | dev_queue_xmit(skb) < 0) |
| 849 | printk("E"); | ||
| 850 | return; | ||
| 851 | } | ||
| 852 | |||
| 853 | if (dev_queue_xmit(skb) < 0) | ||
| 854 | printk("E"); | 830 | printk("E"); |
| 855 | } | 831 | } |
| 856 | 832 | ||
| @@ -875,9 +851,9 @@ static int __init ic_bootp_string(char *dest, char *src, int len, int max) | |||
| 875 | */ | 851 | */ |
| 876 | static void __init ic_do_bootp_ext(u8 *ext) | 852 | static void __init ic_do_bootp_ext(u8 *ext) |
| 877 | { | 853 | { |
| 878 | u8 servers; | 854 | u8 servers; |
| 879 | int i; | 855 | int i; |
| 880 | __be16 mtu; | 856 | u16 mtu; |
| 881 | 857 | ||
| 882 | #ifdef IPCONFIG_DEBUG | 858 | #ifdef IPCONFIG_DEBUG |
| 883 | u8 *c; | 859 | u8 *c; |
| @@ -964,7 +940,9 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str | |||
| 964 | 940 | ||
| 965 | /* Fragments are not supported */ | 941 | /* Fragments are not supported */ |
| 966 | if (ip_is_fragment(h)) { | 942 | if (ip_is_fragment(h)) { |
| 967 | net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n"); | 943 | if (net_ratelimit()) |
| 944 | printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented " | ||
| 945 | "reply.\n"); | ||
| 968 | goto drop; | 946 | goto drop; |
| 969 | } | 947 | } |
| 970 | 948 | ||
| @@ -1012,14 +990,17 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str | |||
| 1012 | /* Is it a reply to our BOOTP request? */ | 990 | /* Is it a reply to our BOOTP request? */ |
| 1013 | if (b->op != BOOTP_REPLY || | 991 | if (b->op != BOOTP_REPLY || |
| 1014 | b->xid != d->xid) { | 992 | b->xid != d->xid) { |
| 1015 | net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", | 993 | if (net_ratelimit()) |
| 1016 | b->op, b->xid); | 994 | printk(KERN_ERR "DHCP/BOOTP: Reply not for us, " |
| 995 | "op[%x] xid[%x]\n", | ||
| 996 | b->op, b->xid); | ||
| 1017 | goto drop_unlock; | 997 | goto drop_unlock; |
| 1018 | } | 998 | } |
| 1019 | 999 | ||
| 1020 | /* Is it a reply for the device we are configuring? */ | 1000 | /* Is it a reply for the device we are configuring? */ |
| 1021 | if (b->xid != ic_dev_xid) { | 1001 | if (b->xid != ic_dev_xid) { |
| 1022 | net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n"); | 1002 | if (net_ratelimit()) |
| 1003 | printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n"); | ||
| 1023 | goto drop_unlock; | 1004 | goto drop_unlock; |
| 1024 | } | 1005 | } |
| 1025 | 1006 | ||
| @@ -1071,7 +1052,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str | |||
| 1071 | ic_servaddr = server_id; | 1052 | ic_servaddr = server_id; |
| 1072 | #ifdef IPCONFIG_DEBUG | 1053 | #ifdef IPCONFIG_DEBUG |
| 1073 | printk("DHCP: Offered address %pI4 by server %pI4\n", | 1054 | printk("DHCP: Offered address %pI4 by server %pI4\n", |
| 1074 | &ic_myaddr, &b->iph.saddr); | 1055 | &ic_myaddr, &ic_servaddr); |
| 1075 | #endif | 1056 | #endif |
| 1076 | /* The DHCP indicated server address takes | 1057 | /* The DHCP indicated server address takes |
| 1077 | * precedence over the bootp header one if | 1058 | * precedence over the bootp header one if |
| @@ -1116,7 +1097,6 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str | |||
| 1116 | ic_dev = dev; | 1097 | ic_dev = dev; |
| 1117 | ic_myaddr = b->your_ip; | 1098 | ic_myaddr = b->your_ip; |
| 1118 | ic_servaddr = b->server_ip; | 1099 | ic_servaddr = b->server_ip; |
| 1119 | ic_addrservaddr = b->iph.saddr; | ||
| 1120 | if (ic_gateway == NONE && b->relay_ip) | 1100 | if (ic_gateway == NONE && b->relay_ip) |
| 1121 | ic_gateway = b->relay_ip; | 1101 | ic_gateway = b->relay_ip; |
| 1122 | if (ic_nameservers[0] == NONE) | 1102 | if (ic_nameservers[0] == NONE) |
| @@ -1158,17 +1138,17 @@ static int __init ic_dynamic(void) | |||
| 1158 | * are missing, and without DHCP/BOOTP/RARP we are unable to get it. | 1138 | * are missing, and without DHCP/BOOTP/RARP we are unable to get it. |
| 1159 | */ | 1139 | */ |
| 1160 | if (!ic_proto_enabled) { | 1140 | if (!ic_proto_enabled) { |
| 1161 | pr_err("IP-Config: Incomplete network configuration information\n"); | 1141 | printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); |
| 1162 | return -1; | 1142 | return -1; |
| 1163 | } | 1143 | } |
| 1164 | 1144 | ||
| 1165 | #ifdef IPCONFIG_BOOTP | 1145 | #ifdef IPCONFIG_BOOTP |
| 1166 | if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) | 1146 | if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) |
| 1167 | pr_err("DHCP/BOOTP: No suitable device found\n"); | 1147 | printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n"); |
| 1168 | #endif | 1148 | #endif |
| 1169 | #ifdef IPCONFIG_RARP | 1149 | #ifdef IPCONFIG_RARP |
| 1170 | if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) | 1150 | if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) |
| 1171 | pr_err("RARP: No suitable device found\n"); | 1151 | printk(KERN_ERR "RARP: No suitable device found.\n"); |
| 1172 | #endif | 1152 | #endif |
| 1173 | 1153 | ||
| 1174 | if (!ic_proto_have_if) | 1154 | if (!ic_proto_have_if) |
| @@ -1195,17 +1175,17 @@ static int __init ic_dynamic(void) | |||
| 1195 | * [Actually we could now, but the nothing else running note still | 1175 | * [Actually we could now, but the nothing else running note still |
| 1196 | * applies.. - AC] | 1176 | * applies.. - AC] |
| 1197 | */ | 1177 | */ |
| 1198 | pr_notice("Sending %s%s%s requests .", | 1178 | printk(KERN_NOTICE "Sending %s%s%s requests .", |
| 1199 | do_bootp | 1179 | do_bootp |
| 1200 | ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", | 1180 | ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", |
| 1201 | (do_bootp && do_rarp) ? " and " : "", | 1181 | (do_bootp && do_rarp) ? " and " : "", |
| 1202 | do_rarp ? "RARP" : ""); | 1182 | do_rarp ? "RARP" : ""); |
| 1203 | 1183 | ||
| 1204 | start_jiffies = jiffies; | 1184 | start_jiffies = jiffies; |
| 1205 | d = ic_first_dev; | 1185 | d = ic_first_dev; |
| 1206 | retries = CONF_SEND_RETRIES; | 1186 | retries = CONF_SEND_RETRIES; |
| 1207 | get_random_bytes(&timeout, sizeof(timeout)); | 1187 | get_random_bytes(&timeout, sizeof(timeout)); |
| 1208 | timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); | 1188 | timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); |
| 1209 | for (;;) { | 1189 | for (;;) { |
| 1210 | /* Track the device we are configuring */ | 1190 | /* Track the device we are configuring */ |
| 1211 | ic_dev_xid = d->xid; | 1191 | ic_dev_xid = d->xid; |
| @@ -1228,13 +1208,13 @@ static int __init ic_dynamic(void) | |||
| 1228 | (ic_proto_enabled & IC_USE_DHCP) && | 1208 | (ic_proto_enabled & IC_USE_DHCP) && |
| 1229 | ic_dhcp_msgtype != DHCPACK) { | 1209 | ic_dhcp_msgtype != DHCPACK) { |
| 1230 | ic_got_reply = 0; | 1210 | ic_got_reply = 0; |
| 1231 | pr_cont(","); | 1211 | printk(KERN_CONT ","); |
| 1232 | continue; | 1212 | continue; |
| 1233 | } | 1213 | } |
| 1234 | #endif /* IPCONFIG_DHCP */ | 1214 | #endif /* IPCONFIG_DHCP */ |
| 1235 | 1215 | ||
| 1236 | if (ic_got_reply) { | 1216 | if (ic_got_reply) { |
| 1237 | pr_cont(" OK\n"); | 1217 | printk(KERN_CONT " OK\n"); |
| 1238 | break; | 1218 | break; |
| 1239 | } | 1219 | } |
| 1240 | 1220 | ||
| @@ -1242,7 +1222,7 @@ static int __init ic_dynamic(void) | |||
| 1242 | continue; | 1222 | continue; |
| 1243 | 1223 | ||
| 1244 | if (! --retries) { | 1224 | if (! --retries) { |
| 1245 | pr_cont(" timed out!\n"); | 1225 | printk(KERN_CONT " timed out!\n"); |
| 1246 | break; | 1226 | break; |
| 1247 | } | 1227 | } |
| 1248 | 1228 | ||
| @@ -1252,7 +1232,7 @@ static int __init ic_dynamic(void) | |||
| 1252 | if (timeout > CONF_TIMEOUT_MAX) | 1232 | if (timeout > CONF_TIMEOUT_MAX) |
| 1253 | timeout = CONF_TIMEOUT_MAX; | 1233 | timeout = CONF_TIMEOUT_MAX; |
| 1254 | 1234 | ||
| 1255 | pr_cont("."); | 1235 | printk(KERN_CONT "."); |
| 1256 | } | 1236 | } |
| 1257 | 1237 | ||
| 1258 | #ifdef IPCONFIG_BOOTP | 1238 | #ifdef IPCONFIG_BOOTP |
| @@ -1272,8 +1252,8 @@ static int __init ic_dynamic(void) | |||
| 1272 | printk("IP-Config: Got %s answer from %pI4, ", | 1252 | printk("IP-Config: Got %s answer from %pI4, ", |
| 1273 | ((ic_got_reply & IC_RARP) ? "RARP" | 1253 | ((ic_got_reply & IC_RARP) ? "RARP" |
| 1274 | : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), | 1254 | : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), |
| 1275 | &ic_addrservaddr); | 1255 | &ic_servaddr); |
| 1276 | pr_cont("my address is %pI4\n", &ic_myaddr); | 1256 | printk(KERN_CONT "my address is %pI4\n", &ic_myaddr); |
| 1277 | 1257 | ||
| 1278 | return 0; | 1258 | return 0; |
| 1279 | } | 1259 | } |
| @@ -1391,7 +1371,6 @@ static int __init ip_auto_config(void) | |||
| 1391 | int retries = CONF_OPEN_RETRIES; | 1371 | int retries = CONF_OPEN_RETRIES; |
| 1392 | #endif | 1372 | #endif |
| 1393 | int err; | 1373 | int err; |
| 1394 | unsigned int i; | ||
| 1395 | 1374 | ||
| 1396 | #ifdef CONFIG_PROC_FS | 1375 | #ifdef CONFIG_PROC_FS |
| 1397 | proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); | 1376 | proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); |
| @@ -1450,22 +1429,24 @@ static int __init ip_auto_config(void) | |||
| 1450 | */ | 1429 | */ |
| 1451 | #ifdef CONFIG_ROOT_NFS | 1430 | #ifdef CONFIG_ROOT_NFS |
| 1452 | if (ROOT_DEV == Root_NFS) { | 1431 | if (ROOT_DEV == Root_NFS) { |
| 1453 | pr_err("IP-Config: Retrying forever (NFS root)...\n"); | 1432 | printk(KERN_ERR |
| 1433 | "IP-Config: Retrying forever (NFS root)...\n"); | ||
| 1454 | goto try_try_again; | 1434 | goto try_try_again; |
| 1455 | } | 1435 | } |
| 1456 | #endif | 1436 | #endif |
| 1457 | 1437 | ||
| 1458 | if (--retries) { | 1438 | if (--retries) { |
| 1459 | pr_err("IP-Config: Reopening network devices...\n"); | 1439 | printk(KERN_ERR |
| 1440 | "IP-Config: Reopening network devices...\n"); | ||
| 1460 | goto try_try_again; | 1441 | goto try_try_again; |
| 1461 | } | 1442 | } |
| 1462 | 1443 | ||
| 1463 | /* Oh, well. At least we tried. */ | 1444 | /* Oh, well. At least we tried. */ |
| 1464 | pr_err("IP-Config: Auto-configuration of network failed\n"); | 1445 | printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n"); |
| 1465 | return -1; | 1446 | return -1; |
| 1466 | } | 1447 | } |
| 1467 | #else /* !DYNAMIC */ | 1448 | #else /* !DYNAMIC */ |
| 1468 | pr_err("IP-Config: Incomplete network configuration information\n"); | 1449 | printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); |
| 1469 | ic_close_devs(); | 1450 | ic_close_devs(); |
| 1470 | return -1; | 1451 | return -1; |
| 1471 | #endif /* IPCONFIG_DYNAMIC */ | 1452 | #endif /* IPCONFIG_DYNAMIC */ |
| @@ -1503,26 +1484,19 @@ static int __init ip_auto_config(void) | |||
| 1503 | /* | 1484 | /* |
| 1504 | * Clue in the operator. | 1485 | * Clue in the operator. |
| 1505 | */ | 1486 | */ |
| 1506 | pr_info("IP-Config: Complete:\n"); | 1487 | printk("IP-Config: Complete:\n"); |
| 1507 | 1488 | printk(" device=%s", ic_dev->name); | |
| 1508 | pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n", | 1489 | printk(KERN_CONT ", addr=%pI4", &ic_myaddr); |
| 1509 | ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr, | 1490 | printk(KERN_CONT ", mask=%pI4", &ic_netmask); |
| 1510 | &ic_myaddr, &ic_netmask, &ic_gateway); | 1491 | printk(KERN_CONT ", gw=%pI4", &ic_gateway); |
| 1511 | pr_info(" host=%s, domain=%s, nis-domain=%s\n", | 1492 | printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s", |
| 1512 | utsname()->nodename, ic_domain, utsname()->domainname); | 1493 | utsname()->nodename, ic_domain, utsname()->domainname); |
| 1513 | pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s", | 1494 | printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr); |
| 1514 | &ic_servaddr, &root_server_addr, root_server_path); | 1495 | printk(KERN_CONT ", rootserver=%pI4", &root_server_addr); |
| 1496 | printk(KERN_CONT ", rootpath=%s", root_server_path); | ||
| 1515 | if (ic_dev_mtu) | 1497 | if (ic_dev_mtu) |
| 1516 | pr_cont(", mtu=%d", ic_dev_mtu); | 1498 | printk(KERN_CONT ", mtu=%d", ic_dev_mtu); |
| 1517 | for (i = 0; i < CONF_NAMESERVERS_MAX; i++) | 1499 | printk(KERN_CONT "\n"); |
| 1518 | if (ic_nameservers[i] != NONE) { | ||
| 1519 | pr_info(" nameserver%u=%pI4", | ||
| 1520 | i, &ic_nameservers[i]); | ||
| 1521 | break; | ||
| 1522 | } | ||
| 1523 | for (i++; i < CONF_NAMESERVERS_MAX; i++) | ||
| 1524 | if (ic_nameservers[i] != NONE) | ||
| 1525 | pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]); | ||
| 1526 | #endif /* !SILENT */ | 1500 | #endif /* !SILENT */ |
| 1527 | 1501 | ||
| 1528 | return 0; | 1502 | return 0; |
| @@ -1593,8 +1567,6 @@ static int __init ip_auto_config_setup(char *addrs) | |||
| 1593 | return 1; | 1567 | return 1; |
| 1594 | } | 1568 | } |
| 1595 | 1569 | ||
| 1596 | ic_nameservers_predef(); | ||
| 1597 | |||
| 1598 | /* Parse string for static IP assignment. */ | 1570 | /* Parse string for static IP assignment. */ |
| 1599 | ip = addrs; | 1571 | ip = addrs; |
| 1600 | while (ip && *ip) { | 1572 | while (ip && *ip) { |
| @@ -1638,20 +1610,6 @@ static int __init ip_auto_config_setup(char *addrs) | |||
| 1638 | ic_enable = 0; | 1610 | ic_enable = 0; |
| 1639 | } | 1611 | } |
| 1640 | break; | 1612 | break; |
| 1641 | case 7: | ||
| 1642 | if (CONF_NAMESERVERS_MAX >= 1) { | ||
| 1643 | ic_nameservers[0] = in_aton(ip); | ||
| 1644 | if (ic_nameservers[0] == ANY) | ||
| 1645 | ic_nameservers[0] = NONE; | ||
| 1646 | } | ||
| 1647 | break; | ||
| 1648 | case 8: | ||
| 1649 | if (CONF_NAMESERVERS_MAX >= 2) { | ||
| 1650 | ic_nameservers[1] = in_aton(ip); | ||
| 1651 | if (ic_nameservers[1] == ANY) | ||
| 1652 | ic_nameservers[1] = NONE; | ||
| 1653 | } | ||
| 1654 | break; | ||
| 1655 | } | 1613 | } |
| 1656 | } | 1614 | } |
| 1657 | ip = cp; | 1615 | ip = cp; |
| @@ -1660,21 +1618,22 @@ static int __init ip_auto_config_setup(char *addrs) | |||
| 1660 | 1618 | ||
| 1661 | return 1; | 1619 | return 1; |
| 1662 | } | 1620 | } |
| 1663 | __setup("ip=", ip_auto_config_setup); | ||
| 1664 | 1621 | ||
| 1665 | static int __init nfsaddrs_config_setup(char *addrs) | 1622 | static int __init nfsaddrs_config_setup(char *addrs) |
| 1666 | { | 1623 | { |
| 1667 | return ip_auto_config_setup(addrs); | 1624 | return ip_auto_config_setup(addrs); |
| 1668 | } | 1625 | } |
| 1669 | __setup("nfsaddrs=", nfsaddrs_config_setup); | ||
| 1670 | 1626 | ||
| 1671 | static int __init vendor_class_identifier_setup(char *addrs) | 1627 | static int __init vendor_class_identifier_setup(char *addrs) |
| 1672 | { | 1628 | { |
| 1673 | if (strlcpy(vendor_class_identifier, addrs, | 1629 | if (strlcpy(vendor_class_identifier, addrs, |
| 1674 | sizeof(vendor_class_identifier)) | 1630 | sizeof(vendor_class_identifier)) |
| 1675 | >= sizeof(vendor_class_identifier)) | 1631 | >= sizeof(vendor_class_identifier)) |
| 1676 | pr_warn("DHCP: vendorclass too long, truncated to \"%s\"", | 1632 | printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"", |
| 1677 | vendor_class_identifier); | 1633 | vendor_class_identifier); |
| 1678 | return 1; | 1634 | return 1; |
| 1679 | } | 1635 | } |
| 1636 | |||
| 1637 | __setup("ip=", ip_auto_config_setup); | ||
| 1638 | __setup("nfsaddrs=", nfsaddrs_config_setup); | ||
| 1680 | __setup("dhcpclass=", vendor_class_identifier_setup); | 1639 | __setup("dhcpclass=", vendor_class_identifier_setup); |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 191fc24a745..6f06f7f39ea 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
| @@ -120,10 +120,6 @@ | |||
| 120 | #define HASH_SIZE 16 | 120 | #define HASH_SIZE 16 |
| 121 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) | 121 | #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) |
| 122 | 122 | ||
| 123 | static bool log_ecn_error = true; | ||
| 124 | module_param(log_ecn_error, bool, 0644); | ||
| 125 | MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); | ||
| 126 | |||
| 127 | static int ipip_net_id __read_mostly; | 123 | static int ipip_net_id __read_mostly; |
| 128 | struct ipip_net { | 124 | struct ipip_net { |
| 129 | struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; | 125 | struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; |
| @@ -138,43 +134,43 @@ struct ipip_net { | |||
| 138 | static int ipip_tunnel_init(struct net_device *dev); | 134 | static int ipip_tunnel_init(struct net_device *dev); |
| 139 | static void ipip_tunnel_setup(struct net_device *dev); | 135 | static void ipip_tunnel_setup(struct net_device *dev); |
| 140 | static void ipip_dev_free(struct net_device *dev); | 136 | static void ipip_dev_free(struct net_device *dev); |
| 141 | static struct rtnl_link_ops ipip_link_ops __read_mostly; | ||
| 142 | 137 | ||
| 143 | static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev, | 138 | /* |
| 144 | struct rtnl_link_stats64 *tot) | 139 | * Locking : hash tables are protected by RCU and RTNL |
| 140 | */ | ||
| 141 | |||
| 142 | #define for_each_ip_tunnel_rcu(start) \ | ||
| 143 | for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) | ||
| 144 | |||
| 145 | /* often modified stats are per cpu, other are shared (netdev->stats) */ | ||
| 146 | struct pcpu_tstats { | ||
| 147 | unsigned long rx_packets; | ||
| 148 | unsigned long rx_bytes; | ||
| 149 | unsigned long tx_packets; | ||
| 150 | unsigned long tx_bytes; | ||
| 151 | }; | ||
| 152 | |||
| 153 | static struct net_device_stats *ipip_get_stats(struct net_device *dev) | ||
| 145 | { | 154 | { |
| 155 | struct pcpu_tstats sum = { 0 }; | ||
| 146 | int i; | 156 | int i; |
| 147 | 157 | ||
| 148 | for_each_possible_cpu(i) { | 158 | for_each_possible_cpu(i) { |
| 149 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); | 159 | const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); |
| 150 | u64 rx_packets, rx_bytes, tx_packets, tx_bytes; | ||
| 151 | unsigned int start; | ||
| 152 | |||
| 153 | do { | ||
| 154 | start = u64_stats_fetch_begin_bh(&tstats->syncp); | ||
| 155 | rx_packets = tstats->rx_packets; | ||
| 156 | tx_packets = tstats->tx_packets; | ||
| 157 | rx_bytes = tstats->rx_bytes; | ||
| 158 | tx_bytes = tstats->tx_bytes; | ||
| 159 | } while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); | ||
| 160 | |||
| 161 | tot->rx_packets += rx_packets; | ||
| 162 | tot->tx_packets += tx_packets; | ||
| 163 | tot->rx_bytes += rx_bytes; | ||
| 164 | tot->tx_bytes += tx_bytes; | ||
| 165 | } | ||
| 166 | |||
| 167 | tot->tx_fifo_errors = dev->stats.tx_fifo_errors; | ||
| 168 | tot->tx_carrier_errors = dev->stats.tx_carrier_errors; | ||
| 169 | tot->tx_dropped = dev->stats.tx_dropped; | ||
| 170 | tot->tx_aborted_errors = dev->stats.tx_aborted_errors; | ||
| 171 | tot->tx_errors = dev->stats.tx_errors; | ||
| 172 | tot->collisions = dev->stats.collisions; | ||
| 173 | 160 | ||
| 174 | return tot; | 161 | sum.rx_packets += tstats->rx_packets; |
| 162 | sum.rx_bytes += tstats->rx_bytes; | ||
| 163 | sum.tx_packets += tstats->tx_packets; | ||
| 164 | sum.tx_bytes += tstats->tx_bytes; | ||
| 165 | } | ||
| 166 | dev->stats.rx_packets = sum.rx_packets; | ||
| 167 | dev->stats.rx_bytes = sum.rx_bytes; | ||
| 168 | dev->stats.tx_packets = sum.tx_packets; | ||
| 169 | dev->stats.tx_bytes = sum.tx_bytes; | ||
| 170 | return &dev->stats; | ||
| 175 | } | 171 | } |
| 176 | 172 | ||
| 177 | static struct ip_tunnel *ipip_tunnel_lookup(struct net *net, | 173 | static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, |
| 178 | __be32 remote, __be32 local) | 174 | __be32 remote, __be32 local) |
| 179 | { | 175 | { |
| 180 | unsigned int h0 = HASH(remote); | 176 | unsigned int h0 = HASH(remote); |
| @@ -182,16 +178,16 @@ static struct ip_tunnel *ipip_tunnel_lookup(struct net *net, | |||
| 182 | struct ip_tunnel *t; | 178 | struct ip_tunnel *t; |
| 183 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 179 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
| 184 | 180 | ||
| 185 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) | 181 | for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1]) |
| 186 | if (local == t->parms.iph.saddr && | 182 | if (local == t->parms.iph.saddr && |
| 187 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | 183 | remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) |
| 188 | return t; | 184 | return t; |
| 189 | 185 | ||
| 190 | for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) | 186 | for_each_ip_tunnel_rcu(ipn->tunnels_r[h0]) |
| 191 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) | 187 | if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) |
| 192 | return t; | 188 | return t; |
| 193 | 189 | ||
| 194 | for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) | 190 | for_each_ip_tunnel_rcu(ipn->tunnels_l[h1]) |
| 195 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) | 191 | if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) |
| 196 | return t; | 192 | return t; |
| 197 | 193 | ||
| @@ -249,33 +245,7 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) | |||
| 249 | rcu_assign_pointer(*tp, t); | 245 | rcu_assign_pointer(*tp, t); |
| 250 | } | 246 | } |
| 251 | 247 | ||
| 252 | static int ipip_tunnel_create(struct net_device *dev) | 248 | static struct ip_tunnel * ipip_tunnel_locate(struct net *net, |
| 253 | { | ||
| 254 | struct ip_tunnel *t = netdev_priv(dev); | ||
| 255 | struct net *net = dev_net(dev); | ||
| 256 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
| 257 | int err; | ||
| 258 | |||
| 259 | err = ipip_tunnel_init(dev); | ||
| 260 | if (err < 0) | ||
| 261 | goto out; | ||
| 262 | |||
| 263 | err = register_netdevice(dev); | ||
| 264 | if (err < 0) | ||
| 265 | goto out; | ||
| 266 | |||
| 267 | strcpy(t->parms.name, dev->name); | ||
| 268 | dev->rtnl_link_ops = &ipip_link_ops; | ||
| 269 | |||
| 270 | dev_hold(dev); | ||
| 271 | ipip_tunnel_link(ipn, t); | ||
| 272 | return 0; | ||
| 273 | |||
| 274 | out: | ||
| 275 | return err; | ||
| 276 | } | ||
| 277 | |||
| 278 | static struct ip_tunnel *ipip_tunnel_locate(struct net *net, | ||
| 279 | struct ip_tunnel_parm *parms, int create) | 249 | struct ip_tunnel_parm *parms, int create) |
| 280 | { | 250 | { |
| 281 | __be32 remote = parms->iph.daddr; | 251 | __be32 remote = parms->iph.daddr; |
| @@ -309,9 +279,16 @@ static struct ip_tunnel *ipip_tunnel_locate(struct net *net, | |||
| 309 | nt = netdev_priv(dev); | 279 | nt = netdev_priv(dev); |
| 310 | nt->parms = *parms; | 280 | nt->parms = *parms; |
| 311 | 281 | ||
| 312 | if (ipip_tunnel_create(dev) < 0) | 282 | if (ipip_tunnel_init(dev) < 0) |
| 283 | goto failed_free; | ||
| 284 | |||
| 285 | if (register_netdevice(dev) < 0) | ||
| 313 | goto failed_free; | 286 | goto failed_free; |
| 314 | 287 | ||
| 288 | strcpy(nt->parms.name, dev->name); | ||
| 289 | |||
| 290 | dev_hold(dev); | ||
| 291 | ipip_tunnel_link(ipn, nt); | ||
| 315 | return nt; | 292 | return nt; |
| 316 | 293 | ||
| 317 | failed_free: | 294 | failed_free: |
| @@ -326,7 +303,7 @@ static void ipip_tunnel_uninit(struct net_device *dev) | |||
| 326 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | 303 | struct ipip_net *ipn = net_generic(net, ipip_net_id); |
| 327 | 304 | ||
| 328 | if (dev == ipn->fb_tunnel_dev) | 305 | if (dev == ipn->fb_tunnel_dev) |
| 329 | RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL); | 306 | rcu_assign_pointer(ipn->tunnels_wc[0], NULL); |
| 330 | else | 307 | else |
| 331 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); | 308 | ipip_tunnel_unlink(ipn, netdev_priv(dev)); |
| 332 | dev_put(dev); | 309 | dev_put(dev); |
| @@ -356,6 +333,9 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
| 356 | case ICMP_PORT_UNREACH: | 333 | case ICMP_PORT_UNREACH: |
| 357 | /* Impossible event. */ | 334 | /* Impossible event. */ |
| 358 | return 0; | 335 | return 0; |
| 336 | case ICMP_FRAG_NEEDED: | ||
| 337 | /* Soft state for pmtu is maintained by IP core. */ | ||
| 338 | return 0; | ||
| 359 | default: | 339 | default: |
| 360 | /* All others are translated to HOST_UNREACH. | 340 | /* All others are translated to HOST_UNREACH. |
| 361 | rfc2003 contains "deep thoughts" about NET_UNREACH, | 341 | rfc2003 contains "deep thoughts" about NET_UNREACH, |
| @@ -368,30 +348,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
| 368 | if (code != ICMP_EXC_TTL) | 348 | if (code != ICMP_EXC_TTL) |
| 369 | return 0; | 349 | return 0; |
| 370 | break; | 350 | break; |
| 371 | case ICMP_REDIRECT: | ||
| 372 | break; | ||
| 373 | } | 351 | } |
| 374 | 352 | ||
| 375 | err = -ENOENT; | 353 | err = -ENOENT; |
| 376 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); | ||
| 377 | if (t == NULL) | ||
| 378 | goto out; | ||
| 379 | 354 | ||
| 380 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { | 355 | rcu_read_lock(); |
| 381 | ipv4_update_pmtu(skb, dev_net(skb->dev), info, | 356 | t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); |
| 382 | t->dev->ifindex, 0, IPPROTO_IPIP, 0); | 357 | if (t == NULL || t->parms.iph.daddr == 0) |
| 383 | err = 0; | ||
| 384 | goto out; | ||
| 385 | } | ||
| 386 | |||
| 387 | if (type == ICMP_REDIRECT) { | ||
| 388 | ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, | ||
| 389 | IPPROTO_IPIP, 0); | ||
| 390 | err = 0; | ||
| 391 | goto out; | ||
| 392 | } | ||
| 393 | |||
| 394 | if (t->parms.iph.daddr == 0) | ||
| 395 | goto out; | 358 | goto out; |
| 396 | 359 | ||
| 397 | err = 0; | 360 | err = 0; |
| @@ -404,22 +367,34 @@ static int ipip_err(struct sk_buff *skb, u32 info) | |||
| 404 | t->err_count = 1; | 367 | t->err_count = 1; |
| 405 | t->err_time = jiffies; | 368 | t->err_time = jiffies; |
| 406 | out: | 369 | out: |
| 407 | 370 | rcu_read_unlock(); | |
| 408 | return err; | 371 | return err; |
| 409 | } | 372 | } |
| 410 | 373 | ||
| 374 | static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, | ||
| 375 | struct sk_buff *skb) | ||
| 376 | { | ||
| 377 | struct iphdr *inner_iph = ip_hdr(skb); | ||
| 378 | |||
| 379 | if (INET_ECN_is_ce(outer_iph->tos)) | ||
| 380 | IP_ECN_set_ce(inner_iph); | ||
| 381 | } | ||
| 382 | |||
| 411 | static int ipip_rcv(struct sk_buff *skb) | 383 | static int ipip_rcv(struct sk_buff *skb) |
| 412 | { | 384 | { |
| 413 | struct ip_tunnel *tunnel; | 385 | struct ip_tunnel *tunnel; |
| 414 | const struct iphdr *iph = ip_hdr(skb); | 386 | const struct iphdr *iph = ip_hdr(skb); |
| 415 | int err; | ||
| 416 | 387 | ||
| 388 | rcu_read_lock(); | ||
| 417 | tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); | 389 | tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); |
| 418 | if (tunnel != NULL) { | 390 | if (tunnel != NULL) { |
| 419 | struct pcpu_tstats *tstats; | 391 | struct pcpu_tstats *tstats; |
| 420 | 392 | ||
| 421 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) | 393 | if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
| 422 | goto drop; | 394 | rcu_read_unlock(); |
| 395 | kfree_skb(skb); | ||
| 396 | return 0; | ||
| 397 | } | ||
| 423 | 398 | ||
| 424 | secpath_reset(skb); | 399 | secpath_reset(skb); |
| 425 | 400 | ||
| @@ -428,35 +403,22 @@ static int ipip_rcv(struct sk_buff *skb) | |||
| 428 | skb->protocol = htons(ETH_P_IP); | 403 | skb->protocol = htons(ETH_P_IP); |
| 429 | skb->pkt_type = PACKET_HOST; | 404 | skb->pkt_type = PACKET_HOST; |
| 430 | 405 | ||
| 431 | __skb_tunnel_rx(skb, tunnel->dev); | ||
| 432 | |||
| 433 | err = IP_ECN_decapsulate(iph, skb); | ||
| 434 | if (unlikely(err)) { | ||
| 435 | if (log_ecn_error) | ||
| 436 | net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", | ||
| 437 | &iph->saddr, iph->tos); | ||
| 438 | if (err > 1) { | ||
| 439 | ++tunnel->dev->stats.rx_frame_errors; | ||
| 440 | ++tunnel->dev->stats.rx_errors; | ||
| 441 | goto drop; | ||
| 442 | } | ||
| 443 | } | ||
| 444 | |||
| 445 | tstats = this_cpu_ptr(tunnel->dev->tstats); | 406 | tstats = this_cpu_ptr(tunnel->dev->tstats); |
| 446 | u64_stats_update_begin(&tstats->syncp); | ||
| 447 | tstats->rx_packets++; | 407 | tstats->rx_packets++; |
| 448 | tstats->rx_bytes += skb->len; | 408 | tstats->rx_bytes += skb->len; |
| 449 | u64_stats_update_end(&tstats->syncp); | 409 | |
| 410 | __skb_tunnel_rx(skb, tunnel->dev); | ||
| 411 | |||
| 412 | ipip_ecn_decapsulate(iph, skb); | ||
| 450 | 413 | ||
| 451 | netif_rx(skb); | 414 | netif_rx(skb); |
| 415 | |||
| 416 | rcu_read_unlock(); | ||
| 452 | return 0; | 417 | return 0; |
| 453 | } | 418 | } |
| 419 | rcu_read_unlock(); | ||
| 454 | 420 | ||
| 455 | return -1; | 421 | return -1; |
| 456 | |||
| 457 | drop: | ||
| 458 | kfree_skb(skb); | ||
| 459 | return 0; | ||
| 460 | } | 422 | } |
| 461 | 423 | ||
| 462 | /* | 424 | /* |
| @@ -467,6 +429,7 @@ drop: | |||
| 467 | static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | 429 | static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) |
| 468 | { | 430 | { |
| 469 | struct ip_tunnel *tunnel = netdev_priv(dev); | 431 | struct ip_tunnel *tunnel = netdev_priv(dev); |
| 432 | struct pcpu_tstats *tstats; | ||
| 470 | const struct iphdr *tiph = &tunnel->parms.iph; | 433 | const struct iphdr *tiph = &tunnel->parms.iph; |
| 471 | u8 tos = tunnel->parms.iph.tos; | 434 | u8 tos = tunnel->parms.iph.tos; |
| 472 | __be16 df = tiph->frag_off; | 435 | __be16 df = tiph->frag_off; |
| @@ -482,10 +445,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
| 482 | if (skb->protocol != htons(ETH_P_IP)) | 445 | if (skb->protocol != htons(ETH_P_IP)) |
| 483 | goto tx_error; | 446 | goto tx_error; |
| 484 | 447 | ||
| 485 | if (skb->ip_summed == CHECKSUM_PARTIAL && | ||
| 486 | skb_checksum_help(skb)) | ||
| 487 | goto tx_error; | ||
| 488 | |||
| 489 | if (tos & 1) | 448 | if (tos & 1) |
| 490 | tos = old_iph->tos; | 449 | tos = old_iph->tos; |
| 491 | 450 | ||
| @@ -495,7 +454,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
| 495 | dev->stats.tx_fifo_errors++; | 454 | dev->stats.tx_fifo_errors++; |
| 496 | goto tx_error; | 455 | goto tx_error; |
| 497 | } | 456 | } |
| 498 | dst = rt_nexthop(rt, old_iph->daddr); | 457 | if ((dst = rt->rt_gateway) == 0) |
| 458 | goto tx_error_icmp; | ||
| 499 | } | 459 | } |
| 500 | 460 | ||
| 501 | rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, | 461 | rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, |
| @@ -527,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
| 527 | } | 487 | } |
| 528 | 488 | ||
| 529 | if (skb_dst(skb)) | 489 | if (skb_dst(skb)) |
| 530 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); | 490 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); |
| 531 | 491 | ||
| 532 | if ((old_iph->frag_off & htons(IP_DF)) && | 492 | if ((old_iph->frag_off & htons(IP_DF)) && |
| 533 | mtu < ntohs(old_iph->tot_len)) { | 493 | mtu < ntohs(old_iph->tot_len)) { |
| @@ -593,7 +553,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |||
| 593 | if ((iph->ttl = tiph->ttl) == 0) | 553 | if ((iph->ttl = tiph->ttl) == 0) |
| 594 | iph->ttl = old_iph->ttl; | 554 | iph->ttl = old_iph->ttl; |
| 595 | 555 | ||
| 596 | iptunnel_xmit(skb, dev); | 556 | nf_reset(skb); |
| 557 | tstats = this_cpu_ptr(dev->tstats); | ||
| 558 | __IPTUNNEL_XMIT(tstats, &dev->stats); | ||
| 597 | return NETDEV_TX_OK; | 559 | return NETDEV_TX_OK; |
| 598 | 560 | ||
| 599 | tx_error_icmp: | 561 | tx_error_icmp: |
| @@ -640,28 +602,6 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) | |||
| 640 | dev->iflink = tunnel->parms.link; | 602 | dev->iflink = tunnel->parms.link; |
| 641 | } | 603 | } |
| 642 | 604 | ||
| 643 | static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) | ||
| 644 | { | ||
| 645 | struct net *net = dev_net(t->dev); | ||
| 646 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
| 647 | |||
| 648 | ipip_tunnel_unlink(ipn, t); | ||
| 649 | synchronize_net(); | ||
| 650 | t->parms.iph.saddr = p->iph.saddr; | ||
| 651 | t->parms.iph.daddr = p->iph.daddr; | ||
| 652 | memcpy(t->dev->dev_addr, &p->iph.saddr, 4); | ||
| 653 | memcpy(t->dev->broadcast, &p->iph.daddr, 4); | ||
| 654 | ipip_tunnel_link(ipn, t); | ||
| 655 | t->parms.iph.ttl = p->iph.ttl; | ||
| 656 | t->parms.iph.tos = p->iph.tos; | ||
| 657 | t->parms.iph.frag_off = p->iph.frag_off; | ||
| 658 | if (t->parms.link != p->link) { | ||
| 659 | t->parms.link = p->link; | ||
| 660 | ipip_tunnel_bind_dev(t->dev); | ||
| 661 | } | ||
| 662 | netdev_state_change(t->dev); | ||
| 663 | } | ||
| 664 | |||
| 665 | static int | 605 | static int |
| 666 | ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | 606 | ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) |
| 667 | { | 607 | { |
| @@ -691,7 +631,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | |||
| 691 | case SIOCADDTUNNEL: | 631 | case SIOCADDTUNNEL: |
| 692 | case SIOCCHGTUNNEL: | 632 | case SIOCCHGTUNNEL: |
| 693 | err = -EPERM; | 633 | err = -EPERM; |
| 694 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 634 | if (!capable(CAP_NET_ADMIN)) |
| 695 | goto done; | 635 | goto done; |
| 696 | 636 | ||
| 697 | err = -EFAULT; | 637 | err = -EFAULT; |
| @@ -720,13 +660,29 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | |||
| 720 | break; | 660 | break; |
| 721 | } | 661 | } |
| 722 | t = netdev_priv(dev); | 662 | t = netdev_priv(dev); |
| 663 | ipip_tunnel_unlink(ipn, t); | ||
| 664 | synchronize_net(); | ||
| 665 | t->parms.iph.saddr = p.iph.saddr; | ||
| 666 | t->parms.iph.daddr = p.iph.daddr; | ||
| 667 | memcpy(dev->dev_addr, &p.iph.saddr, 4); | ||
| 668 | memcpy(dev->broadcast, &p.iph.daddr, 4); | ||
| 669 | ipip_tunnel_link(ipn, t); | ||
| 670 | netdev_state_change(dev); | ||
| 723 | } | 671 | } |
| 724 | |||
| 725 | ipip_tunnel_update(t, &p); | ||
| 726 | } | 672 | } |
| 727 | 673 | ||
| 728 | if (t) { | 674 | if (t) { |
| 729 | err = 0; | 675 | err = 0; |
| 676 | if (cmd == SIOCCHGTUNNEL) { | ||
| 677 | t->parms.iph.ttl = p.iph.ttl; | ||
| 678 | t->parms.iph.tos = p.iph.tos; | ||
| 679 | t->parms.iph.frag_off = p.iph.frag_off; | ||
| 680 | if (t->parms.link != p.link) { | ||
| 681 | t->parms.link = p.link; | ||
| 682 | ipip_tunnel_bind_dev(dev); | ||
| 683 | netdev_state_change(dev); | ||
| 684 | } | ||
| 685 | } | ||
| 730 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) | 686 | if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) |
| 731 | err = -EFAULT; | 687 | err = -EFAULT; |
| 732 | } else | 688 | } else |
| @@ -735,7 +691,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) | |||
| 735 | 691 | ||
| 736 | case SIOCDELTUNNEL: | 692 | case SIOCDELTUNNEL: |
| 737 | err = -EPERM; | 693 | err = -EPERM; |
| 738 | if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) | 694 | if (!capable(CAP_NET_ADMIN)) |
| 739 | goto done; | 695 | goto done; |
| 740 | 696 | ||
| 741 | if (dev == ipn->fb_tunnel_dev) { | 697 | if (dev == ipn->fb_tunnel_dev) { |
| @@ -775,7 +731,7 @@ static const struct net_device_ops ipip_netdev_ops = { | |||
| 775 | .ndo_start_xmit = ipip_tunnel_xmit, | 731 | .ndo_start_xmit = ipip_tunnel_xmit, |
| 776 | .ndo_do_ioctl = ipip_tunnel_ioctl, | 732 | .ndo_do_ioctl = ipip_tunnel_ioctl, |
| 777 | .ndo_change_mtu = ipip_tunnel_change_mtu, | 733 | .ndo_change_mtu = ipip_tunnel_change_mtu, |
| 778 | .ndo_get_stats64 = ipip_get_stats64, | 734 | .ndo_get_stats = ipip_get_stats, |
| 779 | }; | 735 | }; |
| 780 | 736 | ||
| 781 | static void ipip_dev_free(struct net_device *dev) | 737 | static void ipip_dev_free(struct net_device *dev) |
| @@ -784,11 +740,6 @@ static void ipip_dev_free(struct net_device *dev) | |||
| 784 | free_netdev(dev); | 740 | free_netdev(dev); |
| 785 | } | 741 | } |
| 786 | 742 | ||
| 787 | #define IPIP_FEATURES (NETIF_F_SG | \ | ||
| 788 | NETIF_F_FRAGLIST | \ | ||
| 789 | NETIF_F_HIGHDMA | \ | ||
| 790 | NETIF_F_HW_CSUM) | ||
| 791 | |||
| 792 | static void ipip_tunnel_setup(struct net_device *dev) | 743 | static void ipip_tunnel_setup(struct net_device *dev) |
| 793 | { | 744 | { |
| 794 | dev->netdev_ops = &ipip_netdev_ops; | 745 | dev->netdev_ops = &ipip_netdev_ops; |
| @@ -803,9 +754,6 @@ static void ipip_tunnel_setup(struct net_device *dev) | |||
| 803 | dev->features |= NETIF_F_NETNS_LOCAL; | 754 | dev->features |= NETIF_F_NETNS_LOCAL; |
| 804 | dev->features |= NETIF_F_LLTX; | 755 | dev->features |= NETIF_F_LLTX; |
| 805 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 756 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; |
| 806 | |||
| 807 | dev->features |= IPIP_FEATURES; | ||
| 808 | dev->hw_features |= IPIP_FEATURES; | ||
| 809 | } | 757 | } |
| 810 | 758 | ||
| 811 | static int ipip_tunnel_init(struct net_device *dev) | 759 | static int ipip_tunnel_init(struct net_device *dev) |
| @@ -848,142 +796,6 @@ static int __net_init ipip_fb_tunnel_init(struct net_device *dev) | |||
| 848 | return 0; | 796 | return 0; |
| 849 | } | 797 | } |
| 850 | 798 | ||
| 851 | static void ipip_netlink_parms(struct nlattr *data[], | ||
| 852 | struct ip_tunnel_parm *parms) | ||
| 853 | { | ||
| 854 | memset(parms, 0, sizeof(*parms)); | ||
| 855 | |||
| 856 | parms->iph.version = 4; | ||
| 857 | parms->iph.protocol = IPPROTO_IPIP; | ||
| 858 | parms->iph.ihl = 5; | ||
| 859 | |||
| 860 | if (!data) | ||
| 861 | return; | ||
| 862 | |||
| 863 | if (data[IFLA_IPTUN_LINK]) | ||
| 864 | parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); | ||
| 865 | |||
| 866 | if (data[IFLA_IPTUN_LOCAL]) | ||
| 867 | parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); | ||
| 868 | |||
| 869 | if (data[IFLA_IPTUN_REMOTE]) | ||
| 870 | parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); | ||
| 871 | |||
| 872 | if (data[IFLA_IPTUN_TTL]) { | ||
| 873 | parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); | ||
| 874 | if (parms->iph.ttl) | ||
| 875 | parms->iph.frag_off = htons(IP_DF); | ||
| 876 | } | ||
| 877 | |||
| 878 | if (data[IFLA_IPTUN_TOS]) | ||
| 879 | parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); | ||
| 880 | |||
| 881 | if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) | ||
| 882 | parms->iph.frag_off = htons(IP_DF); | ||
| 883 | } | ||
| 884 | |||
| 885 | static int ipip_newlink(struct net *src_net, struct net_device *dev, | ||
| 886 | struct nlattr *tb[], struct nlattr *data[]) | ||
| 887 | { | ||
| 888 | struct net *net = dev_net(dev); | ||
| 889 | struct ip_tunnel *nt; | ||
| 890 | |||
| 891 | nt = netdev_priv(dev); | ||
| 892 | ipip_netlink_parms(data, &nt->parms); | ||
| 893 | |||
| 894 | if (ipip_tunnel_locate(net, &nt->parms, 0)) | ||
| 895 | return -EEXIST; | ||
| 896 | |||
| 897 | return ipip_tunnel_create(dev); | ||
| 898 | } | ||
| 899 | |||
| 900 | static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], | ||
| 901 | struct nlattr *data[]) | ||
| 902 | { | ||
| 903 | struct ip_tunnel *t; | ||
| 904 | struct ip_tunnel_parm p; | ||
| 905 | struct net *net = dev_net(dev); | ||
| 906 | struct ipip_net *ipn = net_generic(net, ipip_net_id); | ||
| 907 | |||
| 908 | if (dev == ipn->fb_tunnel_dev) | ||
| 909 | return -EINVAL; | ||
| 910 | |||
| 911 | ipip_netlink_parms(data, &p); | ||
| 912 | |||
| 913 | if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || | ||
| 914 | (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) | ||
| 915 | return -EINVAL; | ||
| 916 | |||
| 917 | t = ipip_tunnel_locate(net, &p, 0); | ||
| 918 | |||
| 919 | if (t) { | ||
| 920 | if (t->dev != dev) | ||
| 921 | return -EEXIST; | ||
| 922 | } else | ||
| 923 | t = netdev_priv(dev); | ||
| 924 | |||
| 925 | ipip_tunnel_update(t, &p); | ||
| 926 | return 0; | ||
| 927 | } | ||
| 928 | |||
| 929 | static size_t ipip_get_size(const struct net_device *dev) | ||
| 930 | { | ||
| 931 | return | ||
| 932 | /* IFLA_IPTUN_LINK */ | ||
| 933 | nla_total_size(4) + | ||
| 934 | /* IFLA_IPTUN_LOCAL */ | ||
| 935 | nla_total_size(4) + | ||
| 936 | /* IFLA_IPTUN_REMOTE */ | ||
| 937 | nla_total_size(4) + | ||
| 938 | /* IFLA_IPTUN_TTL */ | ||
| 939 | nla_total_size(1) + | ||
| 940 | /* IFLA_IPTUN_TOS */ | ||
| 941 | nla_total_size(1) + | ||
| 942 | /* IFLA_IPTUN_PMTUDISC */ | ||
| 943 | nla_total_size(1) + | ||
| 944 | 0; | ||
| 945 | } | ||
| 946 | |||
| 947 | static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) | ||
| 948 | { | ||
| 949 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
| 950 | struct ip_tunnel_parm *parm = &tunnel->parms; | ||
| 951 | |||
| 952 | if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || | ||
| 953 | nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || | ||
| 954 | nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || | ||
| 955 | nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || | ||
| 956 | nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || | ||
| 957 | nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, | ||
| 958 | !!(parm->iph.frag_off & htons(IP_DF)))) | ||
| 959 | goto nla_put_failure; | ||
| 960 | return 0; | ||
| 961 | |||
| 962 | nla_put_failure: | ||
| 963 | return -EMSGSIZE; | ||
| 964 | } | ||
| 965 | |||
| 966 | static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { | ||
| 967 | [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, | ||
| 968 | [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, | ||
| 969 | [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, | ||
| 970 | [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, | ||
| 971 | [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, | ||
| 972 | [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, | ||
| 973 | }; | ||
| 974 | |||
| 975 | static struct rtnl_link_ops ipip_link_ops __read_mostly = { | ||
| 976 | .kind = "ipip", | ||
| 977 | .maxtype = IFLA_IPTUN_MAX, | ||
| 978 | .policy = ipip_policy, | ||
| 979 | .priv_size = sizeof(struct ip_tunnel), | ||
| 980 | .setup = ipip_tunnel_setup, | ||
| 981 | .newlink = ipip_newlink, | ||
| 982 | .changelink = ipip_changelink, | ||
| 983 | .get_size = ipip_get_size, | ||
| 984 | .fill_info = ipip_fill_info, | ||
| 985 | }; | ||
| 986 | |||
| 987 | static struct xfrm_tunnel ipip_handler __read_mostly = { | 799 | static struct xfrm_tunnel ipip_handler __read_mostly = { |
| 988 | .handler = ipip_rcv, | 800 | .handler = ipip_rcv, |
| 989 | .err_handler = ipip_err, | 801 | .err_handler = ipip_err, |
| @@ -1080,28 +892,16 @@ static int __init ipip_init(void) | |||
| 1080 | return err; | 892 | return err; |
| 1081 | err = xfrm4_tunnel_register(&ipip_handler, AF_INET); | 893 | err = xfrm4_tunnel_register(&ipip_handler, AF_INET); |
| 1082 | if (err < 0) { | 894 | if (err < 0) { |
| 1083 | pr_info("%s: can't register tunnel\n", __func__); | 895 | unregister_pernet_device(&ipip_net_ops); |
| 1084 | goto xfrm_tunnel_failed; | 896 | printk(KERN_INFO "ipip init: can't register tunnel\n"); |
| 1085 | } | 897 | } |
| 1086 | err = rtnl_link_register(&ipip_link_ops); | ||
| 1087 | if (err < 0) | ||
| 1088 | goto rtnl_link_failed; | ||
| 1089 | |||
| 1090 | out: | ||
| 1091 | return err; | 898 | return err; |
| 1092 | |||
| 1093 | rtnl_link_failed: | ||
| 1094 | xfrm4_tunnel_deregister(&ipip_handler, AF_INET); | ||
| 1095 | xfrm_tunnel_failed: | ||
| 1096 | unregister_pernet_device(&ipip_net_ops); | ||
| 1097 | goto out; | ||
| 1098 | } | 899 | } |
| 1099 | 900 | ||
| 1100 | static void __exit ipip_fini(void) | 901 | static void __exit ipip_fini(void) |
| 1101 | { | 902 | { |
| 1102 | rtnl_link_unregister(&ipip_link_ops); | ||
| 1103 | if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) | 903 | if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) |
| 1104 | pr_info("%s: can't deregister tunnel\n", __func__); | 904 | printk(KERN_INFO "ipip close: can't deregister tunnel\n"); |
| 1105 | 905 | ||
| 1106 | unregister_pernet_device(&ipip_net_ops); | 906 | unregister_pernet_device(&ipip_net_ops); |
| 1107 | } | 907 | } |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a9454cbd953..58e87915797 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
| @@ -26,6 +26,7 @@ | |||
| 26 | * | 26 | * |
| 27 | */ | 27 | */ |
| 28 | 28 | ||
| 29 | #include <asm/system.h> | ||
| 29 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
| 30 | #include <linux/types.h> | 31 | #include <linux/types.h> |
| 31 | #include <linux/capability.h> | 32 | #include <linux/capability.h> |
| @@ -60,12 +61,10 @@ | |||
| 60 | #include <linux/if_arp.h> | 61 | #include <linux/if_arp.h> |
| 61 | #include <linux/netfilter_ipv4.h> | 62 | #include <linux/netfilter_ipv4.h> |
| 62 | #include <linux/compat.h> | 63 | #include <linux/compat.h> |
| 63 | #include <linux/export.h> | ||
| 64 | #include <net/ipip.h> | 64 | #include <net/ipip.h> |
| 65 | #include <net/checksum.h> | 65 | #include <net/checksum.h> |
| 66 | #include <net/netlink.h> | 66 | #include <net/netlink.h> |
| 67 | #include <net/fib_rules.h> | 67 | #include <net/fib_rules.h> |
| 68 | #include <linux/netconf.h> | ||
| 69 | 68 | ||
| 70 | #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) | 69 | #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) |
| 71 | #define CONFIG_IP_PIMSM 1 | 70 | #define CONFIG_IP_PIMSM 1 |
| @@ -84,8 +83,8 @@ struct mr_table { | |||
| 84 | struct vif_device vif_table[MAXVIFS]; | 83 | struct vif_device vif_table[MAXVIFS]; |
| 85 | int maxvif; | 84 | int maxvif; |
| 86 | atomic_t cache_resolve_queue_len; | 85 | atomic_t cache_resolve_queue_len; |
| 87 | bool mroute_do_assert; | 86 | int mroute_do_assert; |
| 88 | bool mroute_do_pim; | 87 | int mroute_do_pim; |
| 89 | #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) | 88 | #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) |
| 90 | int mroute_reg_vif_num; | 89 | int mroute_reg_vif_num; |
| 91 | #endif | 90 | #endif |
| @@ -125,8 +124,6 @@ static DEFINE_SPINLOCK(mfc_unres_lock); | |||
| 125 | static struct kmem_cache *mrt_cachep __read_mostly; | 124 | static struct kmem_cache *mrt_cachep __read_mostly; |
| 126 | 125 | ||
| 127 | static struct mr_table *ipmr_new_table(struct net *net, u32 id); | 126 | static struct mr_table *ipmr_new_table(struct net *net, u32 id); |
| 128 | static void ipmr_free_table(struct mr_table *mrt); | ||
| 129 | |||
| 130 | static int ip_mr_forward(struct net *net, struct mr_table *mrt, | 127 | static int ip_mr_forward(struct net *net, struct mr_table *mrt, |
| 131 | struct sk_buff *skb, struct mfc_cache *cache, | 128 | struct sk_buff *skb, struct mfc_cache *cache, |
| 132 | int local); | 129 | int local); |
| @@ -134,9 +131,6 @@ static int ipmr_cache_report(struct mr_table *mrt, | |||
| 134 | struct sk_buff *pkt, vifi_t vifi, int assert); | 131 | struct sk_buff *pkt, vifi_t vifi, int assert); |
| 135 | static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | 132 | static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, |
| 136 | struct mfc_cache *c, struct rtmsg *rtm); | 133 | struct mfc_cache *c, struct rtmsg *rtm); |
| 137 | static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, | ||
| 138 | int cmd); | ||
| 139 | static void mroute_clean_tables(struct mr_table *mrt); | ||
| 140 | static void ipmr_expire_process(unsigned long arg); | 134 | static void ipmr_expire_process(unsigned long arg); |
| 141 | 135 | ||
| 142 | #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES | 136 | #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES |
| @@ -224,7 +218,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, | |||
| 224 | return 0; | 218 | return 0; |
| 225 | } | 219 | } |
| 226 | 220 | ||
| 227 | static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { | 221 | static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { |
| 228 | .family = RTNL_FAMILY_IPMR, | 222 | .family = RTNL_FAMILY_IPMR, |
| 229 | .rule_size = sizeof(struct ipmr_rule), | 223 | .rule_size = sizeof(struct ipmr_rule), |
| 230 | .addr_size = sizeof(u32), | 224 | .addr_size = sizeof(u32), |
| @@ -277,7 +271,7 @@ static void __net_exit ipmr_rules_exit(struct net *net) | |||
| 277 | 271 | ||
| 278 | list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { | 272 | list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { |
| 279 | list_del(&mrt->list); | 273 | list_del(&mrt->list); |
| 280 | ipmr_free_table(mrt); | 274 | kfree(mrt); |
| 281 | } | 275 | } |
| 282 | fib_rules_unregister(net->ipv4.mr_rules_ops); | 276 | fib_rules_unregister(net->ipv4.mr_rules_ops); |
| 283 | } | 277 | } |
| @@ -305,7 +299,7 @@ static int __net_init ipmr_rules_init(struct net *net) | |||
| 305 | 299 | ||
| 306 | static void __net_exit ipmr_rules_exit(struct net *net) | 300 | static void __net_exit ipmr_rules_exit(struct net *net) |
| 307 | { | 301 | { |
| 308 | ipmr_free_table(net->ipv4.mrt); | 302 | kfree(net->ipv4.mrt); |
| 309 | } | 303 | } |
| 310 | #endif | 304 | #endif |
| 311 | 305 | ||
| @@ -342,13 +336,6 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) | |||
| 342 | return mrt; | 336 | return mrt; |
| 343 | } | 337 | } |
| 344 | 338 | ||
| 345 | static void ipmr_free_table(struct mr_table *mrt) | ||
| 346 | { | ||
| 347 | del_timer_sync(&mrt->ipmr_expire_timer); | ||
| 348 | mroute_clean_tables(mrt); | ||
| 349 | kfree(mrt); | ||
| 350 | } | ||
| 351 | |||
| 352 | /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ | 339 | /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ |
| 353 | 340 | ||
| 354 | static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) | 341 | static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) |
| @@ -537,8 +524,8 @@ failure: | |||
| 537 | } | 524 | } |
| 538 | #endif | 525 | #endif |
| 539 | 526 | ||
| 540 | /** | 527 | /* |
| 541 | * vif_delete - Delete a VIF entry | 528 | * Delete a VIF entry |
| 542 | * @notify: Set to 1, if the caller is a notifier_call | 529 | * @notify: Set to 1, if the caller is a notifier_call |
| 543 | */ | 530 | */ |
| 544 | 531 | ||
| @@ -585,9 +572,6 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, | |||
| 585 | in_dev = __in_dev_get_rtnl(dev); | 572 | in_dev = __in_dev_get_rtnl(dev); |
| 586 | if (in_dev) { | 573 | if (in_dev) { |
| 587 | IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; | 574 | IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; |
| 588 | inet_netconf_notify_devconf(dev_net(dev), | ||
| 589 | NETCONFA_MC_FORWARDING, | ||
| 590 | dev->ifindex, &in_dev->cnf); | ||
| 591 | ip_rt_multicast_event(in_dev); | 575 | ip_rt_multicast_event(in_dev); |
| 592 | } | 576 | } |
| 593 | 577 | ||
| @@ -632,7 +616,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) | |||
| 632 | e->error = -ETIMEDOUT; | 616 | e->error = -ETIMEDOUT; |
| 633 | memset(&e->msg, 0, sizeof(e->msg)); | 617 | memset(&e->msg, 0, sizeof(e->msg)); |
| 634 | 618 | ||
| 635 | rtnl_unicast(skb, net, NETLINK_CB(skb).portid); | 619 | rtnl_unicast(skb, net, NETLINK_CB(skb).pid); |
| 636 | } else { | 620 | } else { |
| 637 | kfree_skb(skb); | 621 | kfree_skb(skb); |
| 638 | } | 622 | } |
| @@ -671,7 +655,6 @@ static void ipmr_expire_process(unsigned long arg) | |||
| 671 | } | 655 | } |
| 672 | 656 | ||
| 673 | list_del(&c->list); | 657 | list_del(&c->list); |
| 674 | mroute_netlink_event(mrt, c, RTM_DELROUTE); | ||
| 675 | ipmr_destroy_unres(mrt, c); | 658 | ipmr_destroy_unres(mrt, c); |
| 676 | } | 659 | } |
| 677 | 660 | ||
| @@ -779,8 +762,6 @@ static int vif_add(struct net *net, struct mr_table *mrt, | |||
| 779 | return -EADDRNOTAVAIL; | 762 | return -EADDRNOTAVAIL; |
| 780 | } | 763 | } |
| 781 | IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; | 764 | IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; |
| 782 | inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex, | ||
| 783 | &in_dev->cnf); | ||
| 784 | ip_rt_multicast_event(in_dev); | 765 | ip_rt_multicast_event(in_dev); |
| 785 | 766 | ||
| 786 | /* Fill in the VIF structures */ | 767 | /* Fill in the VIF structures */ |
| @@ -879,7 +860,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, | |||
| 879 | memset(&e->msg, 0, sizeof(e->msg)); | 860 | memset(&e->msg, 0, sizeof(e->msg)); |
| 880 | } | 861 | } |
| 881 | 862 | ||
| 882 | rtnl_unicast(skb, net, NETLINK_CB(skb).portid); | 863 | rtnl_unicast(skb, net, NETLINK_CB(skb).pid); |
| 883 | } else { | 864 | } else { |
| 884 | ip_mr_forward(net, mrt, skb, c, 0); | 865 | ip_mr_forward(net, mrt, skb, c, 0); |
| 885 | } | 866 | } |
| @@ -968,7 +949,8 @@ static int ipmr_cache_report(struct mr_table *mrt, | |||
| 968 | ret = sock_queue_rcv_skb(mroute_sk, skb); | 949 | ret = sock_queue_rcv_skb(mroute_sk, skb); |
| 969 | rcu_read_unlock(); | 950 | rcu_read_unlock(); |
| 970 | if (ret < 0) { | 951 | if (ret < 0) { |
| 971 | net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); | 952 | if (net_ratelimit()) |
| 953 | printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); | ||
| 972 | kfree_skb(skb); | 954 | kfree_skb(skb); |
| 973 | } | 955 | } |
| 974 | 956 | ||
| @@ -1029,7 +1011,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) | |||
| 1029 | 1011 | ||
| 1030 | atomic_inc(&mrt->cache_resolve_queue_len); | 1012 | atomic_inc(&mrt->cache_resolve_queue_len); |
| 1031 | list_add(&c->list, &mrt->mfc_unres_queue); | 1013 | list_add(&c->list, &mrt->mfc_unres_queue); |
| 1032 | mroute_netlink_event(mrt, c, RTM_NEWROUTE); | ||
| 1033 | 1014 | ||
| 1034 | if (atomic_read(&mrt->cache_resolve_queue_len) == 1) | 1015 | if (atomic_read(&mrt->cache_resolve_queue_len) == 1) |
| 1035 | mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); | 1016 | mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); |
| @@ -1064,7 +1045,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) | |||
| 1064 | if (c->mfc_origin == mfc->mfcc_origin.s_addr && | 1045 | if (c->mfc_origin == mfc->mfcc_origin.s_addr && |
| 1065 | c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { | 1046 | c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { |
| 1066 | list_del_rcu(&c->list); | 1047 | list_del_rcu(&c->list); |
| 1067 | mroute_netlink_event(mrt, c, RTM_DELROUTE); | 1048 | |
| 1068 | ipmr_cache_free(c); | 1049 | ipmr_cache_free(c); |
| 1069 | return 0; | 1050 | return 0; |
| 1070 | } | 1051 | } |
| @@ -1099,7 +1080,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, | |||
| 1099 | if (!mrtsock) | 1080 | if (!mrtsock) |
| 1100 | c->mfc_flags |= MFC_STATIC; | 1081 | c->mfc_flags |= MFC_STATIC; |
| 1101 | write_unlock_bh(&mrt_lock); | 1082 | write_unlock_bh(&mrt_lock); |
| 1102 | mroute_netlink_event(mrt, c, RTM_NEWROUTE); | ||
| 1103 | return 0; | 1083 | return 0; |
| 1104 | } | 1084 | } |
| 1105 | 1085 | ||
| @@ -1142,7 +1122,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, | |||
| 1142 | ipmr_cache_resolve(net, mrt, uc, c); | 1122 | ipmr_cache_resolve(net, mrt, uc, c); |
| 1143 | ipmr_cache_free(uc); | 1123 | ipmr_cache_free(uc); |
| 1144 | } | 1124 | } |
| 1145 | mroute_netlink_event(mrt, c, RTM_NEWROUTE); | ||
| 1146 | return 0; | 1125 | return 0; |
| 1147 | } | 1126 | } |
| 1148 | 1127 | ||
| @@ -1171,7 +1150,6 @@ static void mroute_clean_tables(struct mr_table *mrt) | |||
| 1171 | if (c->mfc_flags & MFC_STATIC) | 1150 | if (c->mfc_flags & MFC_STATIC) |
| 1172 | continue; | 1151 | continue; |
| 1173 | list_del_rcu(&c->list); | 1152 | list_del_rcu(&c->list); |
| 1174 | mroute_netlink_event(mrt, c, RTM_DELROUTE); | ||
| 1175 | ipmr_cache_free(c); | 1153 | ipmr_cache_free(c); |
| 1176 | } | 1154 | } |
| 1177 | } | 1155 | } |
| @@ -1180,7 +1158,6 @@ static void mroute_clean_tables(struct mr_table *mrt) | |||
| 1180 | spin_lock_bh(&mfc_unres_lock); | 1158 | spin_lock_bh(&mfc_unres_lock); |
| 1181 | list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { | 1159 | list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { |
| 1182 | list_del(&c->list); | 1160 | list_del(&c->list); |
| 1183 | mroute_netlink_event(mrt, c, RTM_DELROUTE); | ||
| 1184 | ipmr_destroy_unres(mrt, c); | 1161 | ipmr_destroy_unres(mrt, c); |
| 1185 | } | 1162 | } |
| 1186 | spin_unlock_bh(&mfc_unres_lock); | 1163 | spin_unlock_bh(&mfc_unres_lock); |
| @@ -1199,10 +1176,7 @@ static void mrtsock_destruct(struct sock *sk) | |||
| 1199 | ipmr_for_each_table(mrt, net) { | 1176 | ipmr_for_each_table(mrt, net) { |
| 1200 | if (sk == rtnl_dereference(mrt->mroute_sk)) { | 1177 | if (sk == rtnl_dereference(mrt->mroute_sk)) { |
| 1201 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; | 1178 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; |
| 1202 | inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, | 1179 | rcu_assign_pointer(mrt->mroute_sk, NULL); |
| 1203 | NETCONFA_IFINDEX_ALL, | ||
| 1204 | net->ipv4.devconf_all); | ||
| 1205 | RCU_INIT_POINTER(mrt->mroute_sk, NULL); | ||
| 1206 | mroute_clean_tables(mrt); | 1180 | mroute_clean_tables(mrt); |
| 1207 | } | 1181 | } |
| 1208 | } | 1182 | } |
| @@ -1224,24 +1198,23 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
| 1224 | struct net *net = sock_net(sk); | 1198 | struct net *net = sock_net(sk); |
| 1225 | struct mr_table *mrt; | 1199 | struct mr_table *mrt; |
| 1226 | 1200 | ||
| 1227 | if (sk->sk_type != SOCK_RAW || | ||
| 1228 | inet_sk(sk)->inet_num != IPPROTO_IGMP) | ||
| 1229 | return -EOPNOTSUPP; | ||
| 1230 | |||
| 1231 | mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); | 1201 | mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); |
| 1232 | if (mrt == NULL) | 1202 | if (mrt == NULL) |
| 1233 | return -ENOENT; | 1203 | return -ENOENT; |
| 1234 | 1204 | ||
| 1235 | if (optname != MRT_INIT) { | 1205 | if (optname != MRT_INIT) { |
| 1236 | if (sk != rcu_access_pointer(mrt->mroute_sk) && | 1206 | if (sk != rcu_dereference_raw(mrt->mroute_sk) && |
| 1237 | !ns_capable(net->user_ns, CAP_NET_ADMIN)) | 1207 | !capable(CAP_NET_ADMIN)) |
| 1238 | return -EACCES; | 1208 | return -EACCES; |
| 1239 | } | 1209 | } |
| 1240 | 1210 | ||
| 1241 | switch (optname) { | 1211 | switch (optname) { |
| 1242 | case MRT_INIT: | 1212 | case MRT_INIT: |
| 1213 | if (sk->sk_type != SOCK_RAW || | ||
| 1214 | inet_sk(sk)->inet_num != IPPROTO_IGMP) | ||
| 1215 | return -EOPNOTSUPP; | ||
| 1243 | if (optlen != sizeof(int)) | 1216 | if (optlen != sizeof(int)) |
| 1244 | return -EINVAL; | 1217 | return -ENOPROTOOPT; |
| 1245 | 1218 | ||
| 1246 | rtnl_lock(); | 1219 | rtnl_lock(); |
| 1247 | if (rtnl_dereference(mrt->mroute_sk)) { | 1220 | if (rtnl_dereference(mrt->mroute_sk)) { |
| @@ -1253,14 +1226,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
| 1253 | if (ret == 0) { | 1226 | if (ret == 0) { |
| 1254 | rcu_assign_pointer(mrt->mroute_sk, sk); | 1227 | rcu_assign_pointer(mrt->mroute_sk, sk); |
| 1255 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; | 1228 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; |
| 1256 | inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, | ||
| 1257 | NETCONFA_IFINDEX_ALL, | ||
| 1258 | net->ipv4.devconf_all); | ||
| 1259 | } | 1229 | } |
| 1260 | rtnl_unlock(); | 1230 | rtnl_unlock(); |
| 1261 | return ret; | 1231 | return ret; |
| 1262 | case MRT_DONE: | 1232 | case MRT_DONE: |
| 1263 | if (sk != rcu_access_pointer(mrt->mroute_sk)) | 1233 | if (sk != rcu_dereference_raw(mrt->mroute_sk)) |
| 1264 | return -EACCES; | 1234 | return -EACCES; |
| 1265 | return ip_ra_control(sk, 0, NULL); | 1235 | return ip_ra_control(sk, 0, NULL); |
| 1266 | case MRT_ADD_VIF: | 1236 | case MRT_ADD_VIF: |
| @@ -1305,11 +1275,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
| 1305 | case MRT_ASSERT: | 1275 | case MRT_ASSERT: |
| 1306 | { | 1276 | { |
| 1307 | int v; | 1277 | int v; |
| 1308 | if (optlen != sizeof(v)) | ||
| 1309 | return -EINVAL; | ||
| 1310 | if (get_user(v, (int __user *)optval)) | 1278 | if (get_user(v, (int __user *)optval)) |
| 1311 | return -EFAULT; | 1279 | return -EFAULT; |
| 1312 | mrt->mroute_do_assert = v; | 1280 | mrt->mroute_do_assert = (v) ? 1 : 0; |
| 1313 | return 0; | 1281 | return 0; |
| 1314 | } | 1282 | } |
| 1315 | #ifdef CONFIG_IP_PIMSM | 1283 | #ifdef CONFIG_IP_PIMSM |
| @@ -1317,11 +1285,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
| 1317 | { | 1285 | { |
| 1318 | int v; | 1286 | int v; |
| 1319 | 1287 | ||
| 1320 | if (optlen != sizeof(v)) | ||
| 1321 | return -EINVAL; | ||
| 1322 | if (get_user(v, (int __user *)optval)) | 1288 | if (get_user(v, (int __user *)optval)) |
| 1323 | return -EFAULT; | 1289 | return -EFAULT; |
| 1324 | v = !!v; | 1290 | v = (v) ? 1 : 0; |
| 1325 | 1291 | ||
| 1326 | rtnl_lock(); | 1292 | rtnl_lock(); |
| 1327 | ret = 0; | 1293 | ret = 0; |
| @@ -1343,10 +1309,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
| 1343 | if (get_user(v, (u32 __user *)optval)) | 1309 | if (get_user(v, (u32 __user *)optval)) |
| 1344 | return -EFAULT; | 1310 | return -EFAULT; |
| 1345 | 1311 | ||
| 1346 | /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ | ||
| 1347 | if (v != RT_TABLE_DEFAULT && v >= 1000000000) | ||
| 1348 | return -EINVAL; | ||
| 1349 | |||
| 1350 | rtnl_lock(); | 1312 | rtnl_lock(); |
| 1351 | ret = 0; | 1313 | ret = 0; |
| 1352 | if (sk == rtnl_dereference(mrt->mroute_sk)) { | 1314 | if (sk == rtnl_dereference(mrt->mroute_sk)) { |
| @@ -1354,8 +1316,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi | |||
| 1354 | } else { | 1316 | } else { |
| 1355 | if (!ipmr_new_table(net, v)) | 1317 | if (!ipmr_new_table(net, v)) |
| 1356 | ret = -ENOMEM; | 1318 | ret = -ENOMEM; |
| 1357 | else | 1319 | raw_sk(sk)->ipmr_table = v; |
| 1358 | raw_sk(sk)->ipmr_table = v; | ||
| 1359 | } | 1320 | } |
| 1360 | rtnl_unlock(); | 1321 | rtnl_unlock(); |
| 1361 | return ret; | 1322 | return ret; |
| @@ -1381,10 +1342,6 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int | |||
| 1381 | struct net *net = sock_net(sk); | 1342 | struct net *net = sock_net(sk); |
| 1382 | struct mr_table *mrt; | 1343 | struct mr_table *mrt; |
| 1383 | 1344 | ||
| 1384 | if (sk->sk_type != SOCK_RAW || | ||
| 1385 | inet_sk(sk)->inet_num != IPPROTO_IGMP) | ||
| 1386 | return -EOPNOTSUPP; | ||
| 1387 | |||
| 1388 | mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); | 1345 | mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); |
| 1389 | if (mrt == NULL) | 1346 | if (mrt == NULL) |
| 1390 | return -ENOENT; | 1347 | return -ENOENT; |
| @@ -1562,6 +1519,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v | |||
| 1562 | struct mr_table *mrt; | 1519 | struct mr_table *mrt; |
| 1563 | struct vif_device *v; | 1520 | struct vif_device *v; |
| 1564 | int ct; | 1521 | int ct; |
| 1522 | LIST_HEAD(list); | ||
| 1565 | 1523 | ||
| 1566 | if (event != NETDEV_UNREGISTER) | 1524 | if (event != NETDEV_UNREGISTER) |
| 1567 | return NOTIFY_DONE; | 1525 | return NOTIFY_DONE; |
| @@ -1570,9 +1528,10 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v | |||
| 1570 | v = &mrt->vif_table[0]; | 1528 | v = &mrt->vif_table[0]; |
| 1571 | for (ct = 0; ct < mrt->maxvif; ct++, v++) { | 1529 | for (ct = 0; ct < mrt->maxvif; ct++, v++) { |
| 1572 | if (v->dev == dev) | 1530 | if (v->dev == dev) |
| 1573 | vif_delete(mrt, ct, 1, NULL); | 1531 | vif_delete(mrt, ct, 1, &list); |
| 1574 | } | 1532 | } |
| 1575 | } | 1533 | } |
| 1534 | unregister_netdevice_many(&list); | ||
| 1576 | return NOTIFY_DONE; | 1535 | return NOTIFY_DONE; |
| 1577 | } | 1536 | } |
| 1578 | 1537 | ||
| @@ -1618,7 +1577,6 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) | |||
| 1618 | struct ip_options *opt = &(IPCB(skb)->opt); | 1577 | struct ip_options *opt = &(IPCB(skb)->opt); |
| 1619 | 1578 | ||
| 1620 | IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); | 1579 | IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); |
| 1621 | IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); | ||
| 1622 | 1580 | ||
| 1623 | if (unlikely(opt->optlen)) | 1581 | if (unlikely(opt->optlen)) |
| 1624 | ip_forward_options(skb); | 1582 | ip_forward_options(skb); |
| @@ -1839,12 +1797,9 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) | |||
| 1839 | .daddr = iph->daddr, | 1797 | .daddr = iph->daddr, |
| 1840 | .saddr = iph->saddr, | 1798 | .saddr = iph->saddr, |
| 1841 | .flowi4_tos = RT_TOS(iph->tos), | 1799 | .flowi4_tos = RT_TOS(iph->tos), |
| 1842 | .flowi4_oif = (rt_is_output_route(rt) ? | 1800 | .flowi4_oif = rt->rt_oif, |
| 1843 | skb->dev->ifindex : 0), | 1801 | .flowi4_iif = rt->rt_iif, |
| 1844 | .flowi4_iif = (rt_is_output_route(rt) ? | 1802 | .flowi4_mark = rt->rt_mark, |
| 1845 | LOOPBACK_IFINDEX : | ||
| 1846 | skb->dev->ifindex), | ||
| 1847 | .flowi4_mark = skb->mark, | ||
| 1848 | }; | 1803 | }; |
| 1849 | struct mr_table *mrt; | 1804 | struct mr_table *mrt; |
| 1850 | int err; | 1805 | int err; |
| @@ -2053,44 +2008,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | |||
| 2053 | { | 2008 | { |
| 2054 | int ct; | 2009 | int ct; |
| 2055 | struct rtnexthop *nhp; | 2010 | struct rtnexthop *nhp; |
| 2056 | struct nlattr *mp_attr; | 2011 | u8 *b = skb_tail_pointer(skb); |
| 2057 | struct rta_mfc_stats mfcs; | 2012 | struct rtattr *mp_head; |
| 2058 | 2013 | ||
| 2059 | /* If cache is unresolved, don't try to parse IIF and OIF */ | 2014 | /* If cache is unresolved, don't try to parse IIF and OIF */ |
| 2060 | if (c->mfc_parent >= MAXVIFS) | 2015 | if (c->mfc_parent >= MAXVIFS) |
| 2061 | return -ENOENT; | 2016 | return -ENOENT; |
| 2062 | 2017 | ||
| 2063 | if (VIF_EXISTS(mrt, c->mfc_parent) && | 2018 | if (VIF_EXISTS(mrt, c->mfc_parent)) |
| 2064 | nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) | 2019 | RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); |
| 2065 | return -EMSGSIZE; | ||
| 2066 | 2020 | ||
| 2067 | if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) | 2021 | mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); |
| 2068 | return -EMSGSIZE; | ||
| 2069 | 2022 | ||
| 2070 | for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { | 2023 | for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { |
| 2071 | if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { | 2024 | if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { |
| 2072 | if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { | 2025 | if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) |
| 2073 | nla_nest_cancel(skb, mp_attr); | 2026 | goto rtattr_failure; |
| 2074 | return -EMSGSIZE; | 2027 | nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); |
| 2075 | } | ||
| 2076 | |||
| 2077 | nhp->rtnh_flags = 0; | 2028 | nhp->rtnh_flags = 0; |
| 2078 | nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; | 2029 | nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; |
| 2079 | nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; | 2030 | nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; |
| 2080 | nhp->rtnh_len = sizeof(*nhp); | 2031 | nhp->rtnh_len = sizeof(*nhp); |
| 2081 | } | 2032 | } |
| 2082 | } | 2033 | } |
| 2083 | 2034 | mp_head->rta_type = RTA_MULTIPATH; | |
| 2084 | nla_nest_end(skb, mp_attr); | 2035 | mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; |
| 2085 | |||
| 2086 | mfcs.mfcs_packets = c->mfc_un.res.pkt; | ||
| 2087 | mfcs.mfcs_bytes = c->mfc_un.res.bytes; | ||
| 2088 | mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; | ||
| 2089 | if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0) | ||
| 2090 | return -EMSGSIZE; | ||
| 2091 | |||
| 2092 | rtm->rtm_type = RTN_MULTICAST; | 2036 | rtm->rtm_type = RTN_MULTICAST; |
| 2093 | return 1; | 2037 | return 1; |
| 2038 | |||
| 2039 | rtattr_failure: | ||
| 2040 | nlmsg_trim(skb, b); | ||
| 2041 | return -EMSGSIZE; | ||
| 2094 | } | 2042 | } |
| 2095 | 2043 | ||
| 2096 | int ipmr_get_route(struct net *net, struct sk_buff *skb, | 2044 | int ipmr_get_route(struct net *net, struct sk_buff *skb, |
| @@ -2158,13 +2106,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, | |||
| 2158 | } | 2106 | } |
| 2159 | 2107 | ||
| 2160 | static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | 2108 | static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, |
| 2161 | u32 portid, u32 seq, struct mfc_cache *c, int cmd) | 2109 | u32 pid, u32 seq, struct mfc_cache *c) |
| 2162 | { | 2110 | { |
| 2163 | struct nlmsghdr *nlh; | 2111 | struct nlmsghdr *nlh; |
| 2164 | struct rtmsg *rtm; | 2112 | struct rtmsg *rtm; |
| 2165 | int err; | ||
| 2166 | 2113 | ||
| 2167 | nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI); | 2114 | nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); |
| 2168 | if (nlh == NULL) | 2115 | if (nlh == NULL) |
| 2169 | return -EMSGSIZE; | 2116 | return -EMSGSIZE; |
| 2170 | 2117 | ||
| @@ -2174,22 +2121,16 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | |||
| 2174 | rtm->rtm_src_len = 32; | 2121 | rtm->rtm_src_len = 32; |
| 2175 | rtm->rtm_tos = 0; | 2122 | rtm->rtm_tos = 0; |
| 2176 | rtm->rtm_table = mrt->id; | 2123 | rtm->rtm_table = mrt->id; |
| 2177 | if (nla_put_u32(skb, RTA_TABLE, mrt->id)) | 2124 | NLA_PUT_U32(skb, RTA_TABLE, mrt->id); |
| 2178 | goto nla_put_failure; | ||
| 2179 | rtm->rtm_type = RTN_MULTICAST; | 2125 | rtm->rtm_type = RTN_MULTICAST; |
| 2180 | rtm->rtm_scope = RT_SCOPE_UNIVERSE; | 2126 | rtm->rtm_scope = RT_SCOPE_UNIVERSE; |
| 2181 | if (c->mfc_flags & MFC_STATIC) | 2127 | rtm->rtm_protocol = RTPROT_UNSPEC; |
| 2182 | rtm->rtm_protocol = RTPROT_STATIC; | ||
| 2183 | else | ||
| 2184 | rtm->rtm_protocol = RTPROT_MROUTED; | ||
| 2185 | rtm->rtm_flags = 0; | 2128 | rtm->rtm_flags = 0; |
| 2186 | 2129 | ||
| 2187 | if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || | 2130 | NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin); |
| 2188 | nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) | 2131 | NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp); |
| 2189 | goto nla_put_failure; | 2132 | |
| 2190 | err = __ipmr_fill_mroute(mrt, skb, c, rtm); | 2133 | if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0) |
| 2191 | /* do not break the dump if cache is unresolved */ | ||
| 2192 | if (err < 0 && err != -ENOENT) | ||
| 2193 | goto nla_put_failure; | 2134 | goto nla_put_failure; |
| 2194 | 2135 | ||
| 2195 | return nlmsg_end(skb, nlh); | 2136 | return nlmsg_end(skb, nlh); |
| @@ -2199,52 +2140,6 @@ nla_put_failure: | |||
| 2199 | return -EMSGSIZE; | 2140 | return -EMSGSIZE; |
| 2200 | } | 2141 | } |
| 2201 | 2142 | ||
| 2202 | static size_t mroute_msgsize(bool unresolved, int maxvif) | ||
| 2203 | { | ||
| 2204 | size_t len = | ||
| 2205 | NLMSG_ALIGN(sizeof(struct rtmsg)) | ||
| 2206 | + nla_total_size(4) /* RTA_TABLE */ | ||
| 2207 | + nla_total_size(4) /* RTA_SRC */ | ||
| 2208 | + nla_total_size(4) /* RTA_DST */ | ||
| 2209 | ; | ||
| 2210 | |||
| 2211 | if (!unresolved) | ||
| 2212 | len = len | ||
| 2213 | + nla_total_size(4) /* RTA_IIF */ | ||
| 2214 | + nla_total_size(0) /* RTA_MULTIPATH */ | ||
| 2215 | + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) | ||
| 2216 | /* RTA_MFC_STATS */ | ||
| 2217 | + nla_total_size(sizeof(struct rta_mfc_stats)) | ||
| 2218 | ; | ||
| 2219 | |||
| 2220 | return len; | ||
| 2221 | } | ||
| 2222 | |||
| 2223 | static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, | ||
| 2224 | int cmd) | ||
| 2225 | { | ||
| 2226 | struct net *net = read_pnet(&mrt->net); | ||
| 2227 | struct sk_buff *skb; | ||
| 2228 | int err = -ENOBUFS; | ||
| 2229 | |||
| 2230 | skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif), | ||
| 2231 | GFP_ATOMIC); | ||
| 2232 | if (skb == NULL) | ||
| 2233 | goto errout; | ||
| 2234 | |||
| 2235 | err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd); | ||
| 2236 | if (err < 0) | ||
| 2237 | goto errout; | ||
| 2238 | |||
| 2239 | rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC); | ||
| 2240 | return; | ||
| 2241 | |||
| 2242 | errout: | ||
| 2243 | kfree_skb(skb); | ||
| 2244 | if (err < 0) | ||
| 2245 | rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); | ||
| 2246 | } | ||
| 2247 | |||
| 2248 | static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) | 2143 | static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) |
| 2249 | { | 2144 | { |
| 2250 | struct net *net = sock_net(skb->sk); | 2145 | struct net *net = sock_net(skb->sk); |
| @@ -2269,31 +2164,15 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) | |||
| 2269 | if (e < s_e) | 2164 | if (e < s_e) |
| 2270 | goto next_entry; | 2165 | goto next_entry; |
| 2271 | if (ipmr_fill_mroute(mrt, skb, | 2166 | if (ipmr_fill_mroute(mrt, skb, |
| 2272 | NETLINK_CB(cb->skb).portid, | 2167 | NETLINK_CB(cb->skb).pid, |
| 2273 | cb->nlh->nlmsg_seq, | 2168 | cb->nlh->nlmsg_seq, |
| 2274 | mfc, RTM_NEWROUTE) < 0) | 2169 | mfc) < 0) |
| 2275 | goto done; | 2170 | goto done; |
| 2276 | next_entry: | 2171 | next_entry: |
| 2277 | e++; | 2172 | e++; |
| 2278 | } | 2173 | } |
| 2279 | e = s_e = 0; | 2174 | e = s_e = 0; |
| 2280 | } | 2175 | } |
| 2281 | spin_lock_bh(&mfc_unres_lock); | ||
| 2282 | list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { | ||
| 2283 | if (e < s_e) | ||
| 2284 | goto next_entry2; | ||
| 2285 | if (ipmr_fill_mroute(mrt, skb, | ||
| 2286 | NETLINK_CB(cb->skb).portid, | ||
| 2287 | cb->nlh->nlmsg_seq, | ||
| 2288 | mfc, RTM_NEWROUTE) < 0) { | ||
| 2289 | spin_unlock_bh(&mfc_unres_lock); | ||
| 2290 | goto done; | ||
| 2291 | } | ||
| 2292 | next_entry2: | ||
| 2293 | e++; | ||
| 2294 | } | ||
| 2295 | spin_unlock_bh(&mfc_unres_lock); | ||
| 2296 | e = s_e = 0; | ||
| 2297 | s_h = 0; | 2176 | s_h = 0; |
| 2298 | next_table: | 2177 | next_table: |
| 2299 | t++; | 2178 | t++; |
| @@ -2660,7 +2539,7 @@ int __init ip_mr_init(void) | |||
| 2660 | goto reg_notif_fail; | 2539 | goto reg_notif_fail; |
| 2661 | #ifdef CONFIG_IP_PIMSM_V2 | 2540 | #ifdef CONFIG_IP_PIMSM_V2 |
| 2662 | if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { | 2541 | if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { |
| 2663 | pr_err("%s: can't add PIM protocol\n", __func__); | 2542 | printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n"); |
| 2664 | err = -EAGAIN; | 2543 | err = -EAGAIN; |
| 2665 | goto add_proto_fail; | 2544 | goto add_proto_fail; |
| 2666 | } | 2545 | } |
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 4c0cf63dd92..929b27bdeb7 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c | |||
| @@ -5,14 +5,13 @@ | |||
| 5 | #include <linux/ip.h> | 5 | #include <linux/ip.h> |
| 6 | #include <linux/skbuff.h> | 6 | #include <linux/skbuff.h> |
| 7 | #include <linux/gfp.h> | 7 | #include <linux/gfp.h> |
| 8 | #include <linux/export.h> | ||
| 9 | #include <net/route.h> | 8 | #include <net/route.h> |
| 10 | #include <net/xfrm.h> | 9 | #include <net/xfrm.h> |
| 11 | #include <net/ip.h> | 10 | #include <net/ip.h> |
| 12 | #include <net/netfilter/nf_queue.h> | 11 | #include <net/netfilter/nf_queue.h> |
| 13 | 12 | ||
| 14 | /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ | 13 | /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ |
| 15 | int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) | 14 | int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) |
| 16 | { | 15 | { |
| 17 | struct net *net = dev_net(skb_dst(skb)->dev); | 16 | struct net *net = dev_net(skb_dst(skb)->dev); |
| 18 | const struct iphdr *iph = ip_hdr(skb); | 17 | const struct iphdr *iph = ip_hdr(skb); |
| @@ -64,14 +63,50 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) | |||
| 64 | /* Change in oif may mean change in hh_len. */ | 63 | /* Change in oif may mean change in hh_len. */ |
| 65 | hh_len = skb_dst(skb)->dev->hard_header_len; | 64 | hh_len = skb_dst(skb)->dev->hard_header_len; |
| 66 | if (skb_headroom(skb) < hh_len && | 65 | if (skb_headroom(skb) < hh_len && |
| 67 | pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), | 66 | pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) |
| 68 | 0, GFP_ATOMIC)) | ||
| 69 | return -1; | 67 | return -1; |
| 70 | 68 | ||
| 71 | return 0; | 69 | return 0; |
| 72 | } | 70 | } |
| 73 | EXPORT_SYMBOL(ip_route_me_harder); | 71 | EXPORT_SYMBOL(ip_route_me_harder); |
| 74 | 72 | ||
| 73 | #ifdef CONFIG_XFRM | ||
| 74 | int ip_xfrm_me_harder(struct sk_buff *skb) | ||
| 75 | { | ||
| 76 | struct flowi fl; | ||
| 77 | unsigned int hh_len; | ||
| 78 | struct dst_entry *dst; | ||
| 79 | |||
| 80 | if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) | ||
| 81 | return 0; | ||
| 82 | if (xfrm_decode_session(skb, &fl, AF_INET) < 0) | ||
| 83 | return -1; | ||
| 84 | |||
| 85 | dst = skb_dst(skb); | ||
| 86 | if (dst->xfrm) | ||
| 87 | dst = ((struct xfrm_dst *)dst)->route; | ||
| 88 | dst_hold(dst); | ||
| 89 | |||
| 90 | dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); | ||
| 91 | if (IS_ERR(dst)) | ||
| 92 | return -1; | ||
| 93 | |||
| 94 | skb_dst_drop(skb); | ||
| 95 | skb_dst_set(skb, dst); | ||
| 96 | |||
| 97 | /* Change in oif may mean change in hh_len. */ | ||
| 98 | hh_len = skb_dst(skb)->dev->hard_header_len; | ||
| 99 | if (skb_headroom(skb) < hh_len && | ||
| 100 | pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) | ||
| 101 | return -1; | ||
| 102 | return 0; | ||
| 103 | } | ||
| 104 | EXPORT_SYMBOL(ip_xfrm_me_harder); | ||
| 105 | #endif | ||
| 106 | |||
| 107 | void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); | ||
| 108 | EXPORT_SYMBOL(ip_nat_decode_session); | ||
| 109 | |||
| 75 | /* | 110 | /* |
| 76 | * Extra routing may needed on local out, as the QUEUE target never | 111 | * Extra routing may needed on local out, as the QUEUE target never |
| 77 | * returns control to the table. | 112 | * returns control to the table. |
| @@ -188,15 +223,25 @@ static const struct nf_afinfo nf_ip_afinfo = { | |||
| 188 | .route_key_size = sizeof(struct ip_rt_info), | 223 | .route_key_size = sizeof(struct ip_rt_info), |
| 189 | }; | 224 | }; |
| 190 | 225 | ||
| 191 | static int __init ipv4_netfilter_init(void) | 226 | static int ipv4_netfilter_init(void) |
| 192 | { | 227 | { |
| 193 | return nf_register_afinfo(&nf_ip_afinfo); | 228 | return nf_register_afinfo(&nf_ip_afinfo); |
| 194 | } | 229 | } |
| 195 | 230 | ||
| 196 | static void __exit ipv4_netfilter_fini(void) | 231 | static void ipv4_netfilter_fini(void) |
| 197 | { | 232 | { |
| 198 | nf_unregister_afinfo(&nf_ip_afinfo); | 233 | nf_unregister_afinfo(&nf_ip_afinfo); |
| 199 | } | 234 | } |
| 200 | 235 | ||
| 201 | module_init(ipv4_netfilter_init); | 236 | module_init(ipv4_netfilter_init); |
| 202 | module_exit(ipv4_netfilter_fini); | 237 | module_exit(ipv4_netfilter_fini); |
| 238 | |||
| 239 | #ifdef CONFIG_SYSCTL | ||
| 240 | struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { | ||
| 241 | { .procname = "net", }, | ||
| 242 | { .procname = "ipv4", }, | ||
| 243 | { .procname = "netfilter", }, | ||
| 244 | { } | ||
| 245 | }; | ||
| 246 | EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); | ||
| 247 | #endif /* CONFIG_SYSCTL */ | ||
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d8d6f2a5bf1..73b4e91a87e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
| @@ -27,7 +27,7 @@ config NF_CONNTRACK_IPV4 | |||
| 27 | 27 | ||
| 28 | config NF_CONNTRACK_PROC_COMPAT | 28 | config NF_CONNTRACK_PROC_COMPAT |
| 29 | bool "proc/sysctl compatibility with old connection tracking" | 29 | bool "proc/sysctl compatibility with old connection tracking" |
| 30 | depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4 | 30 | depends on NF_CONNTRACK_IPV4 |
| 31 | default y | 31 | default y |
| 32 | help | 32 | help |
| 33 | This option enables /proc and sysctl compatibility with the old | 33 | This option enables /proc and sysctl compatibility with the old |
| @@ -76,21 +76,11 @@ config IP_NF_MATCH_AH | |||
| 76 | config IP_NF_MATCH_ECN | 76 | config IP_NF_MATCH_ECN |
| 77 | tristate '"ecn" match support' | 77 | tristate '"ecn" match support' |
| 78 | depends on NETFILTER_ADVANCED | 78 | depends on NETFILTER_ADVANCED |
| 79 | select NETFILTER_XT_MATCH_ECN | 79 | help |
| 80 | ---help--- | 80 | This option adds a `ECN' match, which allows you to match against |
| 81 | This is a backwards-compat option for the user's convenience | 81 | the IPv4 and TCP header ECN fields. |
| 82 | (e.g. when running oldconfig). It selects | ||
| 83 | CONFIG_NETFILTER_XT_MATCH_ECN. | ||
| 84 | |||
| 85 | config IP_NF_MATCH_RPFILTER | ||
| 86 | tristate '"rpfilter" reverse path filter match support' | ||
| 87 | depends on NETFILTER_ADVANCED | ||
| 88 | ---help--- | ||
| 89 | This option allows you to match packets whose replies would | ||
| 90 | go out via the interface the packet came in. | ||
| 91 | 82 | ||
| 92 | To compile it as a module, choose M here. If unsure, say N. | 83 | To compile it as a module, choose M here. If unsure, say N. |
| 93 | The module will be called ipt_rpfilter. | ||
| 94 | 84 | ||
| 95 | config IP_NF_MATCH_TTL | 85 | config IP_NF_MATCH_TTL |
| 96 | tristate '"ttl" match support' | 86 | tristate '"ttl" match support' |
| @@ -123,6 +113,27 @@ config IP_NF_TARGET_REJECT | |||
| 123 | 113 | ||
| 124 | To compile it as a module, choose M here. If unsure, say N. | 114 | To compile it as a module, choose M here. If unsure, say N. |
| 125 | 115 | ||
| 116 | config IP_NF_TARGET_REJECT_SKERR | ||
| 117 | bool "Force socket error when rejecting with icmp*" | ||
| 118 | depends on IP_NF_TARGET_REJECT | ||
| 119 | default n | ||
| 120 | help | ||
| 121 | This option enables turning a "--reject-with icmp*" into a matching | ||
| 122 | socket error also. | ||
| 123 | The REJECT target normally allows sending an ICMP message. But it | ||
| 124 | leaves the local socket unaware of any ingress rejects. | ||
| 125 | |||
| 126 | If unsure, say N. | ||
| 127 | |||
| 128 | config IP_NF_TARGET_LOG | ||
| 129 | tristate "LOG target support" | ||
| 130 | default m if NETFILTER_ADVANCED=n | ||
| 131 | help | ||
| 132 | This option adds a `LOG' target, which allows you to create rules in | ||
| 133 | any iptables table which records the packet header to the syslog. | ||
| 134 | |||
| 135 | To compile it as a module, choose M here. If unsure, say N. | ||
| 136 | |||
| 126 | config IP_NF_TARGET_ULOG | 137 | config IP_NF_TARGET_ULOG |
| 127 | tristate "ULOG target support" | 138 | tristate "ULOG target support" |
| 128 | default m if NETFILTER_ADVANCED=n | 139 | default m if NETFILTER_ADVANCED=n |
| @@ -143,22 +154,25 @@ config IP_NF_TARGET_ULOG | |||
| 143 | To compile it as a module, choose M here. If unsure, say N. | 154 | To compile it as a module, choose M here. If unsure, say N. |
| 144 | 155 | ||
| 145 | # NAT + specific targets: nf_conntrack | 156 | # NAT + specific targets: nf_conntrack |
| 146 | config NF_NAT_IPV4 | 157 | config NF_NAT |
| 147 | tristate "IPv4 NAT" | 158 | tristate "Full NAT" |
| 148 | depends on NF_CONNTRACK_IPV4 | 159 | depends on NF_CONNTRACK_IPV4 |
| 149 | default m if NETFILTER_ADVANCED=n | 160 | default m if NETFILTER_ADVANCED=n |
| 150 | select NF_NAT | ||
| 151 | help | 161 | help |
| 152 | The IPv4 NAT option allows masquerading, port forwarding and other | 162 | The Full NAT option allows masquerading, port forwarding and other |
| 153 | forms of full Network Address Port Translation. It is controlled by | 163 | forms of full Network Address Port Translation. It is controlled by |
| 154 | the `nat' table in iptables: see the man page for iptables(8). | 164 | the `nat' table in iptables: see the man page for iptables(8). |
| 155 | 165 | ||
| 156 | To compile it as a module, choose M here. If unsure, say N. | 166 | To compile it as a module, choose M here. If unsure, say N. |
| 157 | 167 | ||
| 158 | if NF_NAT_IPV4 | 168 | config NF_NAT_NEEDED |
| 169 | bool | ||
| 170 | depends on NF_NAT | ||
| 171 | default y | ||
| 159 | 172 | ||
| 160 | config IP_NF_TARGET_MASQUERADE | 173 | config IP_NF_TARGET_MASQUERADE |
| 161 | tristate "MASQUERADE target support" | 174 | tristate "MASQUERADE target support" |
| 175 | depends on NF_NAT | ||
| 162 | default m if NETFILTER_ADVANCED=n | 176 | default m if NETFILTER_ADVANCED=n |
| 163 | help | 177 | help |
| 164 | Masquerading is a special case of NAT: all outgoing connections are | 178 | Masquerading is a special case of NAT: all outgoing connections are |
| @@ -171,27 +185,30 @@ config IP_NF_TARGET_MASQUERADE | |||
| 171 | 185 | ||
| 172 | config IP_NF_TARGET_NETMAP | 186 | config IP_NF_TARGET_NETMAP |
| 173 | tristate "NETMAP target support" | 187 | tristate "NETMAP target support" |
| 188 | depends on NF_NAT | ||
| 174 | depends on NETFILTER_ADVANCED | 189 | depends on NETFILTER_ADVANCED |
| 175 | select NETFILTER_XT_TARGET_NETMAP | 190 | help |
| 176 | ---help--- | 191 | NETMAP is an implementation of static 1:1 NAT mapping of network |
| 177 | This is a backwards-compat option for the user's convenience | 192 | addresses. It maps the network address part, while keeping the host |
| 178 | (e.g. when running oldconfig). It selects | 193 | address part intact. |
| 179 | CONFIG_NETFILTER_XT_TARGET_NETMAP. | 194 | |
| 195 | To compile it as a module, choose M here. If unsure, say N. | ||
| 180 | 196 | ||
| 181 | config IP_NF_TARGET_REDIRECT | 197 | config IP_NF_TARGET_REDIRECT |
| 182 | tristate "REDIRECT target support" | 198 | tristate "REDIRECT target support" |
| 199 | depends on NF_NAT | ||
| 183 | depends on NETFILTER_ADVANCED | 200 | depends on NETFILTER_ADVANCED |
| 184 | select NETFILTER_XT_TARGET_REDIRECT | 201 | help |
| 185 | ---help--- | 202 | REDIRECT is a special case of NAT: all incoming connections are |
| 186 | This is a backwards-compat option for the user's convenience | 203 | mapped onto the incoming interface's address, causing the packets to |
| 187 | (e.g. when running oldconfig). It selects | 204 | come to the local machine instead of passing through. This is |
| 188 | CONFIG_NETFILTER_XT_TARGET_REDIRECT. | 205 | useful for transparent proxies. |
| 189 | 206 | ||
| 190 | endif | 207 | To compile it as a module, choose M here. If unsure, say N. |
| 191 | 208 | ||
| 192 | config NF_NAT_SNMP_BASIC | 209 | config NF_NAT_SNMP_BASIC |
| 193 | tristate "Basic SNMP-ALG support" | 210 | tristate "Basic SNMP-ALG support" |
| 194 | depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4 | 211 | depends on NF_CONNTRACK_SNMP && NF_NAT |
| 195 | depends on NETFILTER_ADVANCED | 212 | depends on NETFILTER_ADVANCED |
| 196 | default NF_NAT && NF_CONNTRACK_SNMP | 213 | default NF_NAT && NF_CONNTRACK_SNMP |
| 197 | ---help--- | 214 | ---help--- |
| @@ -213,21 +230,61 @@ config NF_NAT_SNMP_BASIC | |||
| 213 | # <expr> '&&' <expr> (6) | 230 | # <expr> '&&' <expr> (6) |
| 214 | # | 231 | # |
| 215 | # (6) Returns the result of min(/expr/, /expr/). | 232 | # (6) Returns the result of min(/expr/, /expr/). |
| 233 | config NF_NAT_PROTO_DCCP | ||
| 234 | tristate | ||
| 235 | depends on NF_NAT && NF_CT_PROTO_DCCP | ||
| 236 | default NF_NAT && NF_CT_PROTO_DCCP | ||
| 216 | 237 | ||
| 217 | config NF_NAT_PROTO_GRE | 238 | config NF_NAT_PROTO_GRE |
| 218 | tristate | 239 | tristate |
| 219 | depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE | 240 | depends on NF_NAT && NF_CT_PROTO_GRE |
| 241 | |||
| 242 | config NF_NAT_PROTO_UDPLITE | ||
| 243 | tristate | ||
| 244 | depends on NF_NAT && NF_CT_PROTO_UDPLITE | ||
| 245 | default NF_NAT && NF_CT_PROTO_UDPLITE | ||
| 246 | |||
| 247 | config NF_NAT_PROTO_SCTP | ||
| 248 | tristate | ||
| 249 | default NF_NAT && NF_CT_PROTO_SCTP | ||
| 250 | depends on NF_NAT && NF_CT_PROTO_SCTP | ||
| 251 | select LIBCRC32C | ||
| 252 | |||
| 253 | config NF_NAT_FTP | ||
| 254 | tristate | ||
| 255 | depends on NF_CONNTRACK && NF_NAT | ||
| 256 | default NF_NAT && NF_CONNTRACK_FTP | ||
| 257 | |||
| 258 | config NF_NAT_IRC | ||
| 259 | tristate | ||
| 260 | depends on NF_CONNTRACK && NF_NAT | ||
| 261 | default NF_NAT && NF_CONNTRACK_IRC | ||
| 262 | |||
| 263 | config NF_NAT_TFTP | ||
| 264 | tristate | ||
| 265 | depends on NF_CONNTRACK && NF_NAT | ||
| 266 | default NF_NAT && NF_CONNTRACK_TFTP | ||
| 267 | |||
| 268 | config NF_NAT_AMANDA | ||
| 269 | tristate | ||
| 270 | depends on NF_CONNTRACK && NF_NAT | ||
| 271 | default NF_NAT && NF_CONNTRACK_AMANDA | ||
| 220 | 272 | ||
| 221 | config NF_NAT_PPTP | 273 | config NF_NAT_PPTP |
| 222 | tristate | 274 | tristate |
| 223 | depends on NF_CONNTRACK && NF_NAT_IPV4 | 275 | depends on NF_CONNTRACK && NF_NAT |
| 224 | default NF_NAT_IPV4 && NF_CONNTRACK_PPTP | 276 | default NF_NAT && NF_CONNTRACK_PPTP |
| 225 | select NF_NAT_PROTO_GRE | 277 | select NF_NAT_PROTO_GRE |
| 226 | 278 | ||
| 227 | config NF_NAT_H323 | 279 | config NF_NAT_H323 |
| 228 | tristate | 280 | tristate |
| 229 | depends on NF_CONNTRACK && NF_NAT_IPV4 | 281 | depends on NF_CONNTRACK && NF_NAT |
| 230 | default NF_NAT_IPV4 && NF_CONNTRACK_H323 | 282 | default NF_NAT && NF_CONNTRACK_H323 |
| 283 | |||
| 284 | config NF_NAT_SIP | ||
| 285 | tristate | ||
| 286 | depends on NF_CONNTRACK && NF_NAT | ||
| 287 | default NF_NAT && NF_CONNTRACK_SIP | ||
| 231 | 288 | ||
| 232 | # mangle + specific targets | 289 | # mangle + specific targets |
| 233 | config IP_NF_MANGLE | 290 | config IP_NF_MANGLE |
| @@ -280,6 +337,7 @@ config IP_NF_TARGET_TTL | |||
| 280 | # raw + specific targets | 337 | # raw + specific targets |
| 281 | config IP_NF_RAW | 338 | config IP_NF_RAW |
| 282 | tristate 'raw table support (required for NOTRACK/TRACE)' | 339 | tristate 'raw table support (required for NOTRACK/TRACE)' |
| 340 | depends on NETFILTER_ADVANCED | ||
| 283 | help | 341 | help |
| 284 | This option adds a `raw' table to iptables. This table is the very | 342 | This option adds a `raw' table to iptables. This table is the very |
| 285 | first in the netfilter framework and hooks in at the PREROUTING | 343 | first in the netfilter framework and hooks in at the PREROUTING |
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 007b128eecc..dca2082ec68 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
| @@ -10,22 +10,32 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o | |||
| 10 | endif | 10 | endif |
| 11 | endif | 11 | endif |
| 12 | 12 | ||
| 13 | nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o | ||
| 14 | iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o | ||
| 15 | |||
| 13 | # connection tracking | 16 | # connection tracking |
| 14 | obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o | 17 | obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o |
| 15 | 18 | ||
| 16 | nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o | 19 | obj-$(CONFIG_NF_NAT) += nf_nat.o |
| 17 | obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o | ||
| 18 | 20 | ||
| 19 | # defrag | 21 | # defrag |
| 20 | obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o | 22 | obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o |
| 21 | 23 | ||
| 22 | # NAT helpers (nf_conntrack) | 24 | # NAT helpers (nf_conntrack) |
| 25 | obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o | ||
| 26 | obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o | ||
| 23 | obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o | 27 | obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o |
| 28 | obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o | ||
| 24 | obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o | 29 | obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o |
| 30 | obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o | ||
| 25 | obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o | 31 | obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o |
| 32 | obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o | ||
| 26 | 33 | ||
| 27 | # NAT protocols (nf_nat) | 34 | # NAT protocols (nf_nat) |
| 35 | obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o | ||
| 28 | obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o | 36 | obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o |
| 37 | obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o | ||
| 38 | obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o | ||
| 29 | 39 | ||
| 30 | # generic IP tables | 40 | # generic IP tables |
| 31 | obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o | 41 | obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o |
| @@ -33,18 +43,21 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o | |||
| 33 | # the three instances of ip_tables | 43 | # the three instances of ip_tables |
| 34 | obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o | 44 | obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o |
| 35 | obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o | 45 | obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o |
| 36 | obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o | 46 | obj-$(CONFIG_NF_NAT) += iptable_nat.o |
| 37 | obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o | 47 | obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o |
| 38 | obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o | 48 | obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o |
| 39 | 49 | ||
| 40 | # matches | 50 | # matches |
| 41 | obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o | 51 | obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o |
| 42 | obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o | 52 | obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o |
| 43 | 53 | ||
| 44 | # targets | 54 | # targets |
| 45 | obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o | 55 | obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o |
| 46 | obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o | 56 | obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o |
| 57 | obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o | ||
| 47 | obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o | 58 | obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o |
| 59 | obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o | ||
| 60 | obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o | ||
| 48 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o | 61 | obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o |
| 49 | obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o | 62 | obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o |
| 50 | 63 | ||
| @@ -54,3 +67,6 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o | |||
| 54 | 67 | ||
| 55 | # just filtering instance of ARP tables for now | 68 | # just filtering instance of ARP tables for now |
| 56 | obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o | 69 | obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o |
| 70 | |||
| 71 | obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o | ||
| 72 | |||
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3ea4127404d..fd7a3f68917 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c | |||
| @@ -221,8 +221,9 @@ static inline int arp_checkentry(const struct arpt_arp *arp) | |||
| 221 | static unsigned int | 221 | static unsigned int |
| 222 | arpt_error(struct sk_buff *skb, const struct xt_action_param *par) | 222 | arpt_error(struct sk_buff *skb, const struct xt_action_param *par) |
| 223 | { | 223 | { |
| 224 | net_err_ratelimited("arp_tables: error: '%s'\n", | 224 | if (net_ratelimit()) |
| 225 | (const char *)par->targinfo); | 225 | pr_err("arp_tables: error: '%s'\n", |
| 226 | (const char *)par->targinfo); | ||
| 226 | 227 | ||
| 227 | return NF_DROP; | 228 | return NF_DROP; |
| 228 | } | 229 | } |
| @@ -302,7 +303,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
| 302 | if (v < 0) { | 303 | if (v < 0) { |
| 303 | /* Pop from stack? */ | 304 | /* Pop from stack? */ |
| 304 | if (v != XT_RETURN) { | 305 | if (v != XT_RETURN) { |
| 305 | verdict = (unsigned int)(-v) - 1; | 306 | verdict = (unsigned)(-v) - 1; |
| 306 | break; | 307 | break; |
| 307 | } | 308 | } |
| 308 | e = back; | 309 | e = back; |
| @@ -1533,7 +1534,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, | |||
| 1533 | { | 1534 | { |
| 1534 | int ret; | 1535 | int ret; |
| 1535 | 1536 | ||
| 1536 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 1537 | if (!capable(CAP_NET_ADMIN)) |
| 1537 | return -EPERM; | 1538 | return -EPERM; |
| 1538 | 1539 | ||
| 1539 | switch (cmd) { | 1540 | switch (cmd) { |
| @@ -1677,7 +1678,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, | |||
| 1677 | { | 1678 | { |
| 1678 | int ret; | 1679 | int ret; |
| 1679 | 1680 | ||
| 1680 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 1681 | if (!capable(CAP_NET_ADMIN)) |
| 1681 | return -EPERM; | 1682 | return -EPERM; |
| 1682 | 1683 | ||
| 1683 | switch (cmd) { | 1684 | switch (cmd) { |
| @@ -1698,7 +1699,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned | |||
| 1698 | { | 1699 | { |
| 1699 | int ret; | 1700 | int ret; |
| 1700 | 1701 | ||
| 1701 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 1702 | if (!capable(CAP_NET_ADMIN)) |
| 1702 | return -EPERM; | 1703 | return -EPERM; |
| 1703 | 1704 | ||
| 1704 | switch (cmd) { | 1705 | switch (cmd) { |
| @@ -1722,7 +1723,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len | |||
| 1722 | { | 1723 | { |
| 1723 | int ret; | 1724 | int ret; |
| 1724 | 1725 | ||
| 1725 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 1726 | if (!capable(CAP_NET_ADMIN)) |
| 1726 | return -EPERM; | 1727 | return -EPERM; |
| 1727 | 1728 | ||
| 1728 | switch (cmd) { | 1729 | switch (cmd) { |
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 17c5e06da66..24e556e83a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c | |||
| @@ -153,7 +153,8 @@ ip_checkentry(const struct ipt_ip *ip) | |||
| 153 | static unsigned int | 153 | static unsigned int |
| 154 | ipt_error(struct sk_buff *skb, const struct xt_action_param *par) | 154 | ipt_error(struct sk_buff *skb, const struct xt_action_param *par) |
| 155 | { | 155 | { |
| 156 | net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); | 156 | if (net_ratelimit()) |
| 157 | pr_info("error: `%s'\n", (const char *)par->targinfo); | ||
| 157 | 158 | ||
| 158 | return NF_DROP; | 159 | return NF_DROP; |
| 159 | } | 160 | } |
| @@ -376,7 +377,7 @@ ipt_do_table(struct sk_buff *skb, | |||
| 376 | if (v < 0) { | 377 | if (v < 0) { |
| 377 | /* Pop from stack? */ | 378 | /* Pop from stack? */ |
| 378 | if (v != XT_RETURN) { | 379 | if (v != XT_RETURN) { |
| 379 | verdict = (unsigned int)(-v) - 1; | 380 | verdict = (unsigned)(-v) - 1; |
| 380 | break; | 381 | break; |
| 381 | } | 382 | } |
| 382 | if (*stackptr <= origptr) { | 383 | if (*stackptr <= origptr) { |
| @@ -1846,7 +1847,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, | |||
| 1846 | { | 1847 | { |
| 1847 | int ret; | 1848 | int ret; |
| 1848 | 1849 | ||
| 1849 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 1850 | if (!capable(CAP_NET_ADMIN)) |
| 1850 | return -EPERM; | 1851 | return -EPERM; |
| 1851 | 1852 | ||
| 1852 | switch (cmd) { | 1853 | switch (cmd) { |
| @@ -1961,7 +1962,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) | |||
| 1961 | { | 1962 | { |
| 1962 | int ret; | 1963 | int ret; |
| 1963 | 1964 | ||
| 1964 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 1965 | if (!capable(CAP_NET_ADMIN)) |
| 1965 | return -EPERM; | 1966 | return -EPERM; |
| 1966 | 1967 | ||
| 1967 | switch (cmd) { | 1968 | switch (cmd) { |
| @@ -1983,7 +1984,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) | |||
| 1983 | { | 1984 | { |
| 1984 | int ret; | 1985 | int ret; |
| 1985 | 1986 | ||
| 1986 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 1987 | if (!capable(CAP_NET_ADMIN)) |
| 1987 | return -EPERM; | 1988 | return -EPERM; |
| 1988 | 1989 | ||
| 1989 | switch (cmd) { | 1990 | switch (cmd) { |
| @@ -2008,7 +2009,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) | |||
| 2008 | { | 2009 | { |
| 2009 | int ret; | 2010 | int ret; |
| 2010 | 2011 | ||
| 2011 | if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) | 2012 | if (!capable(CAP_NET_ADMIN)) |
| 2012 | return -EPERM; | 2013 | return -EPERM; |
| 2013 | 2014 | ||
| 2014 | switch (cmd) { | 2015 | switch (cmd) { |
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 75e33a7048f..db8d22db425 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
| @@ -246,7 +246,8 @@ clusterip_hashfn(const struct sk_buff *skb, | |||
| 246 | dport = ports[1]; | 246 | dport = ports[1]; |
| 247 | } | 247 | } |
| 248 | } else { | 248 | } else { |
| 249 | net_info_ratelimited("unknown protocol %u\n", iph->protocol); | 249 | if (net_ratelimit()) |
| 250 | pr_info("unknown protocol %u\n", iph->protocol); | ||
| 250 | } | 251 | } |
| 251 | 252 | ||
| 252 | switch (config->hash_mode) { | 253 | switch (config->hash_mode) { |
| @@ -394,6 +395,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) | |||
| 394 | config = clusterip_config_init(cipinfo, | 395 | config = clusterip_config_init(cipinfo, |
| 395 | e->ip.dst.s_addr, dev); | 396 | e->ip.dst.s_addr, dev); |
| 396 | if (!config) { | 397 | if (!config) { |
| 398 | pr_info("cannot allocate config\n"); | ||
| 397 | dev_put(dev); | 399 | dev_put(dev); |
| 398 | return -ENOMEM; | 400 | return -ENOMEM; |
| 399 | } | 401 | } |
| @@ -661,7 +663,6 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, | |||
| 661 | #define PROC_WRITELEN 10 | 663 | #define PROC_WRITELEN 10 |
| 662 | char buffer[PROC_WRITELEN+1]; | 664 | char buffer[PROC_WRITELEN+1]; |
| 663 | unsigned long nodenum; | 665 | unsigned long nodenum; |
| 664 | int rc; | ||
| 665 | 666 | ||
| 666 | if (size > PROC_WRITELEN) | 667 | if (size > PROC_WRITELEN) |
| 667 | return -EIO; | 668 | return -EIO; |
| @@ -670,15 +671,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, | |||
| 670 | buffer[size] = 0; | 671 | buffer[size] = 0; |
| 671 | 672 | ||
| 672 | if (*buffer == '+') { | 673 | if (*buffer == '+') { |
| 673 | rc = kstrtoul(buffer+1, 10, &nodenum); | 674 | nodenum = simple_strtoul(buffer+1, NULL, 10); |
| 674 | if (rc) | ||
| 675 | return rc; | ||
| 676 | if (clusterip_add_node(c, nodenum)) | 675 | if (clusterip_add_node(c, nodenum)) |
| 677 | return -ENOMEM; | 676 | return -ENOMEM; |
| 678 | } else if (*buffer == '-') { | 677 | } else if (*buffer == '-') { |
| 679 | rc = kstrtoul(buffer+1, 10, &nodenum); | 678 | nodenum = simple_strtoul(buffer+1, NULL,10); |
| 680 | if (rc) | ||
| 681 | return rc; | ||
| 682 | if (clusterip_del_node(c, nodenum)) | 679 | if (clusterip_del_node(c, nodenum)) |
| 683 | return -ENOENT; | 680 | return -ENOENT; |
| 684 | } else | 681 | } else |
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 5d5d4d1be9c..9931152a78b 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
| @@ -19,9 +19,9 @@ | |||
| 19 | #include <net/ip.h> | 19 | #include <net/ip.h> |
| 20 | #include <net/checksum.h> | 20 | #include <net/checksum.h> |
| 21 | #include <net/route.h> | 21 | #include <net/route.h> |
| 22 | #include <net/netfilter/nf_nat_rule.h> | ||
| 22 | #include <linux/netfilter_ipv4.h> | 23 | #include <linux/netfilter_ipv4.h> |
| 23 | #include <linux/netfilter/x_tables.h> | 24 | #include <linux/netfilter/x_tables.h> |
| 24 | #include <net/netfilter/nf_nat.h> | ||
| 25 | 25 | ||
| 26 | MODULE_LICENSE("GPL"); | 26 | MODULE_LICENSE("GPL"); |
| 27 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); | 27 | MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); |
| @@ -30,9 +30,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); | |||
| 30 | /* FIXME: Multiple targets. --RR */ | 30 | /* FIXME: Multiple targets. --RR */ |
| 31 | static int masquerade_tg_check(const struct xt_tgchk_param *par) | 31 | static int masquerade_tg_check(const struct xt_tgchk_param *par) |
| 32 | { | 32 | { |
| 33 | const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; | 33 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
| 34 | 34 | ||
| 35 | if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { | 35 | if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { |
| 36 | pr_debug("bad MAP_IPS.\n"); | 36 | pr_debug("bad MAP_IPS.\n"); |
| 37 | return -EINVAL; | 37 | return -EINVAL; |
| 38 | } | 38 | } |
| @@ -50,9 +50,9 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) | |||
| 50 | struct nf_conn_nat *nat; | 50 | struct nf_conn_nat *nat; |
| 51 | enum ip_conntrack_info ctinfo; | 51 | enum ip_conntrack_info ctinfo; |
| 52 | struct nf_nat_range newrange; | 52 | struct nf_nat_range newrange; |
| 53 | const struct nf_nat_ipv4_multi_range_compat *mr; | 53 | const struct nf_nat_multi_range_compat *mr; |
| 54 | const struct rtable *rt; | 54 | const struct rtable *rt; |
| 55 | __be32 newsrc, nh; | 55 | __be32 newsrc; |
| 56 | 56 | ||
| 57 | NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); | 57 | NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); |
| 58 | 58 | ||
| @@ -70,8 +70,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) | |||
| 70 | 70 | ||
| 71 | mr = par->targinfo; | 71 | mr = par->targinfo; |
| 72 | rt = skb_rtable(skb); | 72 | rt = skb_rtable(skb); |
| 73 | nh = rt_nexthop(rt, ip_hdr(skb)->daddr); | 73 | newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); |
| 74 | newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); | ||
| 75 | if (!newsrc) { | 74 | if (!newsrc) { |
| 76 | pr_info("%s ate my IP address\n", par->out->name); | 75 | pr_info("%s ate my IP address\n", par->out->name); |
| 77 | return NF_DROP; | 76 | return NF_DROP; |
| @@ -80,16 +79,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) | |||
| 80 | nat->masq_index = par->out->ifindex; | 79 | nat->masq_index = par->out->ifindex; |
| 81 | 80 | ||
| 82 | /* Transfer from original range. */ | 81 | /* Transfer from original range. */ |
| 83 | memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); | 82 | newrange = ((struct nf_nat_range) |
| 84 | memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); | 83 | { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, |
| 85 | newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; | 84 | newsrc, newsrc, |
| 86 | newrange.min_addr.ip = newsrc; | 85 | mr->range[0].min, mr->range[0].max }); |
| 87 | newrange.max_addr.ip = newsrc; | ||
| 88 | newrange.min_proto = mr->range[0].min; | ||
| 89 | newrange.max_proto = mr->range[0].max; | ||
| 90 | 86 | ||
| 91 | /* Hand modified range to generic setup. */ | 87 | /* Hand modified range to generic setup. */ |
| 92 | return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); | 88 | return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); |
| 93 | } | 89 | } |
| 94 | 90 | ||
| 95 | static int | 91 | static int |
| @@ -99,8 +95,7 @@ device_cmp(struct nf_conn *i, void *ifindex) | |||
| 99 | 95 | ||
| 100 | if (!nat) | 96 | if (!nat) |
| 101 | return 0; | 97 | return 0; |
| 102 | if (nf_ct_l3num(i) != NFPROTO_IPV4) | 98 | |
| 103 | return 0; | ||
| 104 | return nat->masq_index == (int)(long)ifindex; | 99 | return nat->masq_index == (int)(long)ifindex; |
| 105 | } | 100 | } |
| 106 | 101 | ||
| @@ -144,7 +139,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = { | |||
| 144 | .name = "MASQUERADE", | 139 | .name = "MASQUERADE", |
| 145 | .family = NFPROTO_IPV4, | 140 | .family = NFPROTO_IPV4, |
| 146 | .target = masquerade_tg, | 141 | .target = masquerade_tg, |
| 147 | .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), | 142 | .targetsize = sizeof(struct nf_nat_multi_range_compat), |
| 148 | .table = "nat", | 143 | .table = "nat", |
| 149 | .hooks = 1 << NF_INET_POST_ROUTING, | 144 | .hooks = 1 << NF_INET_POST_ROUTING, |
| 150 | .checkentry = masquerade_tg_check, | 145 | .checkentry = masquerade_tg_check, |
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 04b18c1ac34..9dd754c7f2b 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
| @@ -81,7 +81,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
| 81 | niph->saddr = oiph->daddr; | 81 | niph->saddr = oiph->daddr; |
| 82 | niph->daddr = oiph->saddr; | 82 | niph->daddr = oiph->saddr; |
| 83 | 83 | ||
| 84 | skb_reset_transport_header(nskb); | ||
| 85 | tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); | 84 | tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); |
| 86 | memset(tcph, 0, sizeof(*tcph)); | 85 | memset(tcph, 0, sizeof(*tcph)); |
| 87 | tcph->source = oth->dest; | 86 | tcph->source = oth->dest; |
| @@ -129,6 +128,14 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
| 129 | static inline void send_unreach(struct sk_buff *skb_in, int code) | 128 | static inline void send_unreach(struct sk_buff *skb_in, int code) |
| 130 | { | 129 | { |
| 131 | icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); | 130 | icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); |
| 131 | #ifdef CONFIG_IP_NF_TARGET_REJECT_SKERR | ||
| 132 | if (skb_in->sk) { | ||
| 133 | skb_in->sk->sk_err = icmp_err_convert[code].errno; | ||
| 134 | skb_in->sk->sk_error_report(skb_in->sk); | ||
| 135 | pr_debug("ipt_REJECT: sk_err=%d for skb=%p sk=%p\n", | ||
| 136 | skb_in->sk->sk_err, skb_in, skb_in->sk); | ||
| 137 | } | ||
| 138 | #endif | ||
| 132 | } | 139 | } |
| 133 | 140 | ||
| 134 | static unsigned int | 141 | static unsigned int |
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index b5ef3cba225..446e0f467a1 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c | |||
| @@ -65,7 +65,7 @@ static unsigned int flushtimeout = 10; | |||
| 65 | module_param(flushtimeout, uint, 0600); | 65 | module_param(flushtimeout, uint, 0600); |
| 66 | MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); | 66 | MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); |
| 67 | 67 | ||
| 68 | static bool nflog = true; | 68 | static int nflog = 1; |
| 69 | module_param(nflog, bool, 0400); | 69 | module_param(nflog, bool, 0400); |
| 70 | MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); | 70 | MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); |
| 71 | 71 | ||
| @@ -135,8 +135,10 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) | |||
| 135 | * due to slab allocator restrictions */ | 135 | * due to slab allocator restrictions */ |
| 136 | 136 | ||
| 137 | n = max(size, nlbufsiz); | 137 | n = max(size, nlbufsiz); |
| 138 | skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); | 138 | skb = alloc_skb(n, GFP_ATOMIC); |
| 139 | if (!skb) { | 139 | if (!skb) { |
| 140 | pr_debug("cannot alloc whole buffer %ub!\n", n); | ||
| 141 | |||
| 140 | if (n > size) { | 142 | if (n > size) { |
| 141 | /* try to allocate only as much as we need for | 143 | /* try to allocate only as much as we need for |
| 142 | * current packet */ | 144 | * current packet */ |
| @@ -196,15 +198,12 @@ static void ipt_ulog_packet(unsigned int hooknum, | |||
| 196 | 198 | ||
| 197 | pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); | 199 | pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); |
| 198 | 200 | ||
| 199 | nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, | 201 | /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ |
| 200 | sizeof(*pm)+copy_len, 0); | 202 | nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, |
| 201 | if (!nlh) { | 203 | sizeof(*pm)+copy_len); |
| 202 | pr_debug("error during nlmsg_put\n"); | ||
| 203 | goto out_unlock; | ||
| 204 | } | ||
| 205 | ub->qlen++; | 204 | ub->qlen++; |
| 206 | 205 | ||
| 207 | pm = nlmsg_data(nlh); | 206 | pm = NLMSG_DATA(nlh); |
| 208 | 207 | ||
| 209 | /* We might not have a timestamp, get one */ | 208 | /* We might not have a timestamp, get one */ |
| 210 | if (skb->tstamp.tv64 == 0) | 209 | if (skb->tstamp.tv64 == 0) |
| @@ -264,11 +263,13 @@ static void ipt_ulog_packet(unsigned int hooknum, | |||
| 264 | nlh->nlmsg_type = NLMSG_DONE; | 263 | nlh->nlmsg_type = NLMSG_DONE; |
| 265 | ulog_send(groupnum); | 264 | ulog_send(groupnum); |
| 266 | } | 265 | } |
| 267 | out_unlock: | 266 | |
| 268 | spin_unlock_bh(&ulog_lock); | 267 | spin_unlock_bh(&ulog_lock); |
| 269 | 268 | ||
| 270 | return; | 269 | return; |
| 271 | 270 | ||
| 271 | nlmsg_failure: | ||
| 272 | pr_debug("error during NLMSG_PUT\n"); | ||
| 272 | alloc_failure: | 273 | alloc_failure: |
| 273 | pr_debug("Error building netlink message\n"); | 274 | pr_debug("Error building netlink message\n"); |
| 274 | spin_unlock_bh(&ulog_lock); | 275 | spin_unlock_bh(&ulog_lock); |
| @@ -381,9 +382,6 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { | |||
| 381 | static int __init ulog_tg_init(void) | 382 | static int __init ulog_tg_init(void) |
| 382 | { | 383 | { |
| 383 | int ret, i; | 384 | int ret, i; |
| 384 | struct netlink_kernel_cfg cfg = { | ||
| 385 | .groups = ULOG_MAXNLGROUPS, | ||
| 386 | }; | ||
| 387 | 385 | ||
| 388 | pr_debug("init module\n"); | 386 | pr_debug("init module\n"); |
| 389 | 387 | ||
| @@ -396,7 +394,9 @@ static int __init ulog_tg_init(void) | |||
| 396 | for (i = 0; i < ULOG_MAXNLGROUPS; i++) | 394 | for (i = 0; i < ULOG_MAXNLGROUPS; i++) |
| 397 | setup_timer(&ulog_buffers[i].timer, ulog_timer, i); | 395 | setup_timer(&ulog_buffers[i].timer, ulog_timer, i); |
| 398 | 396 | ||
| 399 | nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg); | 397 | nflognl = netlink_kernel_create(&init_net, |
| 398 | NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, | ||
| 399 | NULL, THIS_MODULE); | ||
| 400 | if (!nflognl) | 400 | if (!nflognl) |
| 401 | return -ENOMEM; | 401 | return -ENOMEM; |
| 402 | 402 | ||
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c deleted file mode 100644 index c30130062cd..00000000000 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ /dev/null | |||
| @@ -1,141 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2011 Florian Westphal <fw@strlen.de> | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License version 2 as | ||
| 6 | * published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | ||
| 9 | */ | ||
| 10 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 11 | #include <linux/module.h> | ||
| 12 | #include <linux/skbuff.h> | ||
| 13 | #include <linux/netdevice.h> | ||
| 14 | #include <linux/ip.h> | ||
| 15 | #include <net/ip.h> | ||
| 16 | #include <net/ip_fib.h> | ||
| 17 | #include <net/route.h> | ||
| 18 | |||
| 19 | #include <linux/netfilter/xt_rpfilter.h> | ||
| 20 | #include <linux/netfilter/x_tables.h> | ||
| 21 | |||
| 22 | MODULE_LICENSE("GPL"); | ||
| 23 | MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); | ||
| 24 | MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match"); | ||
| 25 | |||
| 26 | /* don't try to find route from mcast/bcast/zeronet */ | ||
| 27 | static __be32 rpfilter_get_saddr(__be32 addr) | ||
| 28 | { | ||
| 29 | if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) || | ||
| 30 | ipv4_is_zeronet(addr)) | ||
| 31 | return 0; | ||
| 32 | return addr; | ||
| 33 | } | ||
| 34 | |||
| 35 | static bool rpfilter_lookup_reverse(struct flowi4 *fl4, | ||
| 36 | const struct net_device *dev, u8 flags) | ||
| 37 | { | ||
| 38 | struct fib_result res; | ||
| 39 | bool dev_match; | ||
| 40 | struct net *net = dev_net(dev); | ||
| 41 | int ret __maybe_unused; | ||
| 42 | |||
| 43 | if (fib_lookup(net, fl4, &res)) | ||
| 44 | return false; | ||
| 45 | |||
| 46 | if (res.type != RTN_UNICAST) { | ||
| 47 | if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL)) | ||
| 48 | return false; | ||
| 49 | } | ||
| 50 | dev_match = false; | ||
| 51 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | ||
| 52 | for (ret = 0; ret < res.fi->fib_nhs; ret++) { | ||
| 53 | struct fib_nh *nh = &res.fi->fib_nh[ret]; | ||
| 54 | |||
| 55 | if (nh->nh_dev == dev) { | ||
| 56 | dev_match = true; | ||
| 57 | break; | ||
| 58 | } | ||
| 59 | } | ||
| 60 | #else | ||
| 61 | if (FIB_RES_DEV(res) == dev) | ||
| 62 | dev_match = true; | ||
| 63 | #endif | ||
| 64 | if (dev_match || flags & XT_RPFILTER_LOOSE) | ||
| 65 | return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; | ||
| 66 | return dev_match; | ||
| 67 | } | ||
| 68 | |||
| 69 | static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) | ||
| 70 | { | ||
| 71 | const struct xt_rpfilter_info *info; | ||
| 72 | const struct iphdr *iph; | ||
| 73 | struct flowi4 flow; | ||
| 74 | bool invert; | ||
| 75 | |||
| 76 | info = par->matchinfo; | ||
| 77 | invert = info->flags & XT_RPFILTER_INVERT; | ||
| 78 | |||
| 79 | if (par->in->flags & IFF_LOOPBACK) | ||
| 80 | return true ^ invert; | ||
| 81 | |||
| 82 | iph = ip_hdr(skb); | ||
| 83 | if (ipv4_is_multicast(iph->daddr)) { | ||
| 84 | if (ipv4_is_zeronet(iph->saddr)) | ||
| 85 | return ipv4_is_local_multicast(iph->daddr) ^ invert; | ||
| 86 | flow.flowi4_iif = 0; | ||
| 87 | } else { | ||
| 88 | flow.flowi4_iif = LOOPBACK_IFINDEX; | ||
| 89 | } | ||
| 90 | |||
| 91 | flow.daddr = iph->saddr; | ||
| 92 | flow.saddr = rpfilter_get_saddr(iph->daddr); | ||
| 93 | flow.flowi4_oif = 0; | ||
| 94 | flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; | ||
| 95 | flow.flowi4_tos = RT_TOS(iph->tos); | ||
| 96 | flow.flowi4_scope = RT_SCOPE_UNIVERSE; | ||
| 97 | |||
| 98 | return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert; | ||
| 99 | } | ||
| 100 | |||
| 101 | static int rpfilter_check(const struct xt_mtchk_param *par) | ||
| 102 | { | ||
| 103 | const struct xt_rpfilter_info *info = par->matchinfo; | ||
| 104 | unsigned int options = ~XT_RPFILTER_OPTION_MASK; | ||
| 105 | if (info->flags & options) { | ||
| 106 | pr_info("unknown options encountered"); | ||
| 107 | return -EINVAL; | ||
| 108 | } | ||
| 109 | |||
| 110 | if (strcmp(par->table, "mangle") != 0 && | ||
| 111 | strcmp(par->table, "raw") != 0) { | ||
| 112 | pr_info("match only valid in the \'raw\' " | ||
| 113 | "or \'mangle\' tables, not \'%s\'.\n", par->table); | ||
| 114 | return -EINVAL; | ||
| 115 | } | ||
| 116 | |||
| 117 | return 0; | ||
| 118 | } | ||
| 119 | |||
| 120 | static struct xt_match rpfilter_mt_reg __read_mostly = { | ||
| 121 | .name = "rpfilter", | ||
| 122 | .family = NFPROTO_IPV4, | ||
| 123 | .checkentry = rpfilter_check, | ||
| 124 | .match = rpfilter_mt, | ||
| 125 | .matchsize = sizeof(struct xt_rpfilter_info), | ||
| 126 | .hooks = (1 << NF_INET_PRE_ROUTING), | ||
| 127 | .me = THIS_MODULE | ||
| 128 | }; | ||
| 129 | |||
| 130 | static int __init rpfilter_mt_init(void) | ||
| 131 | { | ||
| 132 | return xt_register_match(&rpfilter_mt_reg); | ||
| 133 | } | ||
| 134 | |||
| 135 | static void __exit rpfilter_mt_exit(void) | ||
| 136 | { | ||
| 137 | xt_unregister_match(&rpfilter_mt_reg); | ||
| 138 | } | ||
| 139 | |||
| 140 | module_init(rpfilter_mt_init); | ||
| 141 | module_exit(rpfilter_mt_exit); | ||
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 6b3da5cf54e..c37641e819f 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c | |||
| @@ -52,7 +52,7 @@ iptable_filter_hook(unsigned int hook, struct sk_buff *skb, | |||
| 52 | static struct nf_hook_ops *filter_ops __read_mostly; | 52 | static struct nf_hook_ops *filter_ops __read_mostly; |
| 53 | 53 | ||
| 54 | /* Default to forward because I got too much mail already. */ | 54 | /* Default to forward because I got too much mail already. */ |
| 55 | static bool forward = true; | 55 | static int forward = NF_ACCEPT; |
| 56 | module_param(forward, bool, 0000); | 56 | module_param(forward, bool, 0000); |
| 57 | 57 | ||
| 58 | static int __net_init iptable_filter_net_init(struct net *net) | 58 | static int __net_init iptable_filter_net_init(struct net *net) |
| @@ -64,12 +64,14 @@ static int __net_init iptable_filter_net_init(struct net *net) | |||
| 64 | return -ENOMEM; | 64 | return -ENOMEM; |
| 65 | /* Entry 1 is the FORWARD hook */ | 65 | /* Entry 1 is the FORWARD hook */ |
| 66 | ((struct ipt_standard *)repl->entries)[1].target.verdict = | 66 | ((struct ipt_standard *)repl->entries)[1].target.verdict = |
| 67 | forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; | 67 | -forward - 1; |
| 68 | 68 | ||
| 69 | net->ipv4.iptable_filter = | 69 | net->ipv4.iptable_filter = |
| 70 | ipt_register_table(net, &packet_filter, repl); | 70 | ipt_register_table(net, &packet_filter, repl); |
| 71 | kfree(repl); | 71 | kfree(repl); |
| 72 | return PTR_RET(net->ipv4.iptable_filter); | 72 | if (IS_ERR(net->ipv4.iptable_filter)) |
| 73 | return PTR_ERR(net->ipv4.iptable_filter); | ||
| 74 | return 0; | ||
| 73 | } | 75 | } |
| 74 | 76 | ||
| 75 | static void __net_exit iptable_filter_net_exit(struct net *net) | 77 | static void __net_exit iptable_filter_net_exit(struct net *net) |
| @@ -86,6 +88,11 @@ static int __init iptable_filter_init(void) | |||
| 86 | { | 88 | { |
| 87 | int ret; | 89 | int ret; |
| 88 | 90 | ||
| 91 | if (forward < 0 || forward > NF_MAX_VERDICT) { | ||
| 92 | pr_err("iptables forward must be 0 or 1\n"); | ||
| 93 | return -EINVAL; | ||
| 94 | } | ||
| 95 | |||
| 89 | ret = register_pernet_subsys(&iptable_filter_net_ops); | 96 | ret = register_pernet_subsys(&iptable_filter_net_ops); |
| 90 | if (ret < 0) | 97 | if (ret < 0) |
| 91 | return ret; | 98 | return ret; |
| @@ -94,10 +101,14 @@ static int __init iptable_filter_init(void) | |||
| 94 | filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); | 101 | filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); |
| 95 | if (IS_ERR(filter_ops)) { | 102 | if (IS_ERR(filter_ops)) { |
| 96 | ret = PTR_ERR(filter_ops); | 103 | ret = PTR_ERR(filter_ops); |
| 97 | unregister_pernet_subsys(&iptable_filter_net_ops); | 104 | goto cleanup_table; |
| 98 | } | 105 | } |
| 99 | 106 | ||
| 100 | return ret; | 107 | return ret; |
| 108 | |||
| 109 | cleanup_table: | ||
| 110 | unregister_pernet_subsys(&iptable_filter_net_ops); | ||
| 111 | return ret; | ||
| 101 | } | 112 | } |
| 102 | 113 | ||
| 103 | static void __exit iptable_filter_fini(void) | 114 | static void __exit iptable_filter_fini(void) |
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 85d88f20644..aef5d1fbe77 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c | |||
| @@ -104,7 +104,9 @@ static int __net_init iptable_mangle_net_init(struct net *net) | |||
| 104 | net->ipv4.iptable_mangle = | 104 | net->ipv4.iptable_mangle = |
| 105 | ipt_register_table(net, &packet_mangler, repl); | 105 | ipt_register_table(net, &packet_mangler, repl); |
| 106 | kfree(repl); | 106 | kfree(repl); |
| 107 | return PTR_RET(net->ipv4.iptable_mangle); | 107 | if (IS_ERR(net->ipv4.iptable_mangle)) |
| 108 | return PTR_ERR(net->ipv4.iptable_mangle); | ||
| 109 | return 0; | ||
| 108 | } | 110 | } |
| 109 | 111 | ||
| 110 | static void __net_exit iptable_mangle_net_exit(struct net *net) | 112 | static void __net_exit iptable_mangle_net_exit(struct net *net) |
| @@ -129,10 +131,14 @@ static int __init iptable_mangle_init(void) | |||
| 129 | mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); | 131 | mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); |
| 130 | if (IS_ERR(mangle_ops)) { | 132 | if (IS_ERR(mangle_ops)) { |
| 131 | ret = PTR_ERR(mangle_ops); | 133 | ret = PTR_ERR(mangle_ops); |
| 132 | unregister_pernet_subsys(&iptable_mangle_net_ops); | 134 | goto cleanup_table; |
| 133 | } | 135 | } |
| 134 | 136 | ||
| 135 | return ret; | 137 | return ret; |
| 138 | |||
| 139 | cleanup_table: | ||
| 140 | unregister_pernet_subsys(&iptable_mangle_net_ops); | ||
| 141 | return ret; | ||
| 136 | } | 142 | } |
| 137 | 143 | ||
| 138 | static void __exit iptable_mangle_fini(void) | 144 | static void __exit iptable_mangle_fini(void) |
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c deleted file mode 100644 index eeaff7e4acb..00000000000 --- a/net/ipv4/netfilter/iptable_nat.c +++ /dev/null | |||
| @@ -1,329 +0,0 @@ | |||
| 1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
| 2 | * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> | ||
| 3 | * (C) 2011 Patrick McHardy <kaber@trash.net> | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License version 2 as | ||
| 7 | * published by the Free Software Foundation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/netfilter.h> | ||
| 12 | #include <linux/netfilter_ipv4.h> | ||
| 13 | #include <linux/netfilter_ipv4/ip_tables.h> | ||
| 14 | #include <linux/ip.h> | ||
| 15 | #include <net/ip.h> | ||
| 16 | |||
| 17 | #include <net/netfilter/nf_nat.h> | ||
| 18 | #include <net/netfilter/nf_nat_core.h> | ||
| 19 | #include <net/netfilter/nf_nat_l3proto.h> | ||
| 20 | |||
| 21 | static const struct xt_table nf_nat_ipv4_table = { | ||
| 22 | .name = "nat", | ||
| 23 | .valid_hooks = (1 << NF_INET_PRE_ROUTING) | | ||
| 24 | (1 << NF_INET_POST_ROUTING) | | ||
| 25 | (1 << NF_INET_LOCAL_OUT) | | ||
| 26 | (1 << NF_INET_LOCAL_IN), | ||
| 27 | .me = THIS_MODULE, | ||
| 28 | .af = NFPROTO_IPV4, | ||
| 29 | }; | ||
| 30 | |||
| 31 | static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) | ||
| 32 | { | ||
| 33 | /* Force range to this IP; let proto decide mapping for | ||
| 34 | * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). | ||
| 35 | */ | ||
| 36 | struct nf_nat_range range; | ||
| 37 | |||
| 38 | range.flags = 0; | ||
| 39 | pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, | ||
| 40 | HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? | ||
| 41 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : | ||
| 42 | &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); | ||
| 43 | |||
| 44 | return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); | ||
| 45 | } | ||
| 46 | |||
| 47 | static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, | ||
| 48 | const struct net_device *in, | ||
| 49 | const struct net_device *out, | ||
| 50 | struct nf_conn *ct) | ||
| 51 | { | ||
| 52 | struct net *net = nf_ct_net(ct); | ||
| 53 | unsigned int ret; | ||
| 54 | |||
| 55 | ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); | ||
| 56 | if (ret == NF_ACCEPT) { | ||
| 57 | if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) | ||
| 58 | ret = alloc_null_binding(ct, hooknum); | ||
| 59 | } | ||
| 60 | return ret; | ||
| 61 | } | ||
| 62 | |||
| 63 | static unsigned int | ||
| 64 | nf_nat_ipv4_fn(unsigned int hooknum, | ||
| 65 | struct sk_buff *skb, | ||
| 66 | const struct net_device *in, | ||
| 67 | const struct net_device *out, | ||
| 68 | int (*okfn)(struct sk_buff *)) | ||
| 69 | { | ||
| 70 | struct nf_conn *ct; | ||
| 71 | enum ip_conntrack_info ctinfo; | ||
| 72 | struct nf_conn_nat *nat; | ||
| 73 | /* maniptype == SRC for postrouting. */ | ||
| 74 | enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); | ||
| 75 | |||
| 76 | /* We never see fragments: conntrack defrags on pre-routing | ||
| 77 | * and local-out, and nf_nat_out protects post-routing. | ||
| 78 | */ | ||
| 79 | NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); | ||
| 80 | |||
| 81 | ct = nf_ct_get(skb, &ctinfo); | ||
| 82 | /* Can't track? It's not due to stress, or conntrack would | ||
| 83 | * have dropped it. Hence it's the user's responsibilty to | ||
| 84 | * packet filter it out, or implement conntrack/NAT for that | ||
| 85 | * protocol. 8) --RR | ||
| 86 | */ | ||
| 87 | if (!ct) | ||
| 88 | return NF_ACCEPT; | ||
| 89 | |||
| 90 | /* Don't try to NAT if this packet is not conntracked */ | ||
| 91 | if (nf_ct_is_untracked(ct)) | ||
| 92 | return NF_ACCEPT; | ||
| 93 | |||
| 94 | nat = nfct_nat(ct); | ||
| 95 | if (!nat) { | ||
| 96 | /* NAT module was loaded late. */ | ||
| 97 | if (nf_ct_is_confirmed(ct)) | ||
| 98 | return NF_ACCEPT; | ||
| 99 | nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); | ||
| 100 | if (nat == NULL) { | ||
| 101 | pr_debug("failed to add NAT extension\n"); | ||
| 102 | return NF_ACCEPT; | ||
| 103 | } | ||
| 104 | } | ||
| 105 | |||
| 106 | switch (ctinfo) { | ||
| 107 | case IP_CT_RELATED: | ||
| 108 | case IP_CT_RELATED_REPLY: | ||
| 109 | if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { | ||
| 110 | if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, | ||
| 111 | hooknum)) | ||
| 112 | return NF_DROP; | ||
| 113 | else | ||
| 114 | return NF_ACCEPT; | ||
| 115 | } | ||
| 116 | /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ | ||
| 117 | case IP_CT_NEW: | ||
| 118 | /* Seen it before? This can happen for loopback, retrans, | ||
| 119 | * or local packets. | ||
| 120 | */ | ||
| 121 | if (!nf_nat_initialized(ct, maniptype)) { | ||
| 122 | unsigned int ret; | ||
| 123 | |||
| 124 | ret = nf_nat_rule_find(skb, hooknum, in, out, ct); | ||
| 125 | if (ret != NF_ACCEPT) | ||
| 126 | return ret; | ||
| 127 | } else { | ||
| 128 | pr_debug("Already setup manip %s for ct %p\n", | ||
| 129 | maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", | ||
| 130 | ct); | ||
| 131 | if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) | ||
| 132 | goto oif_changed; | ||
| 133 | } | ||
| 134 | break; | ||
| 135 | |||
| 136 | default: | ||
| 137 | /* ESTABLISHED */ | ||
| 138 | NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || | ||
| 139 | ctinfo == IP_CT_ESTABLISHED_REPLY); | ||
| 140 | if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) | ||
| 141 | goto oif_changed; | ||
| 142 | } | ||
| 143 | |||
| 144 | return nf_nat_packet(ct, ctinfo, hooknum, skb); | ||
| 145 | |||
| 146 | oif_changed: | ||
| 147 | nf_ct_kill_acct(ct, ctinfo, skb); | ||
| 148 | return NF_DROP; | ||
| 149 | } | ||
| 150 | |||
| 151 | static unsigned int | ||
| 152 | nf_nat_ipv4_in(unsigned int hooknum, | ||
| 153 | struct sk_buff *skb, | ||
| 154 | const struct net_device *in, | ||
| 155 | const struct net_device *out, | ||
| 156 | int (*okfn)(struct sk_buff *)) | ||
| 157 | { | ||
| 158 | unsigned int ret; | ||
| 159 | __be32 daddr = ip_hdr(skb)->daddr; | ||
| 160 | |||
| 161 | ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); | ||
| 162 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
| 163 | daddr != ip_hdr(skb)->daddr) | ||
| 164 | skb_dst_drop(skb); | ||
| 165 | |||
| 166 | return ret; | ||
| 167 | } | ||
| 168 | |||
| 169 | static unsigned int | ||
| 170 | nf_nat_ipv4_out(unsigned int hooknum, | ||
| 171 | struct sk_buff *skb, | ||
| 172 | const struct net_device *in, | ||
| 173 | const struct net_device *out, | ||
| 174 | int (*okfn)(struct sk_buff *)) | ||
| 175 | { | ||
| 176 | #ifdef CONFIG_XFRM | ||
| 177 | const struct nf_conn *ct; | ||
| 178 | enum ip_conntrack_info ctinfo; | ||
| 179 | #endif | ||
| 180 | unsigned int ret; | ||
| 181 | |||
| 182 | /* root is playing with raw sockets. */ | ||
| 183 | if (skb->len < sizeof(struct iphdr) || | ||
| 184 | ip_hdrlen(skb) < sizeof(struct iphdr)) | ||
| 185 | return NF_ACCEPT; | ||
| 186 | |||
| 187 | ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); | ||
| 188 | #ifdef CONFIG_XFRM | ||
| 189 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
| 190 | !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && | ||
| 191 | (ct = nf_ct_get(skb, &ctinfo)) != NULL) { | ||
| 192 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
| 193 | |||
| 194 | if ((ct->tuplehash[dir].tuple.src.u3.ip != | ||
| 195 | ct->tuplehash[!dir].tuple.dst.u3.ip) || | ||
| 196 | (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && | ||
| 197 | ct->tuplehash[dir].tuple.src.u.all != | ||
| 198 | ct->tuplehash[!dir].tuple.dst.u.all)) | ||
| 199 | if (nf_xfrm_me_harder(skb, AF_INET) < 0) | ||
| 200 | ret = NF_DROP; | ||
| 201 | } | ||
| 202 | #endif | ||
| 203 | return ret; | ||
| 204 | } | ||
| 205 | |||
| 206 | static unsigned int | ||
| 207 | nf_nat_ipv4_local_fn(unsigned int hooknum, | ||
| 208 | struct sk_buff *skb, | ||
| 209 | const struct net_device *in, | ||
| 210 | const struct net_device *out, | ||
| 211 | int (*okfn)(struct sk_buff *)) | ||
| 212 | { | ||
| 213 | const struct nf_conn *ct; | ||
| 214 | enum ip_conntrack_info ctinfo; | ||
| 215 | unsigned int ret; | ||
| 216 | |||
| 217 | /* root is playing with raw sockets. */ | ||
| 218 | if (skb->len < sizeof(struct iphdr) || | ||
| 219 | ip_hdrlen(skb) < sizeof(struct iphdr)) | ||
| 220 | return NF_ACCEPT; | ||
| 221 | |||
| 222 | ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); | ||
| 223 | if (ret != NF_DROP && ret != NF_STOLEN && | ||
| 224 | (ct = nf_ct_get(skb, &ctinfo)) != NULL) { | ||
| 225 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
| 226 | |||
| 227 | if (ct->tuplehash[dir].tuple.dst.u3.ip != | ||
| 228 | ct->tuplehash[!dir].tuple.src.u3.ip) { | ||
| 229 | if (ip_route_me_harder(skb, RTN_UNSPEC)) | ||
| 230 | ret = NF_DROP; | ||
| 231 | } | ||
| 232 | #ifdef CONFIG_XFRM | ||
| 233 | else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && | ||
| 234 | ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && | ||
| 235 | ct->tuplehash[dir].tuple.dst.u.all != | ||
| 236 | ct->tuplehash[!dir].tuple.src.u.all) | ||
| 237 | if (nf_xfrm_me_harder(skb, AF_INET) < 0) | ||
| 238 | ret = NF_DROP; | ||
| 239 | #endif | ||
| 240 | } | ||
| 241 | return ret; | ||
| 242 | } | ||
| 243 | |||
| 244 | static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { | ||
| 245 | /* Before packet filtering, change destination */ | ||
| 246 | { | ||
| 247 | .hook = nf_nat_ipv4_in, | ||
| 248 | .owner = THIS_MODULE, | ||
| 249 | .pf = NFPROTO_IPV4, | ||
| 250 | .hooknum = NF_INET_PRE_ROUTING, | ||
| 251 | .priority = NF_IP_PRI_NAT_DST, | ||
| 252 | }, | ||
| 253 | /* After packet filtering, change source */ | ||
| 254 | { | ||
| 255 | .hook = nf_nat_ipv4_out, | ||
| 256 | .owner = THIS_MODULE, | ||
| 257 | .pf = NFPROTO_IPV4, | ||
| 258 | .hooknum = NF_INET_POST_ROUTING, | ||
| 259 | .priority = NF_IP_PRI_NAT_SRC, | ||
| 260 | }, | ||
| 261 | /* Before packet filtering, change destination */ | ||
| 262 | { | ||
| 263 | .hook = nf_nat_ipv4_local_fn, | ||
| 264 | .owner = THIS_MODULE, | ||
| 265 | .pf = NFPROTO_IPV4, | ||
| 266 | .hooknum = NF_INET_LOCAL_OUT, | ||
| 267 | .priority = NF_IP_PRI_NAT_DST, | ||
| 268 | }, | ||
| 269 | /* After packet filtering, change source */ | ||
| 270 | { | ||
| 271 | .hook = nf_nat_ipv4_fn, | ||
| 272 | .owner = THIS_MODULE, | ||
| 273 | .pf = NFPROTO_IPV4, | ||
| 274 | .hooknum = NF_INET_LOCAL_IN, | ||
| 275 | .priority = NF_IP_PRI_NAT_SRC, | ||
| 276 | }, | ||
| 277 | }; | ||
| 278 | |||
| 279 | static int __net_init iptable_nat_net_init(struct net *net) | ||
| 280 | { | ||
| 281 | struct ipt_replace *repl; | ||
| 282 | |||
| 283 | repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); | ||
| 284 | if (repl == NULL) | ||
| 285 | return -ENOMEM; | ||
| 286 | net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); | ||
| 287 | kfree(repl); | ||
| 288 | return PTR_RET(net->ipv4.nat_table); | ||
| 289 | } | ||
| 290 | |||
| 291 | static void __net_exit iptable_nat_net_exit(struct net *net) | ||
| 292 | { | ||
| 293 | ipt_unregister_table(net, net->ipv4.nat_table); | ||
| 294 | } | ||
| 295 | |||
| 296 | static struct pernet_operations iptable_nat_net_ops = { | ||
| 297 | .init = iptable_nat_net_init, | ||
| 298 | .exit = iptable_nat_net_exit, | ||
| 299 | }; | ||
| 300 | |||
| 301 | static int __init iptable_nat_init(void) | ||
| 302 | { | ||
| 303 | int err; | ||
| 304 | |||
| 305 | err = register_pernet_subsys(&iptable_nat_net_ops); | ||
| 306 | if (err < 0) | ||
| 307 | goto err1; | ||
| 308 | |||
| 309 | err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); | ||
| 310 | if (err < 0) | ||
| 311 | goto err2; | ||
| 312 | return 0; | ||
| 313 | |||
| 314 | err2: | ||
| 315 | unregister_pernet_subsys(&iptable_nat_net_ops); | ||
| 316 | err1: | ||
| 317 | return err; | ||
| 318 | } | ||
| 319 | |||
| 320 | static void __exit iptable_nat_exit(void) | ||
| 321 | { | ||
| 322 | nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); | ||
| 323 | unregister_pernet_subsys(&iptable_nat_net_ops); | ||
| 324 | } | ||
| 325 | |||
| 326 | module_init(iptable_nat_init); | ||
| 327 | module_exit(iptable_nat_exit); | ||
| 328 | |||
| 329 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 03d9696d3c6..07fb710cd72 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c | |||
| @@ -48,7 +48,9 @@ static int __net_init iptable_raw_net_init(struct net *net) | |||
| 48 | net->ipv4.iptable_raw = | 48 | net->ipv4.iptable_raw = |
| 49 | ipt_register_table(net, &packet_raw, repl); | 49 | ipt_register_table(net, &packet_raw, repl); |
| 50 | kfree(repl); | 50 | kfree(repl); |
| 51 | return PTR_RET(net->ipv4.iptable_raw); | 51 | if (IS_ERR(net->ipv4.iptable_raw)) |
| 52 | return PTR_ERR(net->ipv4.iptable_raw); | ||
| 53 | return 0; | ||
| 52 | } | 54 | } |
| 53 | 55 | ||
| 54 | static void __net_exit iptable_raw_net_exit(struct net *net) | 56 | static void __net_exit iptable_raw_net_exit(struct net *net) |
| @@ -73,10 +75,14 @@ static int __init iptable_raw_init(void) | |||
| 73 | rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); | 75 | rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); |
| 74 | if (IS_ERR(rawtable_ops)) { | 76 | if (IS_ERR(rawtable_ops)) { |
| 75 | ret = PTR_ERR(rawtable_ops); | 77 | ret = PTR_ERR(rawtable_ops); |
| 76 | unregister_pernet_subsys(&iptable_raw_net_ops); | 78 | goto cleanup_table; |
| 77 | } | 79 | } |
| 78 | 80 | ||
| 79 | return ret; | 81 | return ret; |
| 82 | |||
| 83 | cleanup_table: | ||
| 84 | unregister_pernet_subsys(&iptable_raw_net_ops); | ||
| 85 | return ret; | ||
| 80 | } | 86 | } |
| 81 | 87 | ||
| 82 | static void __exit iptable_raw_fini(void) | 88 | static void __exit iptable_raw_fini(void) |
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index b283d8e2601..be45bdc4c60 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c | |||
| @@ -66,7 +66,10 @@ static int __net_init iptable_security_net_init(struct net *net) | |||
| 66 | net->ipv4.iptable_security = | 66 | net->ipv4.iptable_security = |
| 67 | ipt_register_table(net, &security_table, repl); | 67 | ipt_register_table(net, &security_table, repl); |
| 68 | kfree(repl); | 68 | kfree(repl); |
| 69 | return PTR_RET(net->ipv4.iptable_security); | 69 | if (IS_ERR(net->ipv4.iptable_security)) |
| 70 | return PTR_ERR(net->ipv4.iptable_security); | ||
| 71 | |||
| 72 | return 0; | ||
| 70 | } | 73 | } |
| 71 | 74 | ||
| 72 | static void __net_exit iptable_security_net_exit(struct net *net) | 75 | static void __net_exit iptable_security_net_exit(struct net *net) |
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fcdd0c2406e..de9da21113a 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
| @@ -29,6 +29,11 @@ | |||
| 29 | #include <net/netfilter/ipv4/nf_defrag_ipv4.h> | 29 | #include <net/netfilter/ipv4/nf_defrag_ipv4.h> |
| 30 | #include <net/netfilter/nf_log.h> | 30 | #include <net/netfilter/nf_log.h> |
| 31 | 31 | ||
| 32 | int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, | ||
| 33 | struct nf_conn *ct, | ||
| 34 | enum ip_conntrack_info ctinfo); | ||
| 35 | EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook); | ||
| 36 | |||
| 32 | static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, | 37 | static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, |
| 33 | struct nf_conntrack_tuple *tuple) | 38 | struct nf_conntrack_tuple *tuple) |
| 34 | { | 39 | { |
| @@ -69,32 +74,24 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, | |||
| 69 | 74 | ||
| 70 | iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); | 75 | iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); |
| 71 | if (iph == NULL) | 76 | if (iph == NULL) |
| 72 | return -NF_ACCEPT; | 77 | return -NF_DROP; |
| 73 | 78 | ||
| 74 | /* Conntrack defragments packets, we might still see fragments | 79 | /* Conntrack defragments packets, we might still see fragments |
| 75 | * inside ICMP packets though. */ | 80 | * inside ICMP packets though. */ |
| 76 | if (iph->frag_off & htons(IP_OFFSET)) | 81 | if (iph->frag_off & htons(IP_OFFSET)) |
| 77 | return -NF_ACCEPT; | 82 | return -NF_DROP; |
| 78 | 83 | ||
| 79 | *dataoff = nhoff + (iph->ihl << 2); | 84 | *dataoff = nhoff + (iph->ihl << 2); |
| 80 | *protonum = iph->protocol; | 85 | *protonum = iph->protocol; |
| 81 | 86 | ||
| 82 | /* Check bogus IP headers */ | ||
| 83 | if (*dataoff > skb->len) { | ||
| 84 | pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " | ||
| 85 | "nhoff %u, ihl %u, skblen %u\n", | ||
| 86 | nhoff, iph->ihl << 2, skb->len); | ||
| 87 | return -NF_ACCEPT; | ||
| 88 | } | ||
| 89 | |||
| 90 | return NF_ACCEPT; | 87 | return NF_ACCEPT; |
| 91 | } | 88 | } |
| 92 | 89 | ||
| 93 | static unsigned int ipv4_helper(unsigned int hooknum, | 90 | static unsigned int ipv4_confirm(unsigned int hooknum, |
| 94 | struct sk_buff *skb, | 91 | struct sk_buff *skb, |
| 95 | const struct net_device *in, | 92 | const struct net_device *in, |
| 96 | const struct net_device *out, | 93 | const struct net_device *out, |
| 97 | int (*okfn)(struct sk_buff *)) | 94 | int (*okfn)(struct sk_buff *)) |
| 98 | { | 95 | { |
| 99 | struct nf_conn *ct; | 96 | struct nf_conn *ct; |
| 100 | enum ip_conntrack_info ctinfo; | 97 | enum ip_conntrack_info ctinfo; |
| @@ -105,38 +102,24 @@ static unsigned int ipv4_helper(unsigned int hooknum, | |||
| 105 | /* This is where we call the helper: as the packet goes out. */ | 102 | /* This is where we call the helper: as the packet goes out. */ |
| 106 | ct = nf_ct_get(skb, &ctinfo); | 103 | ct = nf_ct_get(skb, &ctinfo); |
| 107 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) | 104 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) |
| 108 | return NF_ACCEPT; | 105 | goto out; |
| 109 | 106 | ||
| 110 | help = nfct_help(ct); | 107 | help = nfct_help(ct); |
| 111 | if (!help) | 108 | if (!help) |
| 112 | return NF_ACCEPT; | 109 | goto out; |
| 113 | 110 | ||
| 114 | /* rcu_read_lock()ed by nf_hook_slow */ | 111 | /* rcu_read_lock()ed by nf_hook_slow */ |
| 115 | helper = rcu_dereference(help->helper); | 112 | helper = rcu_dereference(help->helper); |
| 116 | if (!helper) | 113 | if (!helper) |
| 117 | return NF_ACCEPT; | 114 | goto out; |
| 118 | 115 | ||
| 119 | ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), | 116 | ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), |
| 120 | ct, ctinfo); | 117 | ct, ctinfo); |
| 121 | if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) { | 118 | if (ret != NF_ACCEPT) { |
| 122 | nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, | 119 | nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, |
| 123 | "nf_ct_%s: dropping packet", helper->name); | 120 | "nf_ct_%s: dropping packet", helper->name); |
| 121 | return ret; | ||
| 124 | } | 122 | } |
| 125 | return ret; | ||
| 126 | } | ||
| 127 | |||
| 128 | static unsigned int ipv4_confirm(unsigned int hooknum, | ||
| 129 | struct sk_buff *skb, | ||
| 130 | const struct net_device *in, | ||
| 131 | const struct net_device *out, | ||
| 132 | int (*okfn)(struct sk_buff *)) | ||
| 133 | { | ||
| 134 | struct nf_conn *ct; | ||
| 135 | enum ip_conntrack_info ctinfo; | ||
| 136 | |||
| 137 | ct = nf_ct_get(skb, &ctinfo); | ||
| 138 | if (!ct || ctinfo == IP_CT_RELATED_REPLY) | ||
| 139 | goto out; | ||
| 140 | 123 | ||
| 141 | /* adjust seqs for loopback traffic only in outgoing direction */ | 124 | /* adjust seqs for loopback traffic only in outgoing direction */ |
| 142 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && | 125 | if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && |
| @@ -144,8 +127,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, | |||
| 144 | typeof(nf_nat_seq_adjust_hook) seq_adjust; | 127 | typeof(nf_nat_seq_adjust_hook) seq_adjust; |
| 145 | 128 | ||
| 146 | seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); | 129 | seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); |
| 147 | if (!seq_adjust || | 130 | if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) { |
| 148 | !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { | ||
| 149 | NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); | 131 | NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); |
| 150 | return NF_DROP; | 132 | return NF_DROP; |
| 151 | } | 133 | } |
| @@ -195,13 +177,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { | |||
| 195 | .priority = NF_IP_PRI_CONNTRACK, | 177 | .priority = NF_IP_PRI_CONNTRACK, |
| 196 | }, | 178 | }, |
| 197 | { | 179 | { |
| 198 | .hook = ipv4_helper, | ||
| 199 | .owner = THIS_MODULE, | ||
| 200 | .pf = NFPROTO_IPV4, | ||
| 201 | .hooknum = NF_INET_POST_ROUTING, | ||
| 202 | .priority = NF_IP_PRI_CONNTRACK_HELPER, | ||
| 203 | }, | ||
| 204 | { | ||
| 205 | .hook = ipv4_confirm, | 180 | .hook = ipv4_confirm, |
| 206 | .owner = THIS_MODULE, | 181 | .owner = THIS_MODULE, |
| 207 | .pf = NFPROTO_IPV4, | 182 | .pf = NFPROTO_IPV4, |
| @@ -209,13 +184,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { | |||
| 209 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM, | 184 | .priority = NF_IP_PRI_CONNTRACK_CONFIRM, |
| 210 | }, | 185 | }, |
| 211 | { | 186 | { |
| 212 | .hook = ipv4_helper, | ||
| 213 | .owner = THIS_MODULE, | ||
| 214 | .pf = NFPROTO_IPV4, | ||
| 215 | .hooknum = NF_INET_LOCAL_IN, | ||
| 216 | .priority = NF_IP_PRI_CONNTRACK_HELPER, | ||
| 217 | }, | ||
| 218 | { | ||
| 219 | .hook = ipv4_confirm, | 187 | .hook = ipv4_confirm, |
| 220 | .owner = THIS_MODULE, | 188 | .owner = THIS_MODULE, |
| 221 | .pf = NFPROTO_IPV4, | 189 | .pf = NFPROTO_IPV4, |
| @@ -231,30 +199,35 @@ static int log_invalid_proto_max = 255; | |||
| 231 | static ctl_table ip_ct_sysctl_table[] = { | 199 | static ctl_table ip_ct_sysctl_table[] = { |
| 232 | { | 200 | { |
| 233 | .procname = "ip_conntrack_max", | 201 | .procname = "ip_conntrack_max", |
| 202 | .data = &nf_conntrack_max, | ||
| 234 | .maxlen = sizeof(int), | 203 | .maxlen = sizeof(int), |
| 235 | .mode = 0644, | 204 | .mode = 0644, |
| 236 | .proc_handler = proc_dointvec, | 205 | .proc_handler = proc_dointvec, |
| 237 | }, | 206 | }, |
| 238 | { | 207 | { |
| 239 | .procname = "ip_conntrack_count", | 208 | .procname = "ip_conntrack_count", |
| 209 | .data = &init_net.ct.count, | ||
| 240 | .maxlen = sizeof(int), | 210 | .maxlen = sizeof(int), |
| 241 | .mode = 0444, | 211 | .mode = 0444, |
| 242 | .proc_handler = proc_dointvec, | 212 | .proc_handler = proc_dointvec, |
| 243 | }, | 213 | }, |
| 244 | { | 214 | { |
| 245 | .procname = "ip_conntrack_buckets", | 215 | .procname = "ip_conntrack_buckets", |
| 216 | .data = &init_net.ct.htable_size, | ||
| 246 | .maxlen = sizeof(unsigned int), | 217 | .maxlen = sizeof(unsigned int), |
| 247 | .mode = 0444, | 218 | .mode = 0444, |
| 248 | .proc_handler = proc_dointvec, | 219 | .proc_handler = proc_dointvec, |
| 249 | }, | 220 | }, |
| 250 | { | 221 | { |
| 251 | .procname = "ip_conntrack_checksum", | 222 | .procname = "ip_conntrack_checksum", |
| 223 | .data = &init_net.ct.sysctl_checksum, | ||
| 252 | .maxlen = sizeof(int), | 224 | .maxlen = sizeof(int), |
| 253 | .mode = 0644, | 225 | .mode = 0644, |
| 254 | .proc_handler = proc_dointvec, | 226 | .proc_handler = proc_dointvec, |
| 255 | }, | 227 | }, |
| 256 | { | 228 | { |
| 257 | .procname = "ip_conntrack_log_invalid", | 229 | .procname = "ip_conntrack_log_invalid", |
| 230 | .data = &init_net.ct.sysctl_log_invalid, | ||
| 258 | .maxlen = sizeof(unsigned int), | 231 | .maxlen = sizeof(unsigned int), |
| 259 | .mode = 0644, | 232 | .mode = 0644, |
| 260 | .proc_handler = proc_dointvec_minmax, | 233 | .proc_handler = proc_dointvec_minmax, |
| @@ -330,9 +303,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) | |||
| 330 | static int ipv4_tuple_to_nlattr(struct sk_buff *skb, | 303 | static int ipv4_tuple_to_nlattr(struct sk_buff *skb, |
| 331 | const struct nf_conntrack_tuple *tuple) | 304 | const struct nf_conntrack_tuple *tuple) |
| 332 | { | 305 | { |
| 333 | if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || | 306 | NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip); |
| 334 | nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) | 307 | NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip); |
| 335 | goto nla_put_failure; | ||
| 336 | return 0; | 308 | return 0; |
| 337 | 309 | ||
| 338 | nla_put_failure: | 310 | nla_put_failure: |
| @@ -370,25 +342,6 @@ static struct nf_sockopt_ops so_getorigdst = { | |||
| 370 | .owner = THIS_MODULE, | 342 | .owner = THIS_MODULE, |
| 371 | }; | 343 | }; |
| 372 | 344 | ||
| 373 | static int ipv4_init_net(struct net *net) | ||
| 374 | { | ||
| 375 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | ||
| 376 | struct nf_ip_net *in = &net->ct.nf_ct_proto; | ||
| 377 | in->ctl_table = kmemdup(ip_ct_sysctl_table, | ||
| 378 | sizeof(ip_ct_sysctl_table), | ||
| 379 | GFP_KERNEL); | ||
| 380 | if (!in->ctl_table) | ||
| 381 | return -ENOMEM; | ||
| 382 | |||
| 383 | in->ctl_table[0].data = &nf_conntrack_max; | ||
| 384 | in->ctl_table[1].data = &net->ct.count; | ||
| 385 | in->ctl_table[2].data = &net->ct.htable_size; | ||
| 386 | in->ctl_table[3].data = &net->ct.sysctl_checksum; | ||
| 387 | in->ctl_table[4].data = &net->ct.sysctl_log_invalid; | ||
| 388 | #endif | ||
| 389 | return 0; | ||
| 390 | } | ||
| 391 | |||
| 392 | struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { | 345 | struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { |
| 393 | .l3proto = PF_INET, | 346 | .l3proto = PF_INET, |
| 394 | .name = "ipv4", | 347 | .name = "ipv4", |
| @@ -403,9 +356,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { | |||
| 403 | .nla_policy = ipv4_nla_policy, | 356 | .nla_policy = ipv4_nla_policy, |
| 404 | #endif | 357 | #endif |
| 405 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | 358 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) |
| 406 | .ctl_table_path = "net/ipv4/netfilter", | 359 | .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path, |
| 360 | .ctl_table = ip_ct_sysctl_table, | ||
| 407 | #endif | 361 | #endif |
| 408 | .init_net = ipv4_init_net, | ||
| 409 | .me = THIS_MODULE, | 362 | .me = THIS_MODULE, |
| 410 | }; | 363 | }; |
| 411 | 364 | ||
| @@ -416,65 +369,6 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); | |||
| 416 | MODULE_ALIAS("ip_conntrack"); | 369 | MODULE_ALIAS("ip_conntrack"); |
| 417 | MODULE_LICENSE("GPL"); | 370 | MODULE_LICENSE("GPL"); |
| 418 | 371 | ||
| 419 | static int ipv4_net_init(struct net *net) | ||
| 420 | { | ||
| 421 | int ret = 0; | ||
| 422 | |||
| 423 | ret = nf_conntrack_l4proto_register(net, | ||
| 424 | &nf_conntrack_l4proto_tcp4); | ||
| 425 | if (ret < 0) { | ||
| 426 | pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n"); | ||
| 427 | goto out_tcp; | ||
| 428 | } | ||
| 429 | ret = nf_conntrack_l4proto_register(net, | ||
| 430 | &nf_conntrack_l4proto_udp4); | ||
| 431 | if (ret < 0) { | ||
| 432 | pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n"); | ||
| 433 | goto out_udp; | ||
| 434 | } | ||
| 435 | ret = nf_conntrack_l4proto_register(net, | ||
| 436 | &nf_conntrack_l4proto_icmp); | ||
| 437 | if (ret < 0) { | ||
| 438 | pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n"); | ||
| 439 | goto out_icmp; | ||
| 440 | } | ||
| 441 | ret = nf_conntrack_l3proto_register(net, | ||
| 442 | &nf_conntrack_l3proto_ipv4); | ||
| 443 | if (ret < 0) { | ||
| 444 | pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n"); | ||
| 445 | goto out_ipv4; | ||
| 446 | } | ||
| 447 | return 0; | ||
| 448 | out_ipv4: | ||
| 449 | nf_conntrack_l4proto_unregister(net, | ||
| 450 | &nf_conntrack_l4proto_icmp); | ||
| 451 | out_icmp: | ||
| 452 | nf_conntrack_l4proto_unregister(net, | ||
| 453 | &nf_conntrack_l4proto_udp4); | ||
| 454 | out_udp: | ||
| 455 | nf_conntrack_l4proto_unregister(net, | ||
| 456 | &nf_conntrack_l4proto_tcp4); | ||
| 457 | out_tcp: | ||
| 458 | return ret; | ||
| 459 | } | ||
| 460 | |||
| 461 | static void ipv4_net_exit(struct net *net) | ||
| 462 | { | ||
| 463 | nf_conntrack_l3proto_unregister(net, | ||
| 464 | &nf_conntrack_l3proto_ipv4); | ||
| 465 | nf_conntrack_l4proto_unregister(net, | ||
| 466 | &nf_conntrack_l4proto_icmp); | ||
| 467 | nf_conntrack_l4proto_unregister(net, | ||
| 468 | &nf_conntrack_l4proto_udp4); | ||
| 469 | nf_conntrack_l4proto_unregister(net, | ||
| 470 | &nf_conntrack_l4proto_tcp4); | ||
| 471 | } | ||
| 472 | |||
| 473 | static struct pernet_operations ipv4_net_ops = { | ||
| 474 | .init = ipv4_net_init, | ||
| 475 | .exit = ipv4_net_exit, | ||
| 476 | }; | ||
| 477 | |||
| 478 | static int __init nf_conntrack_l3proto_ipv4_init(void) | 372 | static int __init nf_conntrack_l3proto_ipv4_init(void) |
| 479 | { | 373 | { |
| 480 | int ret = 0; | 374 | int ret = 0; |
| @@ -488,17 +382,35 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) | |||
| 488 | return ret; | 382 | return ret; |
| 489 | } | 383 | } |
| 490 | 384 | ||
| 491 | ret = register_pernet_subsys(&ipv4_net_ops); | 385 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); |
| 492 | if (ret < 0) { | 386 | if (ret < 0) { |
| 493 | pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); | 387 | pr_err("nf_conntrack_ipv4: can't register tcp.\n"); |
| 494 | goto cleanup_sockopt; | 388 | goto cleanup_sockopt; |
| 495 | } | 389 | } |
| 496 | 390 | ||
| 391 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); | ||
| 392 | if (ret < 0) { | ||
| 393 | pr_err("nf_conntrack_ipv4: can't register udp.\n"); | ||
| 394 | goto cleanup_tcp; | ||
| 395 | } | ||
| 396 | |||
| 397 | ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); | ||
| 398 | if (ret < 0) { | ||
| 399 | pr_err("nf_conntrack_ipv4: can't register icmp.\n"); | ||
| 400 | goto cleanup_udp; | ||
| 401 | } | ||
| 402 | |||
| 403 | ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); | ||
| 404 | if (ret < 0) { | ||
| 405 | pr_err("nf_conntrack_ipv4: can't register ipv4\n"); | ||
| 406 | goto cleanup_icmp; | ||
| 407 | } | ||
| 408 | |||
| 497 | ret = nf_register_hooks(ipv4_conntrack_ops, | 409 | ret = nf_register_hooks(ipv4_conntrack_ops, |
| 498 | ARRAY_SIZE(ipv4_conntrack_ops)); | 410 | ARRAY_SIZE(ipv4_conntrack_ops)); |
| 499 | if (ret < 0) { | 411 | if (ret < 0) { |
| 500 | pr_err("nf_conntrack_ipv4: can't register hooks.\n"); | 412 | pr_err("nf_conntrack_ipv4: can't register hooks.\n"); |
| 501 | goto cleanup_pernet; | 413 | goto cleanup_ipv4; |
| 502 | } | 414 | } |
| 503 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) | 415 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) |
| 504 | ret = nf_conntrack_ipv4_compat_init(); | 416 | ret = nf_conntrack_ipv4_compat_init(); |
| @@ -510,8 +422,14 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) | |||
| 510 | cleanup_hooks: | 422 | cleanup_hooks: |
| 511 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); | 423 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); |
| 512 | #endif | 424 | #endif |
| 513 | cleanup_pernet: | 425 | cleanup_ipv4: |
| 514 | unregister_pernet_subsys(&ipv4_net_ops); | 426 | nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); |
| 427 | cleanup_icmp: | ||
| 428 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); | ||
| 429 | cleanup_udp: | ||
| 430 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); | ||
| 431 | cleanup_tcp: | ||
| 432 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); | ||
| 515 | cleanup_sockopt: | 433 | cleanup_sockopt: |
| 516 | nf_unregister_sockopt(&so_getorigdst); | 434 | nf_unregister_sockopt(&so_getorigdst); |
| 517 | return ret; | 435 | return ret; |
| @@ -524,7 +442,10 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) | |||
| 524 | nf_conntrack_ipv4_compat_fini(); | 442 | nf_conntrack_ipv4_compat_fini(); |
| 525 | #endif | 443 | #endif |
| 526 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); | 444 | nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); |
| 527 | unregister_pernet_subsys(&ipv4_net_ops); | 445 | nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); |
| 446 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); | ||
| 447 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); | ||
| 448 | nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); | ||
| 528 | nf_unregister_sockopt(&so_getorigdst); | 449 | nf_unregister_sockopt(&so_getorigdst); |
| 529 | } | 450 | } |
| 530 | 451 | ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 9682b36df38..5585980fce2 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | |||
| @@ -21,7 +21,6 @@ | |||
| 21 | #include <net/netfilter/nf_conntrack_expect.h> | 21 | #include <net/netfilter/nf_conntrack_expect.h> |
| 22 | #include <net/netfilter/nf_conntrack_acct.h> | 22 | #include <net/netfilter/nf_conntrack_acct.h> |
| 23 | #include <linux/rculist_nulls.h> | 23 | #include <linux/rculist_nulls.h> |
| 24 | #include <linux/export.h> | ||
| 25 | 24 | ||
| 26 | struct ct_iter_state { | 25 | struct ct_iter_state { |
| 27 | struct seq_net_private p; | 26 | struct seq_net_private p; |
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 5241d997ab7..ab5b27a2916 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c | |||
| @@ -23,11 +23,6 @@ | |||
| 23 | 23 | ||
| 24 | static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; | 24 | static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; |
| 25 | 25 | ||
| 26 | static inline struct nf_icmp_net *icmp_pernet(struct net *net) | ||
| 27 | { | ||
| 28 | return &net->ct.nf_ct_proto.icmp; | ||
| 29 | } | ||
| 30 | |||
| 31 | static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, | 26 | static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, |
| 32 | struct nf_conntrack_tuple *tuple) | 27 | struct nf_conntrack_tuple *tuple) |
| 33 | { | 28 | { |
| @@ -80,31 +75,25 @@ static int icmp_print_tuple(struct seq_file *s, | |||
| 80 | ntohs(tuple->src.u.icmp.id)); | 75 | ntohs(tuple->src.u.icmp.id)); |
| 81 | } | 76 | } |
| 82 | 77 | ||
| 83 | static unsigned int *icmp_get_timeouts(struct net *net) | ||
| 84 | { | ||
| 85 | return &icmp_pernet(net)->timeout; | ||
| 86 | } | ||
| 87 | |||
| 88 | /* Returns verdict for packet, or -1 for invalid. */ | 78 | /* Returns verdict for packet, or -1 for invalid. */ |
| 89 | static int icmp_packet(struct nf_conn *ct, | 79 | static int icmp_packet(struct nf_conn *ct, |
| 90 | const struct sk_buff *skb, | 80 | const struct sk_buff *skb, |
| 91 | unsigned int dataoff, | 81 | unsigned int dataoff, |
| 92 | enum ip_conntrack_info ctinfo, | 82 | enum ip_conntrack_info ctinfo, |
| 93 | u_int8_t pf, | 83 | u_int8_t pf, |
| 94 | unsigned int hooknum, | 84 | unsigned int hooknum) |
| 95 | unsigned int *timeout) | ||
| 96 | { | 85 | { |
| 97 | /* Do not immediately delete the connection after the first | 86 | /* Do not immediately delete the connection after the first |
| 98 | successful reply to avoid excessive conntrackd traffic | 87 | successful reply to avoid excessive conntrackd traffic |
| 99 | and also to handle correctly ICMP echo reply duplicates. */ | 88 | and also to handle correctly ICMP echo reply duplicates. */ |
| 100 | nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); | 89 | nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); |
| 101 | 90 | ||
| 102 | return NF_ACCEPT; | 91 | return NF_ACCEPT; |
| 103 | } | 92 | } |
| 104 | 93 | ||
| 105 | /* Called when a new connection for this protocol found. */ | 94 | /* Called when a new connection for this protocol found. */ |
| 106 | static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, | 95 | static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, |
| 107 | unsigned int dataoff, unsigned int *timeouts) | 96 | unsigned int dataoff) |
| 108 | { | 97 | { |
| 109 | static const u_int8_t valid_new[] = { | 98 | static const u_int8_t valid_new[] = { |
| 110 | [ICMP_ECHO] = 1, | 99 | [ICMP_ECHO] = 1, |
| @@ -233,10 +222,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl, | |||
| 233 | static int icmp_tuple_to_nlattr(struct sk_buff *skb, | 222 | static int icmp_tuple_to_nlattr(struct sk_buff *skb, |
| 234 | const struct nf_conntrack_tuple *t) | 223 | const struct nf_conntrack_tuple *t) |
| 235 | { | 224 | { |
| 236 | if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || | 225 | NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id); |
| 237 | nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || | 226 | NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type); |
| 238 | nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) | 227 | NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code); |
| 239 | goto nla_put_failure; | 228 | |
| 240 | return 0; | 229 | return 0; |
| 241 | 230 | ||
| 242 | nla_put_failure: | 231 | nla_put_failure: |
| @@ -274,50 +263,12 @@ static int icmp_nlattr_tuple_size(void) | |||
| 274 | } | 263 | } |
| 275 | #endif | 264 | #endif |
| 276 | 265 | ||
| 277 | #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) | ||
| 278 | |||
| 279 | #include <linux/netfilter/nfnetlink.h> | ||
| 280 | #include <linux/netfilter/nfnetlink_cttimeout.h> | ||
| 281 | |||
| 282 | static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], | ||
| 283 | struct net *net, void *data) | ||
| 284 | { | ||
| 285 | unsigned int *timeout = data; | ||
| 286 | struct nf_icmp_net *in = icmp_pernet(net); | ||
| 287 | |||
| 288 | if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { | ||
| 289 | *timeout = | ||
| 290 | ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; | ||
| 291 | } else { | ||
| 292 | /* Set default ICMP timeout. */ | ||
| 293 | *timeout = in->timeout; | ||
| 294 | } | ||
| 295 | return 0; | ||
| 296 | } | ||
| 297 | |||
| 298 | static int | ||
| 299 | icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) | ||
| 300 | { | ||
| 301 | const unsigned int *timeout = data; | ||
| 302 | |||
| 303 | if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) | ||
| 304 | goto nla_put_failure; | ||
| 305 | return 0; | ||
| 306 | |||
| 307 | nla_put_failure: | ||
| 308 | return -ENOSPC; | ||
| 309 | } | ||
| 310 | |||
| 311 | static const struct nla_policy | ||
| 312 | icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { | ||
| 313 | [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, | ||
| 314 | }; | ||
| 315 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ | ||
| 316 | |||
| 317 | #ifdef CONFIG_SYSCTL | 266 | #ifdef CONFIG_SYSCTL |
| 267 | static struct ctl_table_header *icmp_sysctl_header; | ||
| 318 | static struct ctl_table icmp_sysctl_table[] = { | 268 | static struct ctl_table icmp_sysctl_table[] = { |
| 319 | { | 269 | { |
| 320 | .procname = "nf_conntrack_icmp_timeout", | 270 | .procname = "nf_conntrack_icmp_timeout", |
| 271 | .data = &nf_ct_icmp_timeout, | ||
| 321 | .maxlen = sizeof(unsigned int), | 272 | .maxlen = sizeof(unsigned int), |
| 322 | .mode = 0644, | 273 | .mode = 0644, |
| 323 | .proc_handler = proc_dointvec_jiffies, | 274 | .proc_handler = proc_dointvec_jiffies, |
| @@ -328,6 +279,7 @@ static struct ctl_table icmp_sysctl_table[] = { | |||
| 328 | static struct ctl_table icmp_compat_sysctl_table[] = { | 279 | static struct ctl_table icmp_compat_sysctl_table[] = { |
| 329 | { | 280 | { |
| 330 | .procname = "ip_conntrack_icmp_timeout", | 281 | .procname = "ip_conntrack_icmp_timeout", |
| 282 | .data = &nf_ct_icmp_timeout, | ||
| 331 | .maxlen = sizeof(unsigned int), | 283 | .maxlen = sizeof(unsigned int), |
| 332 | .mode = 0644, | 284 | .mode = 0644, |
| 333 | .proc_handler = proc_dointvec_jiffies, | 285 | .proc_handler = proc_dointvec_jiffies, |
| @@ -337,62 +289,6 @@ static struct ctl_table icmp_compat_sysctl_table[] = { | |||
| 337 | #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ | 289 | #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ |
| 338 | #endif /* CONFIG_SYSCTL */ | 290 | #endif /* CONFIG_SYSCTL */ |
| 339 | 291 | ||
| 340 | static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, | ||
| 341 | struct nf_icmp_net *in) | ||
| 342 | { | ||
| 343 | #ifdef CONFIG_SYSCTL | ||
| 344 | pn->ctl_table = kmemdup(icmp_sysctl_table, | ||
| 345 | sizeof(icmp_sysctl_table), | ||
| 346 | GFP_KERNEL); | ||
| 347 | if (!pn->ctl_table) | ||
| 348 | return -ENOMEM; | ||
| 349 | |||
| 350 | pn->ctl_table[0].data = &in->timeout; | ||
| 351 | #endif | ||
| 352 | return 0; | ||
| 353 | } | ||
| 354 | |||
| 355 | static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, | ||
| 356 | struct nf_icmp_net *in) | ||
| 357 | { | ||
| 358 | #ifdef CONFIG_SYSCTL | ||
| 359 | #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT | ||
| 360 | pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, | ||
| 361 | sizeof(icmp_compat_sysctl_table), | ||
| 362 | GFP_KERNEL); | ||
| 363 | if (!pn->ctl_compat_table) | ||
| 364 | return -ENOMEM; | ||
| 365 | |||
| 366 | pn->ctl_compat_table[0].data = &in->timeout; | ||
| 367 | #endif | ||
| 368 | #endif | ||
| 369 | return 0; | ||
| 370 | } | ||
| 371 | |||
| 372 | static int icmp_init_net(struct net *net, u_int16_t proto) | ||
| 373 | { | ||
| 374 | int ret; | ||
| 375 | struct nf_icmp_net *in = icmp_pernet(net); | ||
| 376 | struct nf_proto_net *pn = &in->pn; | ||
| 377 | |||
| 378 | in->timeout = nf_ct_icmp_timeout; | ||
| 379 | |||
| 380 | ret = icmp_kmemdup_compat_sysctl_table(pn, in); | ||
| 381 | if (ret < 0) | ||
| 382 | return ret; | ||
| 383 | |||
| 384 | ret = icmp_kmemdup_sysctl_table(pn, in); | ||
| 385 | if (ret < 0) | ||
| 386 | nf_ct_kfree_compat_sysctl_table(pn); | ||
| 387 | |||
| 388 | return ret; | ||
| 389 | } | ||
| 390 | |||
| 391 | static struct nf_proto_net *icmp_get_net_proto(struct net *net) | ||
| 392 | { | ||
| 393 | return &net->ct.nf_ct_proto.icmp.pn; | ||
| 394 | } | ||
| 395 | |||
| 396 | struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = | 292 | struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = |
| 397 | { | 293 | { |
| 398 | .l3proto = PF_INET, | 294 | .l3proto = PF_INET, |
| @@ -402,7 +298,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = | |||
| 402 | .invert_tuple = icmp_invert_tuple, | 298 | .invert_tuple = icmp_invert_tuple, |
| 403 | .print_tuple = icmp_print_tuple, | 299 | .print_tuple = icmp_print_tuple, |
| 404 | .packet = icmp_packet, | 300 | .packet = icmp_packet, |
| 405 | .get_timeouts = icmp_get_timeouts, | ||
| 406 | .new = icmp_new, | 301 | .new = icmp_new, |
| 407 | .error = icmp_error, | 302 | .error = icmp_error, |
| 408 | .destroy = NULL, | 303 | .destroy = NULL, |
| @@ -413,15 +308,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = | |||
| 413 | .nlattr_to_tuple = icmp_nlattr_to_tuple, | 308 | .nlattr_to_tuple = icmp_nlattr_to_tuple, |
| 414 | .nla_policy = icmp_nla_policy, | 309 | .nla_policy = icmp_nla_policy, |
| 415 | #endif | 310 | #endif |
| 416 | #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) | 311 | #ifdef CONFIG_SYSCTL |
| 417 | .ctnl_timeout = { | 312 | .ctl_table_header = &icmp_sysctl_header, |
| 418 | .nlattr_to_obj = icmp_timeout_nlattr_to_obj, | 313 | .ctl_table = icmp_sysctl_table, |
| 419 | .obj_to_nlattr = icmp_timeout_obj_to_nlattr, | 314 | #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT |
| 420 | .nlattr_max = CTA_TIMEOUT_ICMP_MAX, | 315 | .ctl_compat_table = icmp_compat_sysctl_table, |
| 421 | .obj_size = sizeof(unsigned int), | 316 | #endif |
| 422 | .nla_policy = icmp_timeout_nla_policy, | 317 | #endif |
| 423 | }, | ||
| 424 | #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ | ||
| 425 | .init_net = icmp_init_net, | ||
| 426 | .get_net_proto = icmp_get_net_proto, | ||
| 427 | }; | 318 | }; |
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 742815518b0..9bb1b8a37a2 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c | |||
| @@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = { | |||
| 94 | { | 94 | { |
| 95 | .hook = ipv4_conntrack_defrag, | 95 | .hook = ipv4_conntrack_defrag, |
| 96 | .owner = THIS_MODULE, | 96 | .owner = THIS_MODULE, |
| 97 | .pf = NFPROTO_IPV4, | 97 | .pf = PF_INET, |
| 98 | .hooknum = NF_INET_PRE_ROUTING, | 98 | .hooknum = NF_INET_PRE_ROUTING, |
| 99 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | 99 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, |
| 100 | }, | 100 | }, |
| 101 | { | 101 | { |
| 102 | .hook = ipv4_conntrack_defrag, | 102 | .hook = ipv4_conntrack_defrag, |
| 103 | .owner = THIS_MODULE, | 103 | .owner = THIS_MODULE, |
| 104 | .pf = NFPROTO_IPV4, | 104 | .pf = PF_INET, |
| 105 | .hooknum = NF_INET_LOCAL_OUT, | 105 | .hooknum = NF_INET_LOCAL_OUT, |
| 106 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | 106 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, |
| 107 | }, | 107 | }, |
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9c3db10b22d..790f3160e01 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c | |||
| @@ -15,12 +15,13 @@ | |||
| 15 | 15 | ||
| 16 | #include <net/netfilter/nf_nat.h> | 16 | #include <net/netfilter/nf_nat.h> |
| 17 | #include <net/netfilter/nf_nat_helper.h> | 17 | #include <net/netfilter/nf_nat_helper.h> |
| 18 | #include <net/netfilter/nf_nat_rule.h> | ||
| 18 | #include <net/netfilter/nf_conntrack_helper.h> | 19 | #include <net/netfilter/nf_conntrack_helper.h> |
| 19 | #include <net/netfilter/nf_conntrack_expect.h> | 20 | #include <net/netfilter/nf_conntrack_expect.h> |
| 20 | #include <linux/netfilter/nf_conntrack_h323.h> | 21 | #include <linux/netfilter/nf_conntrack_h323.h> |
| 21 | 22 | ||
| 22 | /****************************************************************************/ | 23 | /****************************************************************************/ |
| 23 | static int set_addr(struct sk_buff *skb, unsigned int protoff, | 24 | static int set_addr(struct sk_buff *skb, |
| 24 | unsigned char **data, int dataoff, | 25 | unsigned char **data, int dataoff, |
| 25 | unsigned int addroff, __be32 ip, __be16 port) | 26 | unsigned int addroff, __be32 ip, __be16 port) |
| 26 | { | 27 | { |
| @@ -39,9 +40,11 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff, | |||
| 39 | 40 | ||
| 40 | if (ip_hdr(skb)->protocol == IPPROTO_TCP) { | 41 | if (ip_hdr(skb)->protocol == IPPROTO_TCP) { |
| 41 | if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, | 42 | if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, |
| 42 | protoff, addroff, sizeof(buf), | 43 | addroff, sizeof(buf), |
| 43 | (char *) &buf, sizeof(buf))) { | 44 | (char *) &buf, sizeof(buf))) { |
| 44 | net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n"); | 45 | if (net_ratelimit()) |
| 46 | pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet" | ||
| 47 | " error\n"); | ||
| 45 | return -1; | 48 | return -1; |
| 46 | } | 49 | } |
| 47 | 50 | ||
| @@ -53,9 +56,11 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff, | |||
| 53 | *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff; | 56 | *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff; |
| 54 | } else { | 57 | } else { |
| 55 | if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, | 58 | if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, |
| 56 | protoff, addroff, sizeof(buf), | 59 | addroff, sizeof(buf), |
| 57 | (char *) &buf, sizeof(buf))) { | 60 | (char *) &buf, sizeof(buf))) { |
| 58 | net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); | 61 | if (net_ratelimit()) |
| 62 | pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet" | ||
| 63 | " error\n"); | ||
| 59 | return -1; | 64 | return -1; |
| 60 | } | 65 | } |
| 61 | /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy | 66 | /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy |
| @@ -68,22 +73,22 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff, | |||
| 68 | } | 73 | } |
| 69 | 74 | ||
| 70 | /****************************************************************************/ | 75 | /****************************************************************************/ |
| 71 | static int set_h225_addr(struct sk_buff *skb, unsigned int protoff, | 76 | static int set_h225_addr(struct sk_buff *skb, |
| 72 | unsigned char **data, int dataoff, | 77 | unsigned char **data, int dataoff, |
| 73 | TransportAddress *taddr, | 78 | TransportAddress *taddr, |
| 74 | union nf_inet_addr *addr, __be16 port) | 79 | union nf_inet_addr *addr, __be16 port) |
| 75 | { | 80 | { |
| 76 | return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip, | 81 | return set_addr(skb, data, dataoff, taddr->ipAddress.ip, |
| 77 | addr->ip, port); | 82 | addr->ip, port); |
| 78 | } | 83 | } |
| 79 | 84 | ||
| 80 | /****************************************************************************/ | 85 | /****************************************************************************/ |
| 81 | static int set_h245_addr(struct sk_buff *skb, unsigned protoff, | 86 | static int set_h245_addr(struct sk_buff *skb, |
| 82 | unsigned char **data, int dataoff, | 87 | unsigned char **data, int dataoff, |
| 83 | H245_TransportAddress *taddr, | 88 | H245_TransportAddress *taddr, |
| 84 | union nf_inet_addr *addr, __be16 port) | 89 | union nf_inet_addr *addr, __be16 port) |
| 85 | { | 90 | { |
| 86 | return set_addr(skb, protoff, data, dataoff, | 91 | return set_addr(skb, data, dataoff, |
| 87 | taddr->unicastAddress.iPAddress.network, | 92 | taddr->unicastAddress.iPAddress.network, |
| 88 | addr->ip, port); | 93 | addr->ip, port); |
| 89 | } | 94 | } |
| @@ -91,10 +96,10 @@ static int set_h245_addr(struct sk_buff *skb, unsigned protoff, | |||
| 91 | /****************************************************************************/ | 96 | /****************************************************************************/ |
| 92 | static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, | 97 | static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, |
| 93 | enum ip_conntrack_info ctinfo, | 98 | enum ip_conntrack_info ctinfo, |
| 94 | unsigned int protoff, unsigned char **data, | 99 | unsigned char **data, |
| 95 | TransportAddress *taddr, int count) | 100 | TransportAddress *taddr, int count) |
| 96 | { | 101 | { |
| 97 | const struct nf_ct_h323_master *info = nfct_help_data(ct); | 102 | const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; |
| 98 | int dir = CTINFO2DIR(ctinfo); | 103 | int dir = CTINFO2DIR(ctinfo); |
| 99 | int i; | 104 | int i; |
| 100 | __be16 port; | 105 | __be16 port; |
| @@ -117,8 +122,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, | |||
| 117 | &addr.ip, port, | 122 | &addr.ip, port, |
| 118 | &ct->tuplehash[!dir].tuple.dst.u3.ip, | 123 | &ct->tuplehash[!dir].tuple.dst.u3.ip, |
| 119 | info->sig_port[!dir]); | 124 | info->sig_port[!dir]); |
| 120 | return set_h225_addr(skb, protoff, data, 0, | 125 | return set_h225_addr(skb, data, 0, &taddr[i], |
| 121 | &taddr[i], | ||
| 122 | &ct->tuplehash[!dir]. | 126 | &ct->tuplehash[!dir]. |
| 123 | tuple.dst.u3, | 127 | tuple.dst.u3, |
| 124 | info->sig_port[!dir]); | 128 | info->sig_port[!dir]); |
| @@ -129,8 +133,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, | |||
| 129 | &addr.ip, port, | 133 | &addr.ip, port, |
| 130 | &ct->tuplehash[!dir].tuple.src.u3.ip, | 134 | &ct->tuplehash[!dir].tuple.src.u3.ip, |
| 131 | info->sig_port[!dir]); | 135 | info->sig_port[!dir]); |
| 132 | return set_h225_addr(skb, protoff, data, 0, | 136 | return set_h225_addr(skb, data, 0, &taddr[i], |
| 133 | &taddr[i], | ||
| 134 | &ct->tuplehash[!dir]. | 137 | &ct->tuplehash[!dir]. |
| 135 | tuple.src.u3, | 138 | tuple.src.u3, |
| 136 | info->sig_port[!dir]); | 139 | info->sig_port[!dir]); |
| @@ -144,7 +147,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, | |||
| 144 | /****************************************************************************/ | 147 | /****************************************************************************/ |
| 145 | static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, | 148 | static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, |
| 146 | enum ip_conntrack_info ctinfo, | 149 | enum ip_conntrack_info ctinfo, |
| 147 | unsigned int protoff, unsigned char **data, | 150 | unsigned char **data, |
| 148 | TransportAddress *taddr, int count) | 151 | TransportAddress *taddr, int count) |
| 149 | { | 152 | { |
| 150 | int dir = CTINFO2DIR(ctinfo); | 153 | int dir = CTINFO2DIR(ctinfo); |
| @@ -160,7 +163,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, | |||
| 160 | &addr.ip, ntohs(port), | 163 | &addr.ip, ntohs(port), |
| 161 | &ct->tuplehash[!dir].tuple.dst.u3.ip, | 164 | &ct->tuplehash[!dir].tuple.dst.u3.ip, |
| 162 | ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); | 165 | ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); |
| 163 | return set_h225_addr(skb, protoff, data, 0, &taddr[i], | 166 | return set_h225_addr(skb, data, 0, &taddr[i], |
| 164 | &ct->tuplehash[!dir].tuple.dst.u3, | 167 | &ct->tuplehash[!dir].tuple.dst.u3, |
| 165 | ct->tuplehash[!dir].tuple. | 168 | ct->tuplehash[!dir].tuple. |
| 166 | dst.u.udp.port); | 169 | dst.u.udp.port); |
| @@ -173,13 +176,13 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, | |||
| 173 | /****************************************************************************/ | 176 | /****************************************************************************/ |
| 174 | static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, | 177 | static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, |
| 175 | enum ip_conntrack_info ctinfo, | 178 | enum ip_conntrack_info ctinfo, |
| 176 | unsigned int protoff, unsigned char **data, int dataoff, | 179 | unsigned char **data, int dataoff, |
| 177 | H245_TransportAddress *taddr, | 180 | H245_TransportAddress *taddr, |
| 178 | __be16 port, __be16 rtp_port, | 181 | __be16 port, __be16 rtp_port, |
| 179 | struct nf_conntrack_expect *rtp_exp, | 182 | struct nf_conntrack_expect *rtp_exp, |
| 180 | struct nf_conntrack_expect *rtcp_exp) | 183 | struct nf_conntrack_expect *rtcp_exp) |
| 181 | { | 184 | { |
| 182 | struct nf_ct_h323_master *info = nfct_help_data(ct); | 185 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; |
| 183 | int dir = CTINFO2DIR(ctinfo); | 186 | int dir = CTINFO2DIR(ctinfo); |
| 184 | int i; | 187 | int i; |
| 185 | u_int16_t nated_port; | 188 | u_int16_t nated_port; |
| @@ -211,7 +214,8 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, | |||
| 211 | 214 | ||
| 212 | /* Run out of expectations */ | 215 | /* Run out of expectations */ |
| 213 | if (i >= H323_RTP_CHANNEL_MAX) { | 216 | if (i >= H323_RTP_CHANNEL_MAX) { |
| 214 | net_notice_ratelimited("nf_nat_h323: out of expectations\n"); | 217 | if (net_ratelimit()) |
| 218 | pr_notice("nf_nat_h323: out of expectations\n"); | ||
| 215 | return 0; | 219 | return 0; |
| 216 | } | 220 | } |
| 217 | 221 | ||
| @@ -240,12 +244,13 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, | |||
| 240 | } | 244 | } |
| 241 | 245 | ||
| 242 | if (nated_port == 0) { /* No port available */ | 246 | if (nated_port == 0) { /* No port available */ |
| 243 | net_notice_ratelimited("nf_nat_h323: out of RTP ports\n"); | 247 | if (net_ratelimit()) |
| 248 | pr_notice("nf_nat_h323: out of RTP ports\n"); | ||
| 244 | return 0; | 249 | return 0; |
| 245 | } | 250 | } |
| 246 | 251 | ||
| 247 | /* Modify signal */ | 252 | /* Modify signal */ |
| 248 | if (set_h245_addr(skb, protoff, data, dataoff, taddr, | 253 | if (set_h245_addr(skb, data, dataoff, taddr, |
| 249 | &ct->tuplehash[!dir].tuple.dst.u3, | 254 | &ct->tuplehash[!dir].tuple.dst.u3, |
| 250 | htons((port & htons(1)) ? nated_port + 1 : | 255 | htons((port & htons(1)) ? nated_port + 1 : |
| 251 | nated_port)) == 0) { | 256 | nated_port)) == 0) { |
| @@ -276,7 +281,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, | |||
| 276 | /****************************************************************************/ | 281 | /****************************************************************************/ |
| 277 | static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, | 282 | static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, |
| 278 | enum ip_conntrack_info ctinfo, | 283 | enum ip_conntrack_info ctinfo, |
| 279 | unsigned int protoff, unsigned char **data, int dataoff, | 284 | unsigned char **data, int dataoff, |
| 280 | H245_TransportAddress *taddr, __be16 port, | 285 | H245_TransportAddress *taddr, __be16 port, |
| 281 | struct nf_conntrack_expect *exp) | 286 | struct nf_conntrack_expect *exp) |
| 282 | { | 287 | { |
| @@ -303,12 +308,13 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, | |||
| 303 | } | 308 | } |
| 304 | 309 | ||
| 305 | if (nated_port == 0) { /* No port available */ | 310 | if (nated_port == 0) { /* No port available */ |
| 306 | net_notice_ratelimited("nf_nat_h323: out of TCP ports\n"); | 311 | if (net_ratelimit()) |
| 312 | pr_notice("nf_nat_h323: out of TCP ports\n"); | ||
| 307 | return 0; | 313 | return 0; |
| 308 | } | 314 | } |
| 309 | 315 | ||
| 310 | /* Modify signal */ | 316 | /* Modify signal */ |
| 311 | if (set_h245_addr(skb, protoff, data, dataoff, taddr, | 317 | if (set_h245_addr(skb, data, dataoff, taddr, |
| 312 | &ct->tuplehash[!dir].tuple.dst.u3, | 318 | &ct->tuplehash[!dir].tuple.dst.u3, |
| 313 | htons(nated_port)) < 0) { | 319 | htons(nated_port)) < 0) { |
| 314 | nf_ct_unexpect_related(exp); | 320 | nf_ct_unexpect_related(exp); |
| @@ -327,11 +333,11 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, | |||
| 327 | /****************************************************************************/ | 333 | /****************************************************************************/ |
| 328 | static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, | 334 | static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, |
| 329 | enum ip_conntrack_info ctinfo, | 335 | enum ip_conntrack_info ctinfo, |
| 330 | unsigned int protoff, unsigned char **data, int dataoff, | 336 | unsigned char **data, int dataoff, |
| 331 | TransportAddress *taddr, __be16 port, | 337 | TransportAddress *taddr, __be16 port, |
| 332 | struct nf_conntrack_expect *exp) | 338 | struct nf_conntrack_expect *exp) |
| 333 | { | 339 | { |
| 334 | struct nf_ct_h323_master *info = nfct_help_data(ct); | 340 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; |
| 335 | int dir = CTINFO2DIR(ctinfo); | 341 | int dir = CTINFO2DIR(ctinfo); |
| 336 | u_int16_t nated_port = ntohs(port); | 342 | u_int16_t nated_port = ntohs(port); |
| 337 | 343 | ||
| @@ -359,12 +365,13 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, | |||
| 359 | } | 365 | } |
| 360 | 366 | ||
| 361 | if (nated_port == 0) { /* No port available */ | 367 | if (nated_port == 0) { /* No port available */ |
| 362 | net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); | 368 | if (net_ratelimit()) |
| 369 | pr_notice("nf_nat_q931: out of TCP ports\n"); | ||
| 363 | return 0; | 370 | return 0; |
| 364 | } | 371 | } |
| 365 | 372 | ||
| 366 | /* Modify signal */ | 373 | /* Modify signal */ |
| 367 | if (set_h225_addr(skb, protoff, data, dataoff, taddr, | 374 | if (set_h225_addr(skb, data, dataoff, taddr, |
| 368 | &ct->tuplehash[!dir].tuple.dst.u3, | 375 | &ct->tuplehash[!dir].tuple.dst.u3, |
| 369 | htons(nated_port)) == 0) { | 376 | htons(nated_port)) == 0) { |
| 370 | /* Save ports */ | 377 | /* Save ports */ |
| @@ -402,27 +409,25 @@ static void ip_nat_q931_expect(struct nf_conn *new, | |||
| 402 | BUG_ON(new->status & IPS_NAT_DONE_MASK); | 409 | BUG_ON(new->status & IPS_NAT_DONE_MASK); |
| 403 | 410 | ||
| 404 | /* Change src to where master sends to */ | 411 | /* Change src to where master sends to */ |
| 405 | range.flags = NF_NAT_RANGE_MAP_IPS; | 412 | range.flags = IP_NAT_RANGE_MAP_IPS; |
| 406 | range.min_addr = range.max_addr = | 413 | range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; |
| 407 | new->tuplehash[!this->dir].tuple.src.u3; | 414 | nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); |
| 408 | nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); | ||
| 409 | 415 | ||
| 410 | /* For DST manip, map port here to where it's expected. */ | 416 | /* For DST manip, map port here to where it's expected. */ |
| 411 | range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); | 417 | range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); |
| 412 | range.min_proto = range.max_proto = this->saved_proto; | 418 | range.min = range.max = this->saved_proto; |
| 413 | range.min_addr = range.max_addr = | 419 | range.min_ip = range.max_ip = |
| 414 | new->master->tuplehash[!this->dir].tuple.src.u3; | 420 | new->master->tuplehash[!this->dir].tuple.src.u3.ip; |
| 415 | nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); | 421 | nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); |
| 416 | } | 422 | } |
| 417 | 423 | ||
| 418 | /****************************************************************************/ | 424 | /****************************************************************************/ |
| 419 | static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, | 425 | static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, |
| 420 | enum ip_conntrack_info ctinfo, | 426 | enum ip_conntrack_info ctinfo, |
| 421 | unsigned int protoff, unsigned char **data, | 427 | unsigned char **data, TransportAddress *taddr, int idx, |
| 422 | TransportAddress *taddr, int idx, | ||
| 423 | __be16 port, struct nf_conntrack_expect *exp) | 428 | __be16 port, struct nf_conntrack_expect *exp) |
| 424 | { | 429 | { |
| 425 | struct nf_ct_h323_master *info = nfct_help_data(ct); | 430 | struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; |
| 426 | int dir = CTINFO2DIR(ctinfo); | 431 | int dir = CTINFO2DIR(ctinfo); |
| 427 | u_int16_t nated_port = ntohs(port); | 432 | u_int16_t nated_port = ntohs(port); |
| 428 | union nf_inet_addr addr; | 433 | union nf_inet_addr addr; |
| @@ -451,12 +456,13 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, | |||
| 451 | } | 456 | } |
| 452 | 457 | ||
| 453 | if (nated_port == 0) { /* No port available */ | 458 | if (nated_port == 0) { /* No port available */ |
| 454 | net_notice_ratelimited("nf_nat_ras: out of TCP ports\n"); | 459 | if (net_ratelimit()) |
| 460 | pr_notice("nf_nat_ras: out of TCP ports\n"); | ||
| 455 | return 0; | 461 | return 0; |
| 456 | } | 462 | } |
| 457 | 463 | ||
| 458 | /* Modify signal */ | 464 | /* Modify signal */ |
| 459 | if (set_h225_addr(skb, protoff, data, 0, &taddr[idx], | 465 | if (set_h225_addr(skb, data, 0, &taddr[idx], |
| 460 | &ct->tuplehash[!dir].tuple.dst.u3, | 466 | &ct->tuplehash[!dir].tuple.dst.u3, |
| 461 | htons(nated_port)) == 0) { | 467 | htons(nated_port)) == 0) { |
| 462 | /* Save ports */ | 468 | /* Save ports */ |
| @@ -467,7 +473,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, | |||
| 467 | if (idx > 0 && | 473 | if (idx > 0 && |
| 468 | get_h225_addr(ct, *data, &taddr[0], &addr, &port) && | 474 | get_h225_addr(ct, *data, &taddr[0], &addr, &port) && |
| 469 | (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { | 475 | (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { |
| 470 | set_h225_addr(skb, protoff, data, 0, &taddr[0], | 476 | set_h225_addr(skb, data, 0, &taddr[0], |
| 471 | &ct->tuplehash[!dir].tuple.dst.u3, | 477 | &ct->tuplehash[!dir].tuple.dst.u3, |
| 472 | info->sig_port[!dir]); | 478 | info->sig_port[!dir]); |
| 473 | } | 479 | } |
| @@ -496,22 +502,20 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new, | |||
| 496 | BUG_ON(new->status & IPS_NAT_DONE_MASK); | 502 | BUG_ON(new->status & IPS_NAT_DONE_MASK); |
| 497 | 503 | ||
| 498 | /* Change src to where master sends to */ | 504 | /* Change src to where master sends to */ |
| 499 | range.flags = NF_NAT_RANGE_MAP_IPS; | 505 | range.flags = IP_NAT_RANGE_MAP_IPS; |
| 500 | range.min_addr = range.max_addr = | 506 | range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; |
| 501 | new->tuplehash[!this->dir].tuple.src.u3; | 507 | nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); |
| 502 | nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); | ||
| 503 | 508 | ||
| 504 | /* For DST manip, map port here to where it's expected. */ | 509 | /* For DST manip, map port here to where it's expected. */ |
| 505 | range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); | 510 | range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); |
| 506 | range.min_proto = range.max_proto = this->saved_proto; | 511 | range.min = range.max = this->saved_proto; |
| 507 | range.min_addr = range.max_addr = this->saved_addr; | 512 | range.min_ip = range.max_ip = this->saved_ip; |
| 508 | nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); | 513 | nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); |
| 509 | } | 514 | } |
| 510 | 515 | ||
| 511 | /****************************************************************************/ | 516 | /****************************************************************************/ |
| 512 | static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, | 517 | static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, |
| 513 | enum ip_conntrack_info ctinfo, | 518 | enum ip_conntrack_info ctinfo, |
| 514 | unsigned int protoff, | ||
| 515 | unsigned char **data, int dataoff, | 519 | unsigned char **data, int dataoff, |
| 516 | TransportAddress *taddr, __be16 port, | 520 | TransportAddress *taddr, __be16 port, |
| 517 | struct nf_conntrack_expect *exp) | 521 | struct nf_conntrack_expect *exp) |
| @@ -520,7 +524,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, | |||
| 520 | u_int16_t nated_port; | 524 | u_int16_t nated_port; |
| 521 | 525 | ||
| 522 | /* Set expectations for NAT */ | 526 | /* Set expectations for NAT */ |
| 523 | exp->saved_addr = exp->tuple.dst.u3; | 527 | exp->saved_ip = exp->tuple.dst.u3.ip; |
| 524 | exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; | 528 | exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; |
| 525 | exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; | 529 | exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; |
| 526 | exp->expectfn = ip_nat_callforwarding_expect; | 530 | exp->expectfn = ip_nat_callforwarding_expect; |
| @@ -541,12 +545,13 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, | |||
| 541 | } | 545 | } |
| 542 | 546 | ||
| 543 | if (nated_port == 0) { /* No port available */ | 547 | if (nated_port == 0) { /* No port available */ |
| 544 | net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); | 548 | if (net_ratelimit()) |
| 549 | pr_notice("nf_nat_q931: out of TCP ports\n"); | ||
| 545 | return 0; | 550 | return 0; |
| 546 | } | 551 | } |
| 547 | 552 | ||
| 548 | /* Modify signal */ | 553 | /* Modify signal */ |
| 549 | if (!set_h225_addr(skb, protoff, data, dataoff, taddr, | 554 | if (!set_h225_addr(skb, data, dataoff, taddr, |
| 550 | &ct->tuplehash[!dir].tuple.dst.u3, | 555 | &ct->tuplehash[!dir].tuple.dst.u3, |
| 551 | htons(nated_port)) == 0) { | 556 | htons(nated_port)) == 0) { |
| 552 | nf_ct_unexpect_related(exp); | 557 | nf_ct_unexpect_related(exp); |
| @@ -563,16 +568,6 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, | |||
| 563 | return 0; | 568 | return 0; |
| 564 | } | 569 | } |
| 565 | 570 | ||
| 566 | static struct nf_ct_helper_expectfn q931_nat = { | ||
| 567 | .name = "Q.931", | ||
| 568 | .expectfn = ip_nat_q931_expect, | ||
| 569 | }; | ||
| 570 | |||
| 571 | static struct nf_ct_helper_expectfn callforwarding_nat = { | ||
| 572 | .name = "callforwarding", | ||
| 573 | .expectfn = ip_nat_callforwarding_expect, | ||
| 574 | }; | ||
| 575 | |||
| 576 | /****************************************************************************/ | 571 | /****************************************************************************/ |
| 577 | static int __init init(void) | 572 | static int __init init(void) |
| 578 | { | 573 | { |
| @@ -586,34 +581,30 @@ static int __init init(void) | |||
| 586 | BUG_ON(nat_callforwarding_hook != NULL); | 581 | BUG_ON(nat_callforwarding_hook != NULL); |
| 587 | BUG_ON(nat_q931_hook != NULL); | 582 | BUG_ON(nat_q931_hook != NULL); |
| 588 | 583 | ||
| 589 | RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr); | 584 | rcu_assign_pointer(set_h245_addr_hook, set_h245_addr); |
| 590 | RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr); | 585 | rcu_assign_pointer(set_h225_addr_hook, set_h225_addr); |
| 591 | RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr); | 586 | rcu_assign_pointer(set_sig_addr_hook, set_sig_addr); |
| 592 | RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr); | 587 | rcu_assign_pointer(set_ras_addr_hook, set_ras_addr); |
| 593 | RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp); | 588 | rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp); |
| 594 | RCU_INIT_POINTER(nat_t120_hook, nat_t120); | 589 | rcu_assign_pointer(nat_t120_hook, nat_t120); |
| 595 | RCU_INIT_POINTER(nat_h245_hook, nat_h245); | 590 | rcu_assign_pointer(nat_h245_hook, nat_h245); |
| 596 | RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding); | 591 | rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding); |
| 597 | RCU_INIT_POINTER(nat_q931_hook, nat_q931); | 592 | rcu_assign_pointer(nat_q931_hook, nat_q931); |
| 598 | nf_ct_helper_expectfn_register(&q931_nat); | ||
| 599 | nf_ct_helper_expectfn_register(&callforwarding_nat); | ||
| 600 | return 0; | 593 | return 0; |
| 601 | } | 594 | } |
| 602 | 595 | ||
| 603 | /****************************************************************************/ | 596 | /****************************************************************************/ |
| 604 | static void __exit fini(void) | 597 | static void __exit fini(void) |
| 605 | { | 598 | { |
| 606 | RCU_INIT_POINTER(set_h245_addr_hook, NULL); | 599 | rcu_assign_pointer(set_h245_addr_hook, NULL); |
| 607 | RCU_INIT_POINTER(set_h225_addr_hook, NULL); | 600 | rcu_assign_pointer(set_h225_addr_hook, NULL); |
| 608 | RCU_INIT_POINTER(set_sig_addr_hook, NULL); | 601 | rcu_assign_pointer(set_sig_addr_hook, NULL); |
| 609 | RCU_INIT_POINTER(set_ras_addr_hook, NULL); | 602 | rcu_assign_pointer(set_ras_addr_hook, NULL); |
| 610 | RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL); | 603 | rcu_assign_pointer(nat_rtp_rtcp_hook, NULL); |
| 611 | RCU_INIT_POINTER(nat_t120_hook, NULL); | 604 | rcu_assign_pointer(nat_t120_hook, NULL); |
| 612 | RCU_INIT_POINTER(nat_h245_hook, NULL); | 605 | rcu_assign_pointer(nat_h245_hook, NULL); |
| 613 | RCU_INIT_POINTER(nat_callforwarding_hook, NULL); | 606 | rcu_assign_pointer(nat_callforwarding_hook, NULL); |
| 614 | RCU_INIT_POINTER(nat_q931_hook, NULL); | 607 | rcu_assign_pointer(nat_q931_hook, NULL); |
| 615 | nf_ct_helper_expectfn_unregister(&q931_nat); | ||
| 616 | nf_ct_helper_expectfn_unregister(&callforwarding_nat); | ||
| 617 | synchronize_rcu(); | 608 | synchronize_rcu(); |
| 618 | } | 609 | } |
| 619 | 610 | ||
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c deleted file mode 100644 index d8b2e14efdd..00000000000 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ /dev/null | |||
| @@ -1,281 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * (C) 1999-2001 Paul `Rusty' Russell | ||
| 3 | * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> | ||
| 4 | * (C) 2011 Patrick McHardy <kaber@trash.net> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/types.h> | ||
| 12 | #include <linux/module.h> | ||
| 13 | #include <linux/skbuff.h> | ||
| 14 | #include <linux/ip.h> | ||
| 15 | #include <linux/icmp.h> | ||
| 16 | #include <linux/netfilter.h> | ||
| 17 | #include <linux/netfilter_ipv4.h> | ||
| 18 | #include <net/secure_seq.h> | ||
| 19 | #include <net/checksum.h> | ||
| 20 | #include <net/route.h> | ||
| 21 | #include <net/ip.h> | ||
| 22 | |||
| 23 | #include <net/netfilter/nf_conntrack_core.h> | ||
| 24 | #include <net/netfilter/nf_conntrack.h> | ||
| 25 | #include <net/netfilter/nf_nat_core.h> | ||
| 26 | #include <net/netfilter/nf_nat_l3proto.h> | ||
| 27 | #include <net/netfilter/nf_nat_l4proto.h> | ||
| 28 | |||
| 29 | static const struct nf_nat_l3proto nf_nat_l3proto_ipv4; | ||
| 30 | |||
| 31 | #ifdef CONFIG_XFRM | ||
| 32 | static void nf_nat_ipv4_decode_session(struct sk_buff *skb, | ||
| 33 | const struct nf_conn *ct, | ||
| 34 | enum ip_conntrack_dir dir, | ||
| 35 | unsigned long statusbit, | ||
| 36 | struct flowi *fl) | ||
| 37 | { | ||
| 38 | const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; | ||
| 39 | struct flowi4 *fl4 = &fl->u.ip4; | ||
| 40 | |||
| 41 | if (ct->status & statusbit) { | ||
| 42 | fl4->daddr = t->dst.u3.ip; | ||
| 43 | if (t->dst.protonum == IPPROTO_TCP || | ||
| 44 | t->dst.protonum == IPPROTO_UDP || | ||
| 45 | t->dst.protonum == IPPROTO_UDPLITE || | ||
| 46 | t->dst.protonum == IPPROTO_DCCP || | ||
| 47 | t->dst.protonum == IPPROTO_SCTP) | ||
| 48 | fl4->fl4_dport = t->dst.u.all; | ||
| 49 | } | ||
| 50 | |||
| 51 | statusbit ^= IPS_NAT_MASK; | ||
| 52 | |||
| 53 | if (ct->status & statusbit) { | ||
| 54 | fl4->saddr = t->src.u3.ip; | ||
| 55 | if (t->dst.protonum == IPPROTO_TCP || | ||
| 56 | t->dst.protonum == IPPROTO_UDP || | ||
| 57 | t->dst.protonum == IPPROTO_UDPLITE || | ||
| 58 | t->dst.protonum == IPPROTO_DCCP || | ||
| 59 | t->dst.protonum == IPPROTO_SCTP) | ||
| 60 | fl4->fl4_sport = t->src.u.all; | ||
| 61 | } | ||
| 62 | } | ||
| 63 | #endif /* CONFIG_XFRM */ | ||
| 64 | |||
| 65 | static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, | ||
| 66 | const struct nf_nat_range *range) | ||
| 67 | { | ||
| 68 | return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && | ||
| 69 | ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); | ||
| 70 | } | ||
| 71 | |||
| 72 | static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t, | ||
| 73 | __be16 dport) | ||
| 74 | { | ||
| 75 | return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport); | ||
| 76 | } | ||
| 77 | |||
| 78 | static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, | ||
| 79 | unsigned int iphdroff, | ||
| 80 | const struct nf_nat_l4proto *l4proto, | ||
| 81 | const struct nf_conntrack_tuple *target, | ||
| 82 | enum nf_nat_manip_type maniptype) | ||
| 83 | { | ||
| 84 | struct iphdr *iph; | ||
| 85 | unsigned int hdroff; | ||
| 86 | |||
| 87 | if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) | ||
| 88 | return false; | ||
| 89 | |||
| 90 | iph = (void *)skb->data + iphdroff; | ||
| 91 | hdroff = iphdroff + iph->ihl * 4; | ||
| 92 | |||
| 93 | if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff, | ||
| 94 | target, maniptype)) | ||
| 95 | return false; | ||
| 96 | iph = (void *)skb->data + iphdroff; | ||
| 97 | |||
| 98 | if (maniptype == NF_NAT_MANIP_SRC) { | ||
| 99 | csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); | ||
| 100 | iph->saddr = target->src.u3.ip; | ||
| 101 | } else { | ||
| 102 | csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); | ||
| 103 | iph->daddr = target->dst.u3.ip; | ||
| 104 | } | ||
| 105 | return true; | ||
| 106 | } | ||
| 107 | |||
| 108 | static void nf_nat_ipv4_csum_update(struct sk_buff *skb, | ||
| 109 | unsigned int iphdroff, __sum16 *check, | ||
| 110 | const struct nf_conntrack_tuple *t, | ||
| 111 | enum nf_nat_manip_type maniptype) | ||
| 112 | { | ||
| 113 | struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); | ||
| 114 | __be32 oldip, newip; | ||
| 115 | |||
| 116 | if (maniptype == NF_NAT_MANIP_SRC) { | ||
| 117 | oldip = iph->saddr; | ||
| 118 | newip = t->src.u3.ip; | ||
| 119 | } else { | ||
| 120 | oldip = iph->daddr; | ||
| 121 | newip = t->dst.u3.ip; | ||
| 122 | } | ||
| 123 | inet_proto_csum_replace4(check, skb, oldip, newip, 1); | ||
| 124 | } | ||
| 125 | |||
| 126 | static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, | ||
| 127 | u8 proto, void *data, __sum16 *check, | ||
| 128 | int datalen, int oldlen) | ||
| 129 | { | ||
| 130 | const struct iphdr *iph = ip_hdr(skb); | ||
| 131 | struct rtable *rt = skb_rtable(skb); | ||
| 132 | |||
| 133 | if (skb->ip_summed != CHECKSUM_PARTIAL) { | ||
| 134 | if (!(rt->rt_flags & RTCF_LOCAL) && | ||
| 135 | (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { | ||
| 136 | skb->ip_summed = CHECKSUM_PARTIAL; | ||
| 137 | skb->csum_start = skb_headroom(skb) + | ||
| 138 | skb_network_offset(skb) + | ||
| 139 | ip_hdrlen(skb); | ||
| 140 | skb->csum_offset = (void *)check - data; | ||
| 141 | *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, | ||
| 142 | datalen, proto, 0); | ||
| 143 | } else { | ||
| 144 | *check = 0; | ||
| 145 | *check = csum_tcpudp_magic(iph->saddr, iph->daddr, | ||
| 146 | datalen, proto, | ||
| 147 | csum_partial(data, datalen, | ||
| 148 | 0)); | ||
| 149 | if (proto == IPPROTO_UDP && !*check) | ||
| 150 | *check = CSUM_MANGLED_0; | ||
| 151 | } | ||
| 152 | } else | ||
| 153 | inet_proto_csum_replace2(check, skb, | ||
| 154 | htons(oldlen), htons(datalen), 1); | ||
| 155 | } | ||
| 156 | |||
| 157 | static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], | ||
| 158 | struct nf_nat_range *range) | ||
| 159 | { | ||
| 160 | if (tb[CTA_NAT_V4_MINIP]) { | ||
| 161 | range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); | ||
| 162 | range->flags |= NF_NAT_RANGE_MAP_IPS; | ||
| 163 | } | ||
| 164 | |||
| 165 | if (tb[CTA_NAT_V4_MAXIP]) | ||
| 166 | range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); | ||
| 167 | else | ||
| 168 | range->max_addr.ip = range->min_addr.ip; | ||
| 169 | |||
| 170 | return 0; | ||
| 171 | } | ||
| 172 | |||
| 173 | static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { | ||
| 174 | .l3proto = NFPROTO_IPV4, | ||
| 175 | .in_range = nf_nat_ipv4_in_range, | ||
| 176 | .secure_port = nf_nat_ipv4_secure_port, | ||
| 177 | .manip_pkt = nf_nat_ipv4_manip_pkt, | ||
| 178 | .csum_update = nf_nat_ipv4_csum_update, | ||
| 179 | .csum_recalc = nf_nat_ipv4_csum_recalc, | ||
| 180 | .nlattr_to_range = nf_nat_ipv4_nlattr_to_range, | ||
| 181 | #ifdef CONFIG_XFRM | ||
| 182 | .decode_session = nf_nat_ipv4_decode_session, | ||
| 183 | #endif | ||
| 184 | }; | ||
| 185 | |||
| 186 | int nf_nat_icmp_reply_translation(struct sk_buff *skb, | ||
| 187 | struct nf_conn *ct, | ||
| 188 | enum ip_conntrack_info ctinfo, | ||
| 189 | unsigned int hooknum) | ||
| 190 | { | ||
| 191 | struct { | ||
| 192 | struct icmphdr icmp; | ||
| 193 | struct iphdr ip; | ||
| 194 | } *inside; | ||
| 195 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
| 196 | enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); | ||
| 197 | unsigned int hdrlen = ip_hdrlen(skb); | ||
| 198 | const struct nf_nat_l4proto *l4proto; | ||
| 199 | struct nf_conntrack_tuple target; | ||
| 200 | unsigned long statusbit; | ||
| 201 | |||
| 202 | NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY); | ||
| 203 | |||
| 204 | if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) | ||
| 205 | return 0; | ||
| 206 | if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) | ||
| 207 | return 0; | ||
| 208 | |||
| 209 | inside = (void *)skb->data + hdrlen; | ||
| 210 | if (inside->icmp.type == ICMP_REDIRECT) { | ||
| 211 | if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) | ||
| 212 | return 0; | ||
| 213 | if (ct->status & IPS_NAT_MASK) | ||
| 214 | return 0; | ||
| 215 | } | ||
| 216 | |||
| 217 | if (manip == NF_NAT_MANIP_SRC) | ||
| 218 | statusbit = IPS_SRC_NAT; | ||
| 219 | else | ||
| 220 | statusbit = IPS_DST_NAT; | ||
| 221 | |||
| 222 | /* Invert if this is reply direction */ | ||
| 223 | if (dir == IP_CT_DIR_REPLY) | ||
| 224 | statusbit ^= IPS_NAT_MASK; | ||
| 225 | |||
| 226 | if (!(ct->status & statusbit)) | ||
| 227 | return 1; | ||
| 228 | |||
| 229 | l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol); | ||
| 230 | if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), | ||
| 231 | l4proto, &ct->tuplehash[!dir].tuple, !manip)) | ||
| 232 | return 0; | ||
| 233 | |||
| 234 | if (skb->ip_summed != CHECKSUM_PARTIAL) { | ||
| 235 | /* Reloading "inside" here since manip_pkt may reallocate */ | ||
| 236 | inside = (void *)skb->data + hdrlen; | ||
| 237 | inside->icmp.checksum = 0; | ||
| 238 | inside->icmp.checksum = | ||
| 239 | csum_fold(skb_checksum(skb, hdrlen, | ||
| 240 | skb->len - hdrlen, 0)); | ||
| 241 | } | ||
| 242 | |||
| 243 | /* Change outer to look like the reply to an incoming packet */ | ||
| 244 | nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); | ||
| 245 | l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0); | ||
| 246 | if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip)) | ||
| 247 | return 0; | ||
| 248 | |||
| 249 | return 1; | ||
| 250 | } | ||
| 251 | EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); | ||
| 252 | |||
| 253 | static int __init nf_nat_l3proto_ipv4_init(void) | ||
| 254 | { | ||
| 255 | int err; | ||
| 256 | |||
| 257 | err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp); | ||
| 258 | if (err < 0) | ||
| 259 | goto err1; | ||
| 260 | err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); | ||
| 261 | if (err < 0) | ||
| 262 | goto err2; | ||
| 263 | return err; | ||
| 264 | |||
| 265 | err2: | ||
| 266 | nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); | ||
| 267 | err1: | ||
| 268 | return err; | ||
| 269 | } | ||
| 270 | |||
| 271 | static void __exit nf_nat_l3proto_ipv4_exit(void) | ||
| 272 | { | ||
| 273 | nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4); | ||
| 274 | nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); | ||
| 275 | } | ||
| 276 | |||
| 277 | MODULE_LICENSE("GPL"); | ||
| 278 | MODULE_ALIAS("nf-nat-" __stringify(AF_INET)); | ||
| 279 | |||
| 280 | module_init(nf_nat_l3proto_ipv4_init); | ||
| 281 | module_exit(nf_nat_l3proto_ipv4_exit); | ||
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index a06d7d74817..4c060038d29 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | 22 | ||
| 23 | #include <net/netfilter/nf_nat.h> | 23 | #include <net/netfilter/nf_nat.h> |
| 24 | #include <net/netfilter/nf_nat_helper.h> | 24 | #include <net/netfilter/nf_nat_helper.h> |
| 25 | #include <net/netfilter/nf_nat_rule.h> | ||
| 25 | #include <net/netfilter/nf_conntrack_helper.h> | 26 | #include <net/netfilter/nf_conntrack_helper.h> |
| 26 | #include <net/netfilter/nf_conntrack_expect.h> | 27 | #include <net/netfilter/nf_conntrack_expect.h> |
| 27 | #include <net/netfilter/nf_conntrack_zones.h> | 28 | #include <net/netfilter/nf_conntrack_zones.h> |
| @@ -48,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct, | |||
| 48 | const struct nf_nat_pptp *nat_pptp_info; | 49 | const struct nf_nat_pptp *nat_pptp_info; |
| 49 | struct nf_nat_range range; | 50 | struct nf_nat_range range; |
| 50 | 51 | ||
| 51 | ct_pptp_info = nfct_help_data(master); | 52 | ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; |
| 52 | nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; | 53 | nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; |
| 53 | 54 | ||
| 54 | /* And here goes the grand finale of corrosion... */ | 55 | /* And here goes the grand finale of corrosion... */ |
| @@ -87,24 +88,24 @@ static void pptp_nat_expected(struct nf_conn *ct, | |||
| 87 | BUG_ON(ct->status & IPS_NAT_DONE_MASK); | 88 | BUG_ON(ct->status & IPS_NAT_DONE_MASK); |
| 88 | 89 | ||
| 89 | /* Change src to where master sends to */ | 90 | /* Change src to where master sends to */ |
| 90 | range.flags = NF_NAT_RANGE_MAP_IPS; | 91 | range.flags = IP_NAT_RANGE_MAP_IPS; |
| 91 | range.min_addr = range.max_addr | 92 | range.min_ip = range.max_ip |
| 92 | = ct->master->tuplehash[!exp->dir].tuple.dst.u3; | 93 | = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; |
| 93 | if (exp->dir == IP_CT_DIR_ORIGINAL) { | 94 | if (exp->dir == IP_CT_DIR_ORIGINAL) { |
| 94 | range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; | 95 | range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; |
| 95 | range.min_proto = range.max_proto = exp->saved_proto; | 96 | range.min = range.max = exp->saved_proto; |
| 96 | } | 97 | } |
| 97 | nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); | 98 | nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); |
| 98 | 99 | ||
| 99 | /* For DST manip, map port here to where it's expected. */ | 100 | /* For DST manip, map port here to where it's expected. */ |
| 100 | range.flags = NF_NAT_RANGE_MAP_IPS; | 101 | range.flags = IP_NAT_RANGE_MAP_IPS; |
| 101 | range.min_addr = range.max_addr | 102 | range.min_ip = range.max_ip |
| 102 | = ct->master->tuplehash[!exp->dir].tuple.src.u3; | 103 | = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; |
| 103 | if (exp->dir == IP_CT_DIR_REPLY) { | 104 | if (exp->dir == IP_CT_DIR_REPLY) { |
| 104 | range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; | 105 | range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; |
| 105 | range.min_proto = range.max_proto = exp->saved_proto; | 106 | range.min = range.max = exp->saved_proto; |
| 106 | } | 107 | } |
| 107 | nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); | 108 | nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); |
| 108 | } | 109 | } |
| 109 | 110 | ||
| 110 | /* outbound packets == from PNS to PAC */ | 111 | /* outbound packets == from PNS to PAC */ |
| @@ -112,7 +113,6 @@ static int | |||
| 112 | pptp_outbound_pkt(struct sk_buff *skb, | 113 | pptp_outbound_pkt(struct sk_buff *skb, |
| 113 | struct nf_conn *ct, | 114 | struct nf_conn *ct, |
| 114 | enum ip_conntrack_info ctinfo, | 115 | enum ip_conntrack_info ctinfo, |
| 115 | unsigned int protoff, | ||
| 116 | struct PptpControlHeader *ctlh, | 116 | struct PptpControlHeader *ctlh, |
| 117 | union pptp_ctrl_union *pptpReq) | 117 | union pptp_ctrl_union *pptpReq) |
| 118 | 118 | ||
| @@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb, | |||
| 123 | __be16 new_callid; | 123 | __be16 new_callid; |
| 124 | unsigned int cid_off; | 124 | unsigned int cid_off; |
| 125 | 125 | ||
| 126 | ct_pptp_info = nfct_help_data(ct); | 126 | ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; |
| 127 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; | 127 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; |
| 128 | 128 | ||
| 129 | new_callid = ct_pptp_info->pns_call_id; | 129 | new_callid = ct_pptp_info->pns_call_id; |
| @@ -175,7 +175,7 @@ pptp_outbound_pkt(struct sk_buff *skb, | |||
| 175 | ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); | 175 | ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); |
| 176 | 176 | ||
| 177 | /* mangle packet */ | 177 | /* mangle packet */ |
| 178 | if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, | 178 | if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, |
| 179 | cid_off + sizeof(struct pptp_pkt_hdr) + | 179 | cid_off + sizeof(struct pptp_pkt_hdr) + |
| 180 | sizeof(struct PptpControlHeader), | 180 | sizeof(struct PptpControlHeader), |
| 181 | sizeof(new_callid), (char *)&new_callid, | 181 | sizeof(new_callid), (char *)&new_callid, |
| @@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig, | |||
| 192 | struct nf_ct_pptp_master *ct_pptp_info; | 192 | struct nf_ct_pptp_master *ct_pptp_info; |
| 193 | struct nf_nat_pptp *nat_pptp_info; | 193 | struct nf_nat_pptp *nat_pptp_info; |
| 194 | 194 | ||
| 195 | ct_pptp_info = nfct_help_data(ct); | 195 | ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; |
| 196 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; | 196 | nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; |
| 197 | 197 | ||
| 198 | /* save original PAC call ID in nat_info */ | 198 | /* save original PAC call ID in nat_info */ |
| @@ -216,7 +216,6 @@ static int | |||
| 216 | pptp_inbound_pkt(struct sk_buff *skb, | 216 | pptp_inbound_pkt(struct sk_buff *skb, |
| 217 | struct nf_conn *ct, | 217 | struct nf_conn *ct, |
| 218 | enum ip_conntrack_info ctinfo, | 218 | enum ip_conntrack_info ctinfo, |
| 219 | unsigned int protoff, | ||
| 220 | struct PptpControlHeader *ctlh, | 219 | struct PptpControlHeader *ctlh, |
| 221 | union pptp_ctrl_union *pptpReq) | 220 | union pptp_ctrl_union *pptpReq) |
| 222 | { | 221 | { |
| @@ -269,7 +268,7 @@ pptp_inbound_pkt(struct sk_buff *skb, | |||
| 269 | pr_debug("altering peer call id from 0x%04x to 0x%04x\n", | 268 | pr_debug("altering peer call id from 0x%04x to 0x%04x\n", |
| 270 | ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); | 269 | ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); |
| 271 | 270 | ||
| 272 | if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, | 271 | if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, |
| 273 | pcid_off + sizeof(struct pptp_pkt_hdr) + | 272 | pcid_off + sizeof(struct pptp_pkt_hdr) + |
| 274 | sizeof(struct PptpControlHeader), | 273 | sizeof(struct PptpControlHeader), |
| 275 | sizeof(new_pcid), (char *)&new_pcid, | 274 | sizeof(new_pcid), (char *)&new_pcid, |
| @@ -283,25 +282,25 @@ static int __init nf_nat_helper_pptp_init(void) | |||
| 283 | nf_nat_need_gre(); | 282 | nf_nat_need_gre(); |
| 284 | 283 | ||
| 285 | BUG_ON(nf_nat_pptp_hook_outbound != NULL); | 284 | BUG_ON(nf_nat_pptp_hook_outbound != NULL); |
| 286 | RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); | 285 | rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); |
| 287 | 286 | ||
| 288 | BUG_ON(nf_nat_pptp_hook_inbound != NULL); | 287 | BUG_ON(nf_nat_pptp_hook_inbound != NULL); |
| 289 | RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); | 288 | rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); |
| 290 | 289 | ||
| 291 | BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); | 290 | BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); |
| 292 | RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); | 291 | rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); |
| 293 | 292 | ||
| 294 | BUG_ON(nf_nat_pptp_hook_expectfn != NULL); | 293 | BUG_ON(nf_nat_pptp_hook_expectfn != NULL); |
| 295 | RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected); | 294 | rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected); |
| 296 | return 0; | 295 | return 0; |
| 297 | } | 296 | } |
| 298 | 297 | ||
| 299 | static void __exit nf_nat_helper_pptp_fini(void) | 298 | static void __exit nf_nat_helper_pptp_fini(void) |
| 300 | { | 299 | { |
| 301 | RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL); | 300 | rcu_assign_pointer(nf_nat_pptp_hook_expectfn, NULL); |
| 302 | RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL); | 301 | rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, NULL); |
| 303 | RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL); | 302 | rcu_assign_pointer(nf_nat_pptp_hook_inbound, NULL); |
| 304 | RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL); | 303 | rcu_assign_pointer(nf_nat_pptp_hook_outbound, NULL); |
| 305 | synchronize_rcu(); | 304 | synchronize_rcu(); |
| 306 | } | 305 | } |
| 307 | 306 | ||
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index ea44f02563b..bc8d83a31c7 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c | |||
| @@ -28,7 +28,8 @@ | |||
| 28 | #include <linux/ip.h> | 28 | #include <linux/ip.h> |
| 29 | 29 | ||
| 30 | #include <net/netfilter/nf_nat.h> | 30 | #include <net/netfilter/nf_nat.h> |
| 31 | #include <net/netfilter/nf_nat_l4proto.h> | 31 | #include <net/netfilter/nf_nat_rule.h> |
| 32 | #include <net/netfilter/nf_nat_protocol.h> | ||
| 32 | #include <linux/netfilter/nf_conntrack_proto_gre.h> | 33 | #include <linux/netfilter/nf_conntrack_proto_gre.h> |
| 33 | 34 | ||
| 34 | MODULE_LICENSE("GPL"); | 35 | MODULE_LICENSE("GPL"); |
| @@ -37,8 +38,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); | |||
| 37 | 38 | ||
| 38 | /* generate unique tuple ... */ | 39 | /* generate unique tuple ... */ |
| 39 | static void | 40 | static void |
| 40 | gre_unique_tuple(const struct nf_nat_l3proto *l3proto, | 41 | gre_unique_tuple(struct nf_conntrack_tuple *tuple, |
| 41 | struct nf_conntrack_tuple *tuple, | ||
| 42 | const struct nf_nat_range *range, | 42 | const struct nf_nat_range *range, |
| 43 | enum nf_nat_manip_type maniptype, | 43 | enum nf_nat_manip_type maniptype, |
| 44 | const struct nf_conn *ct) | 44 | const struct nf_conn *ct) |
| @@ -52,18 +52,18 @@ gre_unique_tuple(const struct nf_nat_l3proto *l3proto, | |||
| 52 | if (!ct->master) | 52 | if (!ct->master) |
| 53 | return; | 53 | return; |
| 54 | 54 | ||
| 55 | if (maniptype == NF_NAT_MANIP_SRC) | 55 | if (maniptype == IP_NAT_MANIP_SRC) |
| 56 | keyptr = &tuple->src.u.gre.key; | 56 | keyptr = &tuple->src.u.gre.key; |
| 57 | else | 57 | else |
| 58 | keyptr = &tuple->dst.u.gre.key; | 58 | keyptr = &tuple->dst.u.gre.key; |
| 59 | 59 | ||
| 60 | if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { | 60 | if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { |
| 61 | pr_debug("%p: NATing GRE PPTP\n", ct); | 61 | pr_debug("%p: NATing GRE PPTP\n", ct); |
| 62 | min = 1; | 62 | min = 1; |
| 63 | range_size = 0xffff; | 63 | range_size = 0xffff; |
| 64 | } else { | 64 | } else { |
| 65 | min = ntohs(range->min_proto.gre.key); | 65 | min = ntohs(range->min.gre.key); |
| 66 | range_size = ntohs(range->max_proto.gre.key) - min + 1; | 66 | range_size = ntohs(range->max.gre.key) - min + 1; |
| 67 | } | 67 | } |
| 68 | 68 | ||
| 69 | pr_debug("min = %u, range_size = %u\n", min, range_size); | 69 | pr_debug("min = %u, range_size = %u\n", min, range_size); |
| @@ -80,14 +80,14 @@ gre_unique_tuple(const struct nf_nat_l3proto *l3proto, | |||
| 80 | 80 | ||
| 81 | /* manipulate a GRE packet according to maniptype */ | 81 | /* manipulate a GRE packet according to maniptype */ |
| 82 | static bool | 82 | static bool |
| 83 | gre_manip_pkt(struct sk_buff *skb, | 83 | gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, |
| 84 | const struct nf_nat_l3proto *l3proto, | ||
| 85 | unsigned int iphdroff, unsigned int hdroff, | ||
| 86 | const struct nf_conntrack_tuple *tuple, | 84 | const struct nf_conntrack_tuple *tuple, |
| 87 | enum nf_nat_manip_type maniptype) | 85 | enum nf_nat_manip_type maniptype) |
| 88 | { | 86 | { |
| 89 | const struct gre_hdr *greh; | 87 | const struct gre_hdr *greh; |
| 90 | struct gre_hdr_pptp *pgreh; | 88 | struct gre_hdr_pptp *pgreh; |
| 89 | const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); | ||
| 90 | unsigned int hdroff = iphdroff + iph->ihl * 4; | ||
| 91 | 91 | ||
| 92 | /* pgreh includes two optional 32bit fields which are not required | 92 | /* pgreh includes two optional 32bit fields which are not required |
| 93 | * to be there. That's where the magic '8' comes from */ | 93 | * to be there. That's where the magic '8' comes from */ |
| @@ -99,7 +99,7 @@ gre_manip_pkt(struct sk_buff *skb, | |||
| 99 | 99 | ||
| 100 | /* we only have destination manip of a packet, since 'source key' | 100 | /* we only have destination manip of a packet, since 'source key' |
| 101 | * is not present in the packet itself */ | 101 | * is not present in the packet itself */ |
| 102 | if (maniptype != NF_NAT_MANIP_DST) | 102 | if (maniptype != IP_NAT_MANIP_DST) |
| 103 | return true; | 103 | return true; |
| 104 | switch (greh->version) { | 104 | switch (greh->version) { |
| 105 | case GRE_VERSION_1701: | 105 | case GRE_VERSION_1701: |
| @@ -117,24 +117,26 @@ gre_manip_pkt(struct sk_buff *skb, | |||
| 117 | return true; | 117 | return true; |
| 118 | } | 118 | } |
| 119 | 119 | ||
| 120 | static const struct nf_nat_l4proto gre = { | 120 | static const struct nf_nat_protocol gre = { |
| 121 | .l4proto = IPPROTO_GRE, | 121 | .protonum = IPPROTO_GRE, |
| 122 | .me = THIS_MODULE, | ||
| 122 | .manip_pkt = gre_manip_pkt, | 123 | .manip_pkt = gre_manip_pkt, |
| 123 | .in_range = nf_nat_l4proto_in_range, | 124 | .in_range = nf_nat_proto_in_range, |
| 124 | .unique_tuple = gre_unique_tuple, | 125 | .unique_tuple = gre_unique_tuple, |
| 125 | #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) | 126 | #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) |
| 126 | .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, | 127 | .range_to_nlattr = nf_nat_proto_range_to_nlattr, |
| 128 | .nlattr_to_range = nf_nat_proto_nlattr_to_range, | ||
| 127 | #endif | 129 | #endif |
| 128 | }; | 130 | }; |
| 129 | 131 | ||
| 130 | static int __init nf_nat_proto_gre_init(void) | 132 | static int __init nf_nat_proto_gre_init(void) |
| 131 | { | 133 | { |
| 132 | return nf_nat_l4proto_register(NFPROTO_IPV4, &gre); | 134 | return nf_nat_protocol_register(&gre); |
| 133 | } | 135 | } |
| 134 | 136 | ||
| 135 | static void __exit nf_nat_proto_gre_fini(void) | 137 | static void __exit nf_nat_proto_gre_fini(void) |
| 136 | { | 138 | { |
| 137 | nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre); | 139 | nf_nat_protocol_unregister(&gre); |
| 138 | } | 140 | } |
| 139 | 141 | ||
| 140 | module_init(nf_nat_proto_gre_init); | 142 | module_init(nf_nat_proto_gre_init); |
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index eb303471bcf..5744c3ec847 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c | |||
| @@ -8,14 +8,14 @@ | |||
| 8 | 8 | ||
| 9 | #include <linux/types.h> | 9 | #include <linux/types.h> |
| 10 | #include <linux/init.h> | 10 | #include <linux/init.h> |
| 11 | #include <linux/export.h> | ||
| 12 | #include <linux/ip.h> | 11 | #include <linux/ip.h> |
| 13 | #include <linux/icmp.h> | 12 | #include <linux/icmp.h> |
| 14 | 13 | ||
| 15 | #include <linux/netfilter.h> | 14 | #include <linux/netfilter.h> |
| 16 | #include <net/netfilter/nf_nat.h> | 15 | #include <net/netfilter/nf_nat.h> |
| 17 | #include <net/netfilter/nf_nat_core.h> | 16 | #include <net/netfilter/nf_nat_core.h> |
| 18 | #include <net/netfilter/nf_nat_l4proto.h> | 17 | #include <net/netfilter/nf_nat_rule.h> |
| 18 | #include <net/netfilter/nf_nat_protocol.h> | ||
| 19 | 19 | ||
| 20 | static bool | 20 | static bool |
| 21 | icmp_in_range(const struct nf_conntrack_tuple *tuple, | 21 | icmp_in_range(const struct nf_conntrack_tuple *tuple, |
| @@ -28,8 +28,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, | |||
| 28 | } | 28 | } |
| 29 | 29 | ||
| 30 | static void | 30 | static void |
| 31 | icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, | 31 | icmp_unique_tuple(struct nf_conntrack_tuple *tuple, |
| 32 | struct nf_conntrack_tuple *tuple, | ||
| 33 | const struct nf_nat_range *range, | 32 | const struct nf_nat_range *range, |
| 34 | enum nf_nat_manip_type maniptype, | 33 | enum nf_nat_manip_type maniptype, |
| 35 | const struct nf_conn *ct) | 34 | const struct nf_conn *ct) |
| @@ -38,14 +37,13 @@ icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, | |||
| 38 | unsigned int range_size; | 37 | unsigned int range_size; |
| 39 | unsigned int i; | 38 | unsigned int i; |
| 40 | 39 | ||
| 41 | range_size = ntohs(range->max_proto.icmp.id) - | 40 | range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; |
| 42 | ntohs(range->min_proto.icmp.id) + 1; | ||
| 43 | /* If no range specified... */ | 41 | /* If no range specified... */ |
| 44 | if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) | 42 | if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) |
| 45 | range_size = 0xFFFF; | 43 | range_size = 0xFFFF; |
| 46 | 44 | ||
| 47 | for (i = 0; ; ++id) { | 45 | for (i = 0; ; ++id) { |
| 48 | tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + | 46 | tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + |
| 49 | (id % range_size)); | 47 | (id % range_size)); |
| 50 | if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) | 48 | if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) |
| 51 | return; | 49 | return; |
| @@ -55,12 +53,13 @@ icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, | |||
| 55 | 53 | ||
| 56 | static bool | 54 | static bool |
| 57 | icmp_manip_pkt(struct sk_buff *skb, | 55 | icmp_manip_pkt(struct sk_buff *skb, |
| 58 | const struct nf_nat_l3proto *l3proto, | 56 | unsigned int iphdroff, |
| 59 | unsigned int iphdroff, unsigned int hdroff, | ||
| 60 | const struct nf_conntrack_tuple *tuple, | 57 | const struct nf_conntrack_tuple *tuple, |
| 61 | enum nf_nat_manip_type maniptype) | 58 | enum nf_nat_manip_type maniptype) |
| 62 | { | 59 | { |
| 60 | const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); | ||
| 63 | struct icmphdr *hdr; | 61 | struct icmphdr *hdr; |
| 62 | unsigned int hdroff = iphdroff + iph->ihl*4; | ||
| 64 | 63 | ||
| 65 | if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) | 64 | if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) |
| 66 | return false; | 65 | return false; |
| @@ -72,12 +71,14 @@ icmp_manip_pkt(struct sk_buff *skb, | |||
| 72 | return true; | 71 | return true; |
| 73 | } | 72 | } |
| 74 | 73 | ||
| 75 | const struct nf_nat_l4proto nf_nat_l4proto_icmp = { | 74 | const struct nf_nat_protocol nf_nat_protocol_icmp = { |
| 76 | .l4proto = IPPROTO_ICMP, | 75 | .protonum = IPPROTO_ICMP, |
| 76 | .me = THIS_MODULE, | ||
| 77 | .manip_pkt = icmp_manip_pkt, | 77 | .manip_pkt = icmp_manip_pkt, |
| 78 | .in_range = icmp_in_range, | 78 | .in_range = icmp_in_range, |
| 79 | .unique_tuple = icmp_unique_tuple, | 79 | .unique_tuple = icmp_unique_tuple, |
| 80 | #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) | 80 | #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) |
| 81 | .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, | 81 | .range_to_nlattr = nf_nat_proto_range_to_nlattr, |
| 82 | .nlattr_to_range = nf_nat_proto_nlattr_to_range, | ||
| 82 | #endif | 83 | #endif |
| 83 | }; | 84 | }; |
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index bac712293fd..076b7c8c4aa 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c | |||
| @@ -400,12 +400,15 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, | |||
| 400 | *len = 0; | 400 | *len = 0; |
| 401 | 401 | ||
| 402 | *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); | 402 | *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); |
| 403 | if (*octets == NULL) | 403 | if (*octets == NULL) { |
| 404 | if (net_ratelimit()) | ||
| 405 | pr_notice("OOM in bsalg (%d)\n", __LINE__); | ||
| 404 | return 0; | 406 | return 0; |
| 407 | } | ||
| 405 | 408 | ||
| 406 | ptr = *octets; | 409 | ptr = *octets; |
| 407 | while (ctx->pointer < eoc) { | 410 | while (ctx->pointer < eoc) { |
| 408 | if (!asn1_octet_decode(ctx, ptr++)) { | 411 | if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { |
| 409 | kfree(*octets); | 412 | kfree(*octets); |
| 410 | *octets = NULL; | 413 | *octets = NULL; |
| 411 | return 0; | 414 | return 0; |
| @@ -448,8 +451,11 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, | |||
| 448 | return 0; | 451 | return 0; |
| 449 | 452 | ||
| 450 | *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); | 453 | *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); |
| 451 | if (*oid == NULL) | 454 | if (*oid == NULL) { |
| 455 | if (net_ratelimit()) | ||
| 456 | pr_notice("OOM in bsalg (%d)\n", __LINE__); | ||
| 452 | return 0; | 457 | return 0; |
| 458 | } | ||
| 453 | 459 | ||
| 454 | optr = *oid; | 460 | optr = *oid; |
| 455 | 461 | ||
| @@ -722,6 +728,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
| 722 | *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); | 728 | *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); |
| 723 | if (*obj == NULL) { | 729 | if (*obj == NULL) { |
| 724 | kfree(id); | 730 | kfree(id); |
| 731 | if (net_ratelimit()) | ||
| 732 | pr_notice("OOM in bsalg (%d)\n", __LINE__); | ||
| 725 | return 0; | 733 | return 0; |
| 726 | } | 734 | } |
| 727 | (*obj)->syntax.l[0] = l; | 735 | (*obj)->syntax.l[0] = l; |
| @@ -736,6 +744,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
| 736 | if (*obj == NULL) { | 744 | if (*obj == NULL) { |
| 737 | kfree(p); | 745 | kfree(p); |
| 738 | kfree(id); | 746 | kfree(id); |
| 747 | if (net_ratelimit()) | ||
| 748 | pr_notice("OOM in bsalg (%d)\n", __LINE__); | ||
| 739 | return 0; | 749 | return 0; |
| 740 | } | 750 | } |
| 741 | memcpy((*obj)->syntax.c, p, len); | 751 | memcpy((*obj)->syntax.c, p, len); |
| @@ -749,6 +759,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
| 749 | *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); | 759 | *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); |
| 750 | if (*obj == NULL) { | 760 | if (*obj == NULL) { |
| 751 | kfree(id); | 761 | kfree(id); |
| 762 | if (net_ratelimit()) | ||
| 763 | pr_notice("OOM in bsalg (%d)\n", __LINE__); | ||
| 752 | return 0; | 764 | return 0; |
| 753 | } | 765 | } |
| 754 | if (!asn1_null_decode(ctx, end)) { | 766 | if (!asn1_null_decode(ctx, end)) { |
| @@ -759,7 +771,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
| 759 | } | 771 | } |
| 760 | break; | 772 | break; |
| 761 | case SNMP_OBJECTID: | 773 | case SNMP_OBJECTID: |
| 762 | if (!asn1_oid_decode(ctx, end, &lp, &len)) { | 774 | if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { |
| 763 | kfree(id); | 775 | kfree(id); |
| 764 | return 0; | 776 | return 0; |
| 765 | } | 777 | } |
| @@ -768,6 +780,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
| 768 | if (*obj == NULL) { | 780 | if (*obj == NULL) { |
| 769 | kfree(lp); | 781 | kfree(lp); |
| 770 | kfree(id); | 782 | kfree(id); |
| 783 | if (net_ratelimit()) | ||
| 784 | pr_notice("OOM in bsalg (%d)\n", __LINE__); | ||
| 771 | return 0; | 785 | return 0; |
| 772 | } | 786 | } |
| 773 | memcpy((*obj)->syntax.ul, lp, len); | 787 | memcpy((*obj)->syntax.ul, lp, len); |
| @@ -787,6 +801,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
| 787 | if (*obj == NULL) { | 801 | if (*obj == NULL) { |
| 788 | kfree(p); | 802 | kfree(p); |
| 789 | kfree(id); | 803 | kfree(id); |
| 804 | if (net_ratelimit()) | ||
| 805 | pr_notice("OOM in bsalg (%d)\n", __LINE__); | ||
| 790 | return 0; | 806 | return 0; |
| 791 | } | 807 | } |
| 792 | memcpy((*obj)->syntax.uc, p, len); | 808 | memcpy((*obj)->syntax.uc, p, len); |
| @@ -803,6 +819,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, | |||
| 803 | *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); | 819 | *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); |
| 804 | if (*obj == NULL) { | 820 | if (*obj == NULL) { |
| 805 | kfree(id); | 821 | kfree(id); |
| 822 | if (net_ratelimit()) | ||
| 823 | pr_notice("OOM in bsalg (%d)\n", __LINE__); | ||
| 806 | return 0; | 824 | return 0; |
| 807 | } | 825 | } |
| 808 | (*obj)->syntax.ul[0] = ul; | 826 | (*obj)->syntax.ul[0] = ul; |
| @@ -1206,7 +1224,8 @@ static int snmp_translate(struct nf_conn *ct, | |||
| 1206 | 1224 | ||
| 1207 | if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), | 1225 | if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), |
| 1208 | paylen, &map, &udph->check)) { | 1226 | paylen, &map, &udph->check)) { |
| 1209 | net_warn_ratelimited("bsalg: parser failed\n"); | 1227 | if (net_ratelimit()) |
| 1228 | printk(KERN_WARNING "bsalg: parser failed\n"); | ||
| 1210 | return NF_DROP; | 1229 | return NF_DROP; |
| 1211 | } | 1230 | } |
| 1212 | return NF_ACCEPT; | 1231 | return NF_ACCEPT; |
| @@ -1240,8 +1259,9 @@ static int help(struct sk_buff *skb, unsigned int protoff, | |||
| 1240 | * can mess around with the payload. | 1259 | * can mess around with the payload. |
| 1241 | */ | 1260 | */ |
| 1242 | if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { | 1261 | if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { |
| 1243 | net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", | 1262 | if (net_ratelimit()) |
| 1244 | &iph->saddr, &iph->daddr); | 1263 | printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", |
| 1264 | &iph->saddr, &iph->daddr); | ||
| 1245 | return NF_DROP; | 1265 | return NF_DROP; |
| 1246 | } | 1266 | } |
| 1247 | 1267 | ||
| @@ -1290,7 +1310,7 @@ static int __init nf_nat_snmp_basic_init(void) | |||
| 1290 | int ret = 0; | 1310 | int ret = 0; |
| 1291 | 1311 | ||
| 1292 | BUG_ON(nf_nat_snmp_hook != NULL); | 1312 | BUG_ON(nf_nat_snmp_hook != NULL); |
| 1293 | RCU_INIT_POINTER(nf_nat_snmp_hook, help); | 1313 | rcu_assign_pointer(nf_nat_snmp_hook, help); |
| 1294 | 1314 | ||
| 1295 | ret = nf_conntrack_helper_register(&snmp_trap_helper); | 1315 | ret = nf_conntrack_helper_register(&snmp_trap_helper); |
| 1296 | if (ret < 0) { | 1316 | if (ret < 0) { |
| @@ -1302,7 +1322,7 @@ static int __init nf_nat_snmp_basic_init(void) | |||
| 1302 | 1322 | ||
| 1303 | static void __exit nf_nat_snmp_basic_fini(void) | 1323 | static void __exit nf_nat_snmp_basic_fini(void) |
| 1304 | { | 1324 | { |
| 1305 | RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); | 1325 | rcu_assign_pointer(nf_nat_snmp_hook, NULL); |
| 1306 | nf_conntrack_helper_unregister(&snmp_trap_helper); | 1326 | nf_conntrack_helper_unregister(&snmp_trap_helper); |
| 1307 | } | 1327 | } |
| 1308 | 1328 | ||
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8f3d05424a3..39b403f854c 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | * | 20 | * |
| 21 | */ | 21 | */ |
| 22 | 22 | ||
| 23 | #include <asm/system.h> | ||
| 23 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
| 24 | #include <linux/types.h> | 25 | #include <linux/types.h> |
| 25 | #include <linux/fcntl.h> | 26 | #include <linux/fcntl.h> |
| @@ -38,7 +39,6 @@ | |||
| 38 | #include <net/protocol.h> | 39 | #include <net/protocol.h> |
| 39 | #include <linux/skbuff.h> | 40 | #include <linux/skbuff.h> |
| 40 | #include <linux/proc_fs.h> | 41 | #include <linux/proc_fs.h> |
| 41 | #include <linux/export.h> | ||
| 42 | #include <net/sock.h> | 42 | #include <net/sock.h> |
| 43 | #include <net/ping.h> | 43 | #include <net/ping.h> |
| 44 | #include <net/udp.h> | 44 | #include <net/udp.h> |
| @@ -51,16 +51,15 @@ static struct ping_table ping_table; | |||
| 51 | 51 | ||
| 52 | static u16 ping_port_rover; | 52 | static u16 ping_port_rover; |
| 53 | 53 | ||
| 54 | static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) | 54 | static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask) |
| 55 | { | 55 | { |
| 56 | int res = (num + net_hash_mix(net)) & mask; | 56 | int res = (num + net_hash_mix(net)) & mask; |
| 57 | |||
| 58 | pr_debug("hash(%d) = %d\n", num, res); | 57 | pr_debug("hash(%d) = %d\n", num, res); |
| 59 | return res; | 58 | return res; |
| 60 | } | 59 | } |
| 61 | 60 | ||
| 62 | static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, | 61 | static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, |
| 63 | struct net *net, unsigned int num) | 62 | struct net *net, unsigned num) |
| 64 | { | 63 | { |
| 65 | return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; | 64 | return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; |
| 66 | } | 65 | } |
| @@ -140,14 +139,13 @@ static void ping_v4_unhash(struct sock *sk) | |||
| 140 | write_lock_bh(&ping_table.lock); | 139 | write_lock_bh(&ping_table.lock); |
| 141 | hlist_nulls_del(&sk->sk_nulls_node); | 140 | hlist_nulls_del(&sk->sk_nulls_node); |
| 142 | sock_put(sk); | 141 | sock_put(sk); |
| 143 | isk->inet_num = 0; | 142 | isk->inet_num = isk->inet_sport = 0; |
| 144 | isk->inet_sport = 0; | ||
| 145 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 143 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
| 146 | write_unlock_bh(&ping_table.lock); | 144 | write_unlock_bh(&ping_table.lock); |
| 147 | } | 145 | } |
| 148 | } | 146 | } |
| 149 | 147 | ||
| 150 | static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr, | 148 | static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr, |
| 151 | u16 ident, int dif) | 149 | u16 ident, int dif) |
| 152 | { | 150 | { |
| 153 | struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); | 151 | struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); |
| @@ -155,15 +153,15 @@ static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr, | |||
| 155 | struct inet_sock *isk; | 153 | struct inet_sock *isk; |
| 156 | struct hlist_nulls_node *hnode; | 154 | struct hlist_nulls_node *hnode; |
| 157 | 155 | ||
| 158 | pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", | 156 | pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n", |
| 159 | (int)ident, &daddr, dif); | 157 | (int)ident, (unsigned long)daddr, dif); |
| 160 | read_lock_bh(&ping_table.lock); | 158 | read_lock_bh(&ping_table.lock); |
| 161 | 159 | ||
| 162 | ping_portaddr_for_each_entry(sk, hnode, hslot) { | 160 | ping_portaddr_for_each_entry(sk, hnode, hslot) { |
| 163 | isk = inet_sk(sk); | 161 | isk = inet_sk(sk); |
| 164 | 162 | ||
| 165 | pr_debug("found: %p: num = %d, daddr = %pI4, dif = %d\n", sk, | 163 | pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk, |
| 166 | (int)isk->inet_num, &isk->inet_rcv_saddr, | 164 | (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr, |
| 167 | sk->sk_bound_dev_if); | 165 | sk->sk_bound_dev_if); |
| 168 | 166 | ||
| 169 | pr_debug("iterate\n"); | 167 | pr_debug("iterate\n"); |
| @@ -185,12 +183,11 @@ exit: | |||
| 185 | return sk; | 183 | return sk; |
| 186 | } | 184 | } |
| 187 | 185 | ||
| 188 | static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, | 186 | static void inet_get_ping_group_range_net(struct net *net, gid_t *low, |
| 189 | kgid_t *high) | 187 | gid_t *high) |
| 190 | { | 188 | { |
| 191 | kgid_t *data = net->ipv4.sysctl_ping_group_range; | 189 | gid_t *data = net->ipv4.sysctl_ping_group_range; |
| 192 | unsigned int seq; | 190 | unsigned seq; |
| 193 | |||
| 194 | do { | 191 | do { |
| 195 | seq = read_seqbegin(&sysctl_local_ports.lock); | 192 | seq = read_seqbegin(&sysctl_local_ports.lock); |
| 196 | 193 | ||
| @@ -203,20 +200,21 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, | |||
| 203 | static int ping_init_sock(struct sock *sk) | 200 | static int ping_init_sock(struct sock *sk) |
| 204 | { | 201 | { |
| 205 | struct net *net = sock_net(sk); | 202 | struct net *net = sock_net(sk); |
| 206 | kgid_t group = current_egid(); | 203 | gid_t group = current_egid(); |
| 204 | gid_t range[2]; | ||
| 207 | struct group_info *group_info = get_current_groups(); | 205 | struct group_info *group_info = get_current_groups(); |
| 208 | int i, j, count = group_info->ngroups; | 206 | int i, j, count = group_info->ngroups; |
| 209 | kgid_t low, high; | ||
| 210 | 207 | ||
| 211 | inet_get_ping_group_range_net(net, &low, &high); | 208 | inet_get_ping_group_range_net(net, range, range+1); |
| 212 | if (gid_lte(low, group) && gid_lte(group, high)) | 209 | if (range[0] <= group && group <= range[1]) |
| 213 | return 0; | 210 | return 0; |
| 214 | 211 | ||
| 215 | for (i = 0; i < group_info->nblocks; i++) { | 212 | for (i = 0; i < group_info->nblocks; i++) { |
| 216 | int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); | 213 | int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); |
| 214 | |||
| 217 | for (j = 0; j < cp_count; j++) { | 215 | for (j = 0; j < cp_count; j++) { |
| 218 | kgid_t gid = group_info->blocks[i][j]; | 216 | group = group_info->blocks[i][j]; |
| 219 | if (gid_lte(low, gid) && gid_lte(gid, high)) | 217 | if (range[0] <= group && group <= range[1]) |
| 220 | return 0; | 218 | return 0; |
| 221 | } | 219 | } |
| 222 | 220 | ||
| @@ -229,7 +227,7 @@ static int ping_init_sock(struct sock *sk) | |||
| 229 | static void ping_close(struct sock *sk, long timeout) | 227 | static void ping_close(struct sock *sk, long timeout) |
| 230 | { | 228 | { |
| 231 | pr_debug("ping_close(sk=%p,sk->num=%u)\n", | 229 | pr_debug("ping_close(sk=%p,sk->num=%u)\n", |
| 232 | inet_sk(sk), inet_sk(sk)->inet_num); | 230 | inet_sk(sk), inet_sk(sk)->inet_num); |
| 233 | pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); | 231 | pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); |
| 234 | 232 | ||
| 235 | sk_common_release(sk); | 233 | sk_common_release(sk); |
| @@ -252,10 +250,10 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
| 252 | return -EINVAL; | 250 | return -EINVAL; |
| 253 | 251 | ||
| 254 | pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", | 252 | pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", |
| 255 | sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); | 253 | sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); |
| 256 | 254 | ||
| 257 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); | 255 | chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); |
| 258 | if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) | 256 | if (addr->sin_addr.s_addr == INADDR_ANY) |
| 259 | chk_addr_ret = RTN_LOCAL; | 257 | chk_addr_ret = RTN_LOCAL; |
| 260 | 258 | ||
| 261 | if ((sysctl_ip_nonlocal_bind == 0 && | 259 | if ((sysctl_ip_nonlocal_bind == 0 && |
| @@ -279,10 +277,10 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
| 279 | goto out; | 277 | goto out; |
| 280 | } | 278 | } |
| 281 | 279 | ||
| 282 | pr_debug("after bind(): num = %d, daddr = %pI4, dif = %d\n", | 280 | pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n", |
| 283 | (int)isk->inet_num, | 281 | (int)isk->inet_num, |
| 284 | &isk->inet_rcv_saddr, | 282 | (unsigned long) isk->inet_rcv_saddr, |
| 285 | (int)sk->sk_bound_dev_if); | 283 | (int)sk->sk_bound_dev_if); |
| 286 | 284 | ||
| 287 | err = 0; | 285 | err = 0; |
| 288 | if (isk->inet_rcv_saddr) | 286 | if (isk->inet_rcv_saddr) |
| @@ -335,11 +333,12 @@ void ping_err(struct sk_buff *skb, u32 info) | |||
| 335 | return; | 333 | return; |
| 336 | 334 | ||
| 337 | pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, | 335 | pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, |
| 338 | code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); | 336 | code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); |
| 339 | 337 | ||
| 340 | sk = ping_v4_lookup(net, iph->daddr, iph->saddr, | 338 | sk = ping_v4_lookup(net, iph->daddr, iph->saddr, |
| 341 | ntohs(icmph->un.echo.id), skb->dev->ifindex); | 339 | ntohs(icmph->un.echo.id), skb->dev->ifindex); |
| 342 | if (sk == NULL) { | 340 | if (sk == NULL) { |
| 341 | ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); | ||
| 343 | pr_debug("no socket, dropping\n"); | 342 | pr_debug("no socket, dropping\n"); |
| 344 | return; /* No socket for error */ | 343 | return; /* No socket for error */ |
| 345 | } | 344 | } |
| @@ -365,7 +364,6 @@ void ping_err(struct sk_buff *skb, u32 info) | |||
| 365 | break; | 364 | break; |
| 366 | case ICMP_DEST_UNREACH: | 365 | case ICMP_DEST_UNREACH: |
| 367 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ | 366 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ |
| 368 | ipv4_sk_update_pmtu(skb, sk, info); | ||
| 369 | if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { | 367 | if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { |
| 370 | err = EMSGSIZE; | 368 | err = EMSGSIZE; |
| 371 | harderr = 1; | 369 | harderr = 1; |
| @@ -381,7 +379,6 @@ void ping_err(struct sk_buff *skb, u32 info) | |||
| 381 | break; | 379 | break; |
| 382 | case ICMP_REDIRECT: | 380 | case ICMP_REDIRECT: |
| 383 | /* See ICMP_SOURCE_QUENCH */ | 381 | /* See ICMP_SOURCE_QUENCH */ |
| 384 | ipv4_sk_redirect(skb, sk); | ||
| 385 | err = EREMOTEIO; | 382 | err = EREMOTEIO; |
| 386 | break; | 383 | break; |
| 387 | } | 384 | } |
| @@ -410,10 +407,10 @@ out: | |||
| 410 | struct pingfakehdr { | 407 | struct pingfakehdr { |
| 411 | struct icmphdr icmph; | 408 | struct icmphdr icmph; |
| 412 | struct iovec *iov; | 409 | struct iovec *iov; |
| 413 | __wsum wcheck; | 410 | u32 wcheck; |
| 414 | }; | 411 | }; |
| 415 | 412 | ||
| 416 | static int ping_getfrag(void *from, char *to, | 413 | static int ping_getfrag(void *from, char * to, |
| 417 | int offset, int fraglen, int odd, struct sk_buff *skb) | 414 | int offset, int fraglen, int odd, struct sk_buff *skb) |
| 418 | { | 415 | { |
| 419 | struct pingfakehdr *pfh = (struct pingfakehdr *)from; | 416 | struct pingfakehdr *pfh = (struct pingfakehdr *)from; |
| @@ -462,7 +459,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 462 | struct rtable *rt = NULL; | 459 | struct rtable *rt = NULL; |
| 463 | struct ip_options_data opt_copy; | 460 | struct ip_options_data opt_copy; |
| 464 | int free = 0; | 461 | int free = 0; |
| 465 | __be32 saddr, daddr, faddr; | 462 | u32 saddr, daddr, faddr; |
| 466 | u8 tos; | 463 | u8 tos; |
| 467 | int err; | 464 | int err; |
| 468 | 465 | ||
| @@ -558,8 +555,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 558 | ipc.oif = inet->mc_index; | 555 | ipc.oif = inet->mc_index; |
| 559 | if (!saddr) | 556 | if (!saddr) |
| 560 | saddr = inet->mc_addr; | 557 | saddr = inet->mc_addr; |
| 561 | } else if (!ipc.oif) | 558 | } |
| 562 | ipc.oif = inet->uc_index; | ||
| 563 | 559 | ||
| 564 | flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, | 560 | flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, |
| 565 | RT_SCOPE_UNIVERSE, sk->sk_protocol, | 561 | RT_SCOPE_UNIVERSE, sk->sk_protocol, |
| @@ -633,7 +629,6 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 633 | 629 | ||
| 634 | pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); | 630 | pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); |
| 635 | 631 | ||
| 636 | err = -EOPNOTSUPP; | ||
| 637 | if (flags & MSG_OOB) | 632 | if (flags & MSG_OOB) |
| 638 | goto out; | 633 | goto out; |
| 639 | 634 | ||
| @@ -681,8 +676,9 @@ out: | |||
| 681 | static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | 676 | static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) |
| 682 | { | 677 | { |
| 683 | pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", | 678 | pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", |
| 684 | inet_sk(sk), inet_sk(sk)->inet_num, skb); | 679 | inet_sk(sk), inet_sk(sk)->inet_num, skb); |
| 685 | if (sock_queue_rcv_skb(sk, skb) < 0) { | 680 | if (sock_queue_rcv_skb(sk, skb) < 0) { |
| 681 | ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS); | ||
| 686 | kfree_skb(skb); | 682 | kfree_skb(skb); |
| 687 | pr_debug("ping_queue_rcv_skb -> failed\n"); | 683 | pr_debug("ping_queue_rcv_skb -> failed\n"); |
| 688 | return -1; | 684 | return -1; |
| @@ -701,13 +697,13 @@ void ping_rcv(struct sk_buff *skb) | |||
| 701 | struct net *net = dev_net(skb->dev); | 697 | struct net *net = dev_net(skb->dev); |
| 702 | struct iphdr *iph = ip_hdr(skb); | 698 | struct iphdr *iph = ip_hdr(skb); |
| 703 | struct icmphdr *icmph = icmp_hdr(skb); | 699 | struct icmphdr *icmph = icmp_hdr(skb); |
| 704 | __be32 saddr = iph->saddr; | 700 | u32 saddr = iph->saddr; |
| 705 | __be32 daddr = iph->daddr; | 701 | u32 daddr = iph->daddr; |
| 706 | 702 | ||
| 707 | /* We assume the packet has already been checked by icmp_rcv */ | 703 | /* We assume the packet has already been checked by icmp_rcv */ |
| 708 | 704 | ||
| 709 | pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", | 705 | pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", |
| 710 | skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); | 706 | skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); |
| 711 | 707 | ||
| 712 | /* Push ICMP header back */ | 708 | /* Push ICMP header back */ |
| 713 | skb_push(skb, skb->data - (u8 *)icmph); | 709 | skb_push(skb, skb->data - (u8 *)icmph); |
| @@ -839,9 +835,7 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f, | |||
| 839 | bucket, src, srcp, dest, destp, sp->sk_state, | 835 | bucket, src, srcp, dest, destp, sp->sk_state, |
| 840 | sk_wmem_alloc_get(sp), | 836 | sk_wmem_alloc_get(sp), |
| 841 | sk_rmem_alloc_get(sp), | 837 | sk_rmem_alloc_get(sp), |
| 842 | 0, 0L, 0, | 838 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), |
| 843 | from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), | ||
| 844 | 0, sock_i_ino(sp), | ||
| 845 | atomic_read(&sp->sk_refcnt), sp, | 839 | atomic_read(&sp->sk_refcnt), sp, |
| 846 | atomic_read(&sp->sk_drops), len); | 840 | atomic_read(&sp->sk_drops), len); |
| 847 | } | 841 | } |
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8de53e1ddd5..4bfad5da94f 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
| @@ -42,7 +42,6 @@ | |||
| 42 | #include <linux/inetdevice.h> | 42 | #include <linux/inetdevice.h> |
| 43 | #include <linux/proc_fs.h> | 43 | #include <linux/proc_fs.h> |
| 44 | #include <linux/seq_file.h> | 44 | #include <linux/seq_file.h> |
| 45 | #include <linux/export.h> | ||
| 46 | #include <net/sock.h> | 45 | #include <net/sock.h> |
| 47 | #include <net/raw.h> | 46 | #include <net/raw.h> |
| 48 | 47 | ||
| @@ -56,17 +55,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) | |||
| 56 | 55 | ||
| 57 | local_bh_disable(); | 56 | local_bh_disable(); |
| 58 | orphans = percpu_counter_sum_positive(&tcp_orphan_count); | 57 | orphans = percpu_counter_sum_positive(&tcp_orphan_count); |
| 59 | sockets = proto_sockets_allocated_sum_positive(&tcp_prot); | 58 | sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); |
| 60 | local_bh_enable(); | 59 | local_bh_enable(); |
| 61 | 60 | ||
| 62 | socket_seq_show(seq); | 61 | socket_seq_show(seq); |
| 63 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", | 62 | seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", |
| 64 | sock_prot_inuse_get(net, &tcp_prot), orphans, | 63 | sock_prot_inuse_get(net, &tcp_prot), orphans, |
| 65 | tcp_death_row.tw_count, sockets, | 64 | tcp_death_row.tw_count, sockets, |
| 66 | proto_memory_allocated(&tcp_prot)); | 65 | atomic_long_read(&tcp_memory_allocated)); |
| 67 | seq_printf(seq, "UDP: inuse %d mem %ld\n", | 66 | seq_printf(seq, "UDP: inuse %d mem %ld\n", |
| 68 | sock_prot_inuse_get(net, &udp_prot), | 67 | sock_prot_inuse_get(net, &udp_prot), |
| 69 | proto_memory_allocated(&udp_prot)); | 68 | atomic_long_read(&udp_memory_allocated)); |
| 70 | seq_printf(seq, "UDPLITE: inuse %d\n", | 69 | seq_printf(seq, "UDPLITE: inuse %d\n", |
| 71 | sock_prot_inuse_get(net, &udplite_prot)); | 70 | sock_prot_inuse_get(net, &udplite_prot)); |
| 72 | seq_printf(seq, "RAW: inuse %d\n", | 71 | seq_printf(seq, "RAW: inuse %d\n", |
| @@ -216,6 +215,7 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
| 216 | SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), | 215 | SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), |
| 217 | SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), | 216 | SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), |
| 218 | SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), | 217 | SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), |
| 218 | SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS), | ||
| 219 | SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), | 219 | SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), |
| 220 | SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), | 220 | SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), |
| 221 | SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), | 221 | SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), |
| @@ -232,6 +232,7 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
| 232 | SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), | 232 | SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), |
| 233 | SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), | 233 | SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), |
| 234 | SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), | 234 | SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), |
| 235 | SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), | ||
| 235 | SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), | 236 | SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), |
| 236 | SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), | 237 | SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), |
| 237 | SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), | 238 | SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), |
| @@ -255,18 +256,6 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
| 255 | SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), | 256 | SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), |
| 256 | SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), | 257 | SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), |
| 257 | SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), | 258 | SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), |
| 258 | SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), | ||
| 259 | SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), | ||
| 260 | SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), | ||
| 261 | SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), | ||
| 262 | SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), | ||
| 263 | SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), | ||
| 264 | SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), | ||
| 265 | SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), | ||
| 266 | SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), | ||
| 267 | SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), | ||
| 268 | SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), | ||
| 269 | SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), | ||
| 270 | SNMP_MIB_SENTINEL | 259 | SNMP_MIB_SENTINEL |
| 271 | }; | 260 | }; |
| 272 | 261 | ||
| @@ -298,7 +287,7 @@ static void icmpmsg_put(struct seq_file *seq) | |||
| 298 | 287 | ||
| 299 | count = 0; | 288 | count = 0; |
| 300 | for (i = 0; i < ICMPMSG_MIB_MAX; i++) { | 289 | for (i = 0; i < ICMPMSG_MIB_MAX; i++) { |
| 301 | val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); | 290 | val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i); |
| 302 | if (val) { | 291 | if (val) { |
| 303 | type[count] = i; | 292 | type[count] = i; |
| 304 | vals[count++] = val; | 293 | vals[count++] = val; |
| @@ -317,7 +306,6 @@ static void icmp_put(struct seq_file *seq) | |||
| 317 | { | 306 | { |
| 318 | int i; | 307 | int i; |
| 319 | struct net *net = seq->private; | 308 | struct net *net = seq->private; |
| 320 | atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; | ||
| 321 | 309 | ||
| 322 | seq_puts(seq, "\nIcmp: InMsgs InErrors"); | 310 | seq_puts(seq, "\nIcmp: InMsgs InErrors"); |
| 323 | for (i=0; icmpmibmap[i].name != NULL; i++) | 311 | for (i=0; icmpmibmap[i].name != NULL; i++) |
| @@ -330,13 +318,15 @@ static void icmp_put(struct seq_file *seq) | |||
| 330 | snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); | 318 | snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); |
| 331 | for (i=0; icmpmibmap[i].name != NULL; i++) | 319 | for (i=0; icmpmibmap[i].name != NULL; i++) |
| 332 | seq_printf(seq, " %lu", | 320 | seq_printf(seq, " %lu", |
| 333 | atomic_long_read(ptr + icmpmibmap[i].index)); | 321 | snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, |
| 322 | icmpmibmap[i].index)); | ||
| 334 | seq_printf(seq, " %lu %lu", | 323 | seq_printf(seq, " %lu %lu", |
| 335 | snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), | 324 | snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), |
| 336 | snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); | 325 | snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); |
| 337 | for (i=0; icmpmibmap[i].name != NULL; i++) | 326 | for (i=0; icmpmibmap[i].name != NULL; i++) |
| 338 | seq_printf(seq, " %lu", | 327 | seq_printf(seq, " %lu", |
| 339 | atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); | 328 | snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, |
| 329 | icmpmibmap[i].index | 0x100)); | ||
| 340 | } | 330 | } |
| 341 | 331 | ||
| 342 | /* | 332 | /* |
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 0f9d09f54bd..9ae5c01cd0b 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c | |||
| @@ -29,7 +29,6 @@ | |||
| 29 | #include <net/protocol.h> | 29 | #include <net/protocol.h> |
| 30 | 30 | ||
| 31 | const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; | 31 | const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; |
| 32 | const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; | ||
| 33 | 32 | ||
| 34 | /* | 33 | /* |
| 35 | * Add a protocol handler to the hash tables | 34 | * Add a protocol handler to the hash tables |
| @@ -37,17 +36,12 @@ const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; | |||
| 37 | 36 | ||
| 38 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) | 37 | int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) |
| 39 | { | 38 | { |
| 40 | return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], | 39 | int hash = protocol & (MAX_INET_PROTOS - 1); |
| 41 | NULL, prot) ? 0 : -1; | ||
| 42 | } | ||
| 43 | EXPORT_SYMBOL(inet_add_protocol); | ||
| 44 | 40 | ||
| 45 | int inet_add_offload(const struct net_offload *prot, unsigned char protocol) | 41 | return !cmpxchg((const struct net_protocol **)&inet_protos[hash], |
| 46 | { | ||
| 47 | return !cmpxchg((const struct net_offload **)&inet_offloads[protocol], | ||
| 48 | NULL, prot) ? 0 : -1; | 42 | NULL, prot) ? 0 : -1; |
| 49 | } | 43 | } |
| 50 | EXPORT_SYMBOL(inet_add_offload); | 44 | EXPORT_SYMBOL(inet_add_protocol); |
| 51 | 45 | ||
| 52 | /* | 46 | /* |
| 53 | * Remove a protocol from the hash tables. | 47 | * Remove a protocol from the hash tables. |
| @@ -55,9 +49,9 @@ EXPORT_SYMBOL(inet_add_offload); | |||
| 55 | 49 | ||
| 56 | int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) | 50 | int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) |
| 57 | { | 51 | { |
| 58 | int ret; | 52 | int ret, hash = protocol & (MAX_INET_PROTOS - 1); |
| 59 | 53 | ||
| 60 | ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol], | 54 | ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], |
| 61 | prot, NULL) == prot) ? 0 : -1; | 55 | prot, NULL) == prot) ? 0 : -1; |
| 62 | 56 | ||
| 63 | synchronize_net(); | 57 | synchronize_net(); |
| @@ -65,16 +59,3 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) | |||
| 65 | return ret; | 59 | return ret; |
| 66 | } | 60 | } |
| 67 | EXPORT_SYMBOL(inet_del_protocol); | 61 | EXPORT_SYMBOL(inet_del_protocol); |
| 68 | |||
| 69 | int inet_del_offload(const struct net_offload *prot, unsigned char protocol) | ||
| 70 | { | ||
| 71 | int ret; | ||
| 72 | |||
| 73 | ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol], | ||
| 74 | prot, NULL) == prot) ? 0 : -1; | ||
| 75 | |||
| 76 | synchronize_net(); | ||
| 77 | |||
| 78 | return ret; | ||
| 79 | } | ||
| 80 | EXPORT_SYMBOL(inet_del_offload); | ||
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 73d1e4df4bf..61714bd5292 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
| @@ -48,7 +48,6 @@ | |||
| 48 | #include <linux/errno.h> | 48 | #include <linux/errno.h> |
| 49 | #include <linux/aio.h> | 49 | #include <linux/aio.h> |
| 50 | #include <linux/kernel.h> | 50 | #include <linux/kernel.h> |
| 51 | #include <linux/export.h> | ||
| 52 | #include <linux/spinlock.h> | 51 | #include <linux/spinlock.h> |
| 53 | #include <linux/sockios.h> | 52 | #include <linux/sockios.h> |
| 54 | #include <linux/socket.h> | 53 | #include <linux/socket.h> |
| @@ -131,20 +130,18 @@ found: | |||
| 131 | * 0 - deliver | 130 | * 0 - deliver |
| 132 | * 1 - block | 131 | * 1 - block |
| 133 | */ | 132 | */ |
| 134 | static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) | 133 | static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) |
| 135 | { | 134 | { |
| 136 | struct icmphdr _hdr; | 135 | int type; |
| 137 | const struct icmphdr *hdr; | ||
| 138 | 136 | ||
| 139 | hdr = skb_header_pointer(skb, skb_transport_offset(skb), | 137 | if (!pskb_may_pull(skb, sizeof(struct icmphdr))) |
| 140 | sizeof(_hdr), &_hdr); | ||
| 141 | if (!hdr) | ||
| 142 | return 1; | 138 | return 1; |
| 143 | 139 | ||
| 144 | if (hdr->type < 32) { | 140 | type = icmp_hdr(skb)->type; |
| 141 | if (type < 32) { | ||
| 145 | __u32 data = raw_sk(sk)->filter.data; | 142 | __u32 data = raw_sk(sk)->filter.data; |
| 146 | 143 | ||
| 147 | return ((1U << hdr->type) & data) != 0; | 144 | return ((1 << type) & data) != 0; |
| 148 | } | 145 | } |
| 149 | 146 | ||
| 150 | /* Do not block unknown ICMP types */ | 147 | /* Do not block unknown ICMP types */ |
| @@ -218,11 +215,6 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) | |||
| 218 | int err = 0; | 215 | int err = 0; |
| 219 | int harderr = 0; | 216 | int harderr = 0; |
| 220 | 217 | ||
| 221 | if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) | ||
| 222 | ipv4_sk_update_pmtu(skb, sk, info); | ||
| 223 | else if (type == ICMP_REDIRECT) | ||
| 224 | ipv4_sk_redirect(skb, sk); | ||
| 225 | |||
| 226 | /* Report error on raw socket, if: | 218 | /* Report error on raw socket, if: |
| 227 | 1. User requested ip_recverr. | 219 | 1. User requested ip_recverr. |
| 228 | 2. Socket is connected (otherwise the error indication | 220 | 2. Socket is connected (otherwise the error indication |
| @@ -295,12 +287,11 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) | |||
| 295 | read_unlock(&raw_v4_hashinfo.lock); | 287 | read_unlock(&raw_v4_hashinfo.lock); |
| 296 | } | 288 | } |
| 297 | 289 | ||
| 298 | static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) | 290 | static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) |
| 299 | { | 291 | { |
| 300 | /* Charge it to the socket. */ | 292 | /* Charge it to the socket. */ |
| 301 | 293 | ||
| 302 | ipv4_pktinfo_prepare(skb); | 294 | if (ip_queue_rcv_skb(sk, skb) < 0) { |
| 303 | if (sock_queue_rcv_skb(sk, skb) < 0) { | ||
| 304 | kfree_skb(skb); | 295 | kfree_skb(skb); |
| 305 | return NET_RX_DROP; | 296 | return NET_RX_DROP; |
| 306 | } | 297 | } |
| @@ -335,7 +326,6 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, | |||
| 335 | unsigned int iphlen; | 326 | unsigned int iphlen; |
| 336 | int err; | 327 | int err; |
| 337 | struct rtable *rt = *rtp; | 328 | struct rtable *rt = *rtp; |
| 338 | int hlen, tlen; | ||
| 339 | 329 | ||
| 340 | if (length > rt->dst.dev->mtu) { | 330 | if (length > rt->dst.dev->mtu) { |
| 341 | ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, | 331 | ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, |
| @@ -345,14 +335,12 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, | |||
| 345 | if (flags&MSG_PROBE) | 335 | if (flags&MSG_PROBE) |
| 346 | goto out; | 336 | goto out; |
| 347 | 337 | ||
| 348 | hlen = LL_RESERVED_SPACE(rt->dst.dev); | ||
| 349 | tlen = rt->dst.dev->needed_tailroom; | ||
| 350 | skb = sock_alloc_send_skb(sk, | 338 | skb = sock_alloc_send_skb(sk, |
| 351 | length + hlen + tlen + 15, | 339 | length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, |
| 352 | flags & MSG_DONTWAIT, &err); | 340 | flags & MSG_DONTWAIT, &err); |
| 353 | if (skb == NULL) | 341 | if (skb == NULL) |
| 354 | goto error; | 342 | goto error; |
| 355 | skb_reserve(skb, hlen); | 343 | skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); |
| 356 | 344 | ||
| 357 | skb->priority = sk->sk_priority; | 345 | skb->priority = sk->sk_priority; |
| 358 | skb->mark = sk->sk_mark; | 346 | skb->mark = sk->sk_mark; |
| @@ -498,8 +486,11 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 498 | if (msg->msg_namelen < sizeof(*usin)) | 486 | if (msg->msg_namelen < sizeof(*usin)) |
| 499 | goto out; | 487 | goto out; |
| 500 | if (usin->sin_family != AF_INET) { | 488 | if (usin->sin_family != AF_INET) { |
| 501 | pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", | 489 | static int complained; |
| 502 | __func__, current->comm); | 490 | if (!complained++) |
| 491 | printk(KERN_INFO "%s forgot to set AF_INET in " | ||
| 492 | "raw sendmsg. Fix it!\n", | ||
| 493 | current->comm); | ||
| 503 | err = -EAFNOSUPPORT; | 494 | err = -EAFNOSUPPORT; |
| 504 | if (usin->sin_family) | 495 | if (usin->sin_family) |
| 505 | goto out; | 496 | goto out; |
| @@ -567,8 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 567 | ipc.oif = inet->mc_index; | 558 | ipc.oif = inet->mc_index; |
| 568 | if (!saddr) | 559 | if (!saddr) |
| 569 | saddr = inet->mc_addr; | 560 | saddr = inet->mc_addr; |
| 570 | } else if (!ipc.oif) | 561 | } |
| 571 | ipc.oif = inet->uc_index; | ||
| 572 | 562 | ||
| 573 | flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, | 563 | flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, |
| 574 | RT_SCOPE_UNIVERSE, | 564 | RT_SCOPE_UNIVERSE, |
| @@ -994,9 +984,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) | |||
| 994 | i, src, srcp, dest, destp, sp->sk_state, | 984 | i, src, srcp, dest, destp, sp->sk_state, |
| 995 | sk_wmem_alloc_get(sp), | 985 | sk_wmem_alloc_get(sp), |
| 996 | sk_rmem_alloc_get(sp), | 986 | sk_rmem_alloc_get(sp), |
| 997 | 0, 0L, 0, | 987 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), |
| 998 | from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), | ||
| 999 | 0, sock_i_ino(sp), | ||
| 1000 | atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); | 988 | atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); |
| 1001 | } | 989 | } |
| 1002 | 990 | ||
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 844a9ef60db..b5638545deb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
| @@ -62,14 +62,14 @@ | |||
| 62 | * 2 of the License, or (at your option) any later version. | 62 | * 2 of the License, or (at your option) any later version. |
| 63 | */ | 63 | */ |
| 64 | 64 | ||
| 65 | #define pr_fmt(fmt) "IPv4: " fmt | ||
| 66 | |||
| 67 | #include <linux/module.h> | 65 | #include <linux/module.h> |
| 68 | #include <asm/uaccess.h> | 66 | #include <asm/uaccess.h> |
| 67 | #include <asm/system.h> | ||
| 69 | #include <linux/bitops.h> | 68 | #include <linux/bitops.h> |
| 70 | #include <linux/types.h> | 69 | #include <linux/types.h> |
| 71 | #include <linux/kernel.h> | 70 | #include <linux/kernel.h> |
| 72 | #include <linux/mm.h> | 71 | #include <linux/mm.h> |
| 72 | #include <linux/bootmem.h> | ||
| 73 | #include <linux/string.h> | 73 | #include <linux/string.h> |
| 74 | #include <linux/socket.h> | 74 | #include <linux/socket.h> |
| 75 | #include <linux/sockios.h> | 75 | #include <linux/sockios.h> |
| @@ -79,6 +79,7 @@ | |||
| 79 | #include <linux/netdevice.h> | 79 | #include <linux/netdevice.h> |
| 80 | #include <linux/proc_fs.h> | 80 | #include <linux/proc_fs.h> |
| 81 | #include <linux/init.h> | 81 | #include <linux/init.h> |
| 82 | #include <linux/workqueue.h> | ||
| 82 | #include <linux/skbuff.h> | 83 | #include <linux/skbuff.h> |
| 83 | #include <linux/inetdevice.h> | 84 | #include <linux/inetdevice.h> |
| 84 | #include <linux/igmp.h> | 85 | #include <linux/igmp.h> |
| @@ -86,9 +87,11 @@ | |||
| 86 | #include <linux/mroute.h> | 87 | #include <linux/mroute.h> |
| 87 | #include <linux/netfilter_ipv4.h> | 88 | #include <linux/netfilter_ipv4.h> |
| 88 | #include <linux/random.h> | 89 | #include <linux/random.h> |
| 90 | #include <linux/jhash.h> | ||
| 89 | #include <linux/rcupdate.h> | 91 | #include <linux/rcupdate.h> |
| 90 | #include <linux/times.h> | 92 | #include <linux/times.h> |
| 91 | #include <linux/slab.h> | 93 | #include <linux/slab.h> |
| 94 | #include <linux/prefetch.h> | ||
| 92 | #include <net/dst.h> | 95 | #include <net/dst.h> |
| 93 | #include <net/net_namespace.h> | 96 | #include <net/net_namespace.h> |
| 94 | #include <net/protocol.h> | 97 | #include <net/protocol.h> |
| @@ -105,8 +108,8 @@ | |||
| 105 | #include <net/rtnetlink.h> | 108 | #include <net/rtnetlink.h> |
| 106 | #ifdef CONFIG_SYSCTL | 109 | #ifdef CONFIG_SYSCTL |
| 107 | #include <linux/sysctl.h> | 110 | #include <linux/sysctl.h> |
| 108 | #include <linux/kmemleak.h> | ||
| 109 | #endif | 111 | #endif |
| 112 | #include <net/atmclip.h> | ||
| 110 | #include <net/secure_seq.h> | 113 | #include <net/secure_seq.h> |
| 111 | 114 | ||
| 112 | #define RT_FL_TOS(oldflp4) \ | 115 | #define RT_FL_TOS(oldflp4) \ |
| @@ -118,7 +121,7 @@ | |||
| 118 | 121 | ||
| 119 | static int ip_rt_max_size; | 122 | static int ip_rt_max_size; |
| 120 | static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; | 123 | static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; |
| 121 | static int ip_rt_gc_interval __read_mostly = 60 * HZ; | 124 | static int ip_rt_gc_interval __read_mostly = 60 * HZ; |
| 122 | static int ip_rt_gc_min_interval __read_mostly = HZ / 2; | 125 | static int ip_rt_gc_min_interval __read_mostly = HZ / 2; |
| 123 | static int ip_rt_redirect_number __read_mostly = 9; | 126 | static int ip_rt_redirect_number __read_mostly = 9; |
| 124 | static int ip_rt_redirect_load __read_mostly = HZ / 50; | 127 | static int ip_rt_redirect_load __read_mostly = HZ / 50; |
| @@ -129,6 +132,11 @@ static int ip_rt_gc_elasticity __read_mostly = 8; | |||
| 129 | static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; | 132 | static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; |
| 130 | static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; | 133 | static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; |
| 131 | static int ip_rt_min_advmss __read_mostly = 256; | 134 | static int ip_rt_min_advmss __read_mostly = 256; |
| 135 | static int rt_chain_length_max __read_mostly = 20; | ||
| 136 | static int redirect_genid; | ||
| 137 | |||
| 138 | static struct delayed_work expires_work; | ||
| 139 | static unsigned long expires_ljiffies; | ||
| 132 | 140 | ||
| 133 | /* | 141 | /* |
| 134 | * Interface to generic destination cache. | 142 | * Interface to generic destination cache. |
| @@ -136,14 +144,12 @@ static int ip_rt_min_advmss __read_mostly = 256; | |||
| 136 | 144 | ||
| 137 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); | 145 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); |
| 138 | static unsigned int ipv4_default_advmss(const struct dst_entry *dst); | 146 | static unsigned int ipv4_default_advmss(const struct dst_entry *dst); |
| 139 | static unsigned int ipv4_mtu(const struct dst_entry *dst); | 147 | static unsigned int ipv4_default_mtu(const struct dst_entry *dst); |
| 148 | static void ipv4_dst_destroy(struct dst_entry *dst); | ||
| 140 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); | 149 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); |
| 141 | static void ipv4_link_failure(struct sk_buff *skb); | 150 | static void ipv4_link_failure(struct sk_buff *skb); |
| 142 | static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, | 151 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); |
| 143 | struct sk_buff *skb, u32 mtu); | 152 | static int rt_garbage_collect(struct dst_ops *ops); |
| 144 | static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, | ||
| 145 | struct sk_buff *skb); | ||
| 146 | static void ipv4_dst_destroy(struct dst_entry *dst); | ||
| 147 | 153 | ||
| 148 | static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | 154 | static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, |
| 149 | int how) | 155 | int how) |
| @@ -152,27 +158,54 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, | |||
| 152 | 158 | ||
| 153 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) | 159 | static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) |
| 154 | { | 160 | { |
| 155 | WARN_ON(1); | 161 | struct rtable *rt = (struct rtable *) dst; |
| 156 | return NULL; | 162 | struct inet_peer *peer; |
| 163 | u32 *p = NULL; | ||
| 164 | |||
| 165 | if (!rt->peer) | ||
| 166 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
| 167 | |||
| 168 | peer = rt->peer; | ||
| 169 | if (peer) { | ||
| 170 | u32 *old_p = __DST_METRICS_PTR(old); | ||
| 171 | unsigned long prev, new; | ||
| 172 | |||
| 173 | p = peer->metrics; | ||
| 174 | if (inet_metrics_new(peer)) | ||
| 175 | memcpy(p, old_p, sizeof(u32) * RTAX_MAX); | ||
| 176 | |||
| 177 | new = (unsigned long) p; | ||
| 178 | prev = cmpxchg(&dst->_metrics, old, new); | ||
| 179 | |||
| 180 | if (prev != old) { | ||
| 181 | p = __DST_METRICS_PTR(prev); | ||
| 182 | if (prev & DST_METRICS_READ_ONLY) | ||
| 183 | p = NULL; | ||
| 184 | } else { | ||
| 185 | if (rt->fi) { | ||
| 186 | fib_info_put(rt->fi); | ||
| 187 | rt->fi = NULL; | ||
| 188 | } | ||
| 189 | } | ||
| 190 | } | ||
| 191 | return p; | ||
| 157 | } | 192 | } |
| 158 | 193 | ||
| 159 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, | 194 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); |
| 160 | struct sk_buff *skb, | ||
| 161 | const void *daddr); | ||
| 162 | 195 | ||
| 163 | static struct dst_ops ipv4_dst_ops = { | 196 | static struct dst_ops ipv4_dst_ops = { |
| 164 | .family = AF_INET, | 197 | .family = AF_INET, |
| 165 | .protocol = cpu_to_be16(ETH_P_IP), | 198 | .protocol = cpu_to_be16(ETH_P_IP), |
| 199 | .gc = rt_garbage_collect, | ||
| 166 | .check = ipv4_dst_check, | 200 | .check = ipv4_dst_check, |
| 167 | .default_advmss = ipv4_default_advmss, | 201 | .default_advmss = ipv4_default_advmss, |
| 168 | .mtu = ipv4_mtu, | 202 | .default_mtu = ipv4_default_mtu, |
| 169 | .cow_metrics = ipv4_cow_metrics, | 203 | .cow_metrics = ipv4_cow_metrics, |
| 170 | .destroy = ipv4_dst_destroy, | 204 | .destroy = ipv4_dst_destroy, |
| 171 | .ifdown = ipv4_dst_ifdown, | 205 | .ifdown = ipv4_dst_ifdown, |
| 172 | .negative_advice = ipv4_negative_advice, | 206 | .negative_advice = ipv4_negative_advice, |
| 173 | .link_failure = ipv4_link_failure, | 207 | .link_failure = ipv4_link_failure, |
| 174 | .update_pmtu = ip_rt_update_pmtu, | 208 | .update_pmtu = ip_rt_update_pmtu, |
| 175 | .redirect = ip_do_redirect, | ||
| 176 | .local_out = __ip_local_out, | 209 | .local_out = __ip_local_out, |
| 177 | .neigh_lookup = ipv4_neigh_lookup, | 210 | .neigh_lookup = ipv4_neigh_lookup, |
| 178 | }; | 211 | }; |
| @@ -197,27 +230,186 @@ const __u8 ip_tos2prio[16] = { | |||
| 197 | TC_PRIO_INTERACTIVE_BULK, | 230 | TC_PRIO_INTERACTIVE_BULK, |
| 198 | ECN_OR_COST(INTERACTIVE_BULK) | 231 | ECN_OR_COST(INTERACTIVE_BULK) |
| 199 | }; | 232 | }; |
| 200 | EXPORT_SYMBOL(ip_tos2prio); | 233 | |
| 234 | |||
| 235 | /* | ||
| 236 | * Route cache. | ||
| 237 | */ | ||
| 238 | |||
| 239 | /* The locking scheme is rather straight forward: | ||
| 240 | * | ||
| 241 | * 1) Read-Copy Update protects the buckets of the central route hash. | ||
| 242 | * 2) Only writers remove entries, and they hold the lock | ||
| 243 | * as they look at rtable reference counts. | ||
| 244 | * 3) Only readers acquire references to rtable entries, | ||
| 245 | * they do so with atomic increments and with the | ||
| 246 | * lock held. | ||
| 247 | */ | ||
| 248 | |||
| 249 | struct rt_hash_bucket { | ||
| 250 | struct rtable __rcu *chain; | ||
| 251 | }; | ||
| 252 | |||
| 253 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ | ||
| 254 | defined(CONFIG_PROVE_LOCKING) | ||
| 255 | /* | ||
| 256 | * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks | ||
| 257 | * The size of this table is a power of two and depends on the number of CPUS. | ||
| 258 | * (on lockdep we have a quite big spinlock_t, so keep the size down there) | ||
| 259 | */ | ||
| 260 | #ifdef CONFIG_LOCKDEP | ||
| 261 | # define RT_HASH_LOCK_SZ 256 | ||
| 262 | #else | ||
| 263 | # if NR_CPUS >= 32 | ||
| 264 | # define RT_HASH_LOCK_SZ 4096 | ||
| 265 | # elif NR_CPUS >= 16 | ||
| 266 | # define RT_HASH_LOCK_SZ 2048 | ||
| 267 | # elif NR_CPUS >= 8 | ||
| 268 | # define RT_HASH_LOCK_SZ 1024 | ||
| 269 | # elif NR_CPUS >= 4 | ||
| 270 | # define RT_HASH_LOCK_SZ 512 | ||
| 271 | # else | ||
| 272 | # define RT_HASH_LOCK_SZ 256 | ||
| 273 | # endif | ||
| 274 | #endif | ||
| 275 | |||
| 276 | static spinlock_t *rt_hash_locks; | ||
| 277 | # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] | ||
| 278 | |||
| 279 | static __init void rt_hash_lock_init(void) | ||
| 280 | { | ||
| 281 | int i; | ||
| 282 | |||
| 283 | rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, | ||
| 284 | GFP_KERNEL); | ||
| 285 | if (!rt_hash_locks) | ||
| 286 | panic("IP: failed to allocate rt_hash_locks\n"); | ||
| 287 | |||
| 288 | for (i = 0; i < RT_HASH_LOCK_SZ; i++) | ||
| 289 | spin_lock_init(&rt_hash_locks[i]); | ||
| 290 | } | ||
| 291 | #else | ||
| 292 | # define rt_hash_lock_addr(slot) NULL | ||
| 293 | |||
| 294 | static inline void rt_hash_lock_init(void) | ||
| 295 | { | ||
| 296 | } | ||
| 297 | #endif | ||
| 298 | |||
| 299 | static struct rt_hash_bucket *rt_hash_table __read_mostly; | ||
| 300 | static unsigned rt_hash_mask __read_mostly; | ||
| 301 | static unsigned int rt_hash_log __read_mostly; | ||
| 201 | 302 | ||
| 202 | static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); | 303 | static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); |
| 203 | #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) | 304 | #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) |
| 204 | 305 | ||
| 306 | static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, | ||
| 307 | int genid) | ||
| 308 | { | ||
| 309 | return jhash_3words((__force u32)daddr, (__force u32)saddr, | ||
| 310 | idx, genid) | ||
| 311 | & rt_hash_mask; | ||
| 312 | } | ||
| 313 | |||
| 314 | static inline int rt_genid(struct net *net) | ||
| 315 | { | ||
| 316 | return atomic_read(&net->ipv4.rt_genid); | ||
| 317 | } | ||
| 318 | |||
| 205 | #ifdef CONFIG_PROC_FS | 319 | #ifdef CONFIG_PROC_FS |
| 320 | struct rt_cache_iter_state { | ||
| 321 | struct seq_net_private p; | ||
| 322 | int bucket; | ||
| 323 | int genid; | ||
| 324 | }; | ||
| 325 | |||
| 326 | static struct rtable *rt_cache_get_first(struct seq_file *seq) | ||
| 327 | { | ||
| 328 | struct rt_cache_iter_state *st = seq->private; | ||
| 329 | struct rtable *r = NULL; | ||
| 330 | |||
| 331 | for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { | ||
| 332 | if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) | ||
| 333 | continue; | ||
| 334 | rcu_read_lock_bh(); | ||
| 335 | r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); | ||
| 336 | while (r) { | ||
| 337 | if (dev_net(r->dst.dev) == seq_file_net(seq) && | ||
| 338 | r->rt_genid == st->genid) | ||
| 339 | return r; | ||
| 340 | r = rcu_dereference_bh(r->dst.rt_next); | ||
| 341 | } | ||
| 342 | rcu_read_unlock_bh(); | ||
| 343 | } | ||
| 344 | return r; | ||
| 345 | } | ||
| 346 | |||
| 347 | static struct rtable *__rt_cache_get_next(struct seq_file *seq, | ||
| 348 | struct rtable *r) | ||
| 349 | { | ||
| 350 | struct rt_cache_iter_state *st = seq->private; | ||
| 351 | |||
| 352 | r = rcu_dereference_bh(r->dst.rt_next); | ||
| 353 | while (!r) { | ||
| 354 | rcu_read_unlock_bh(); | ||
| 355 | do { | ||
| 356 | if (--st->bucket < 0) | ||
| 357 | return NULL; | ||
| 358 | } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); | ||
| 359 | rcu_read_lock_bh(); | ||
| 360 | r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); | ||
| 361 | } | ||
| 362 | return r; | ||
| 363 | } | ||
| 364 | |||
| 365 | static struct rtable *rt_cache_get_next(struct seq_file *seq, | ||
| 366 | struct rtable *r) | ||
| 367 | { | ||
| 368 | struct rt_cache_iter_state *st = seq->private; | ||
| 369 | while ((r = __rt_cache_get_next(seq, r)) != NULL) { | ||
| 370 | if (dev_net(r->dst.dev) != seq_file_net(seq)) | ||
| 371 | continue; | ||
| 372 | if (r->rt_genid == st->genid) | ||
| 373 | break; | ||
| 374 | } | ||
| 375 | return r; | ||
| 376 | } | ||
| 377 | |||
| 378 | static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) | ||
| 379 | { | ||
| 380 | struct rtable *r = rt_cache_get_first(seq); | ||
| 381 | |||
| 382 | if (r) | ||
| 383 | while (pos && (r = rt_cache_get_next(seq, r))) | ||
| 384 | --pos; | ||
| 385 | return pos ? NULL : r; | ||
| 386 | } | ||
| 387 | |||
| 206 | static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) | 388 | static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) |
| 207 | { | 389 | { |
| 390 | struct rt_cache_iter_state *st = seq->private; | ||
| 208 | if (*pos) | 391 | if (*pos) |
| 209 | return NULL; | 392 | return rt_cache_get_idx(seq, *pos - 1); |
| 393 | st->genid = rt_genid(seq_file_net(seq)); | ||
| 210 | return SEQ_START_TOKEN; | 394 | return SEQ_START_TOKEN; |
| 211 | } | 395 | } |
| 212 | 396 | ||
| 213 | static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 397 | static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
| 214 | { | 398 | { |
| 399 | struct rtable *r; | ||
| 400 | |||
| 401 | if (v == SEQ_START_TOKEN) | ||
| 402 | r = rt_cache_get_first(seq); | ||
| 403 | else | ||
| 404 | r = rt_cache_get_next(seq, v); | ||
| 215 | ++*pos; | 405 | ++*pos; |
| 216 | return NULL; | 406 | return r; |
| 217 | } | 407 | } |
| 218 | 408 | ||
| 219 | static void rt_cache_seq_stop(struct seq_file *seq, void *v) | 409 | static void rt_cache_seq_stop(struct seq_file *seq, void *v) |
| 220 | { | 410 | { |
| 411 | if (v && v != SEQ_START_TOKEN) | ||
| 412 | rcu_read_unlock_bh(); | ||
| 221 | } | 413 | } |
| 222 | 414 | ||
| 223 | static int rt_cache_seq_show(struct seq_file *seq, void *v) | 415 | static int rt_cache_seq_show(struct seq_file *seq, void *v) |
| @@ -227,6 +419,34 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) | |||
| 227 | "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" | 419 | "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" |
| 228 | "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" | 420 | "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" |
| 229 | "HHUptod\tSpecDst"); | 421 | "HHUptod\tSpecDst"); |
| 422 | else { | ||
| 423 | struct rtable *r = v; | ||
| 424 | struct neighbour *n; | ||
| 425 | int len, HHUptod; | ||
| 426 | |||
| 427 | rcu_read_lock(); | ||
| 428 | n = dst_get_neighbour(&r->dst); | ||
| 429 | HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; | ||
| 430 | rcu_read_unlock(); | ||
| 431 | |||
| 432 | seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" | ||
| 433 | "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", | ||
| 434 | r->dst.dev ? r->dst.dev->name : "*", | ||
| 435 | (__force u32)r->rt_dst, | ||
| 436 | (__force u32)r->rt_gateway, | ||
| 437 | r->rt_flags, atomic_read(&r->dst.__refcnt), | ||
| 438 | r->dst.__use, 0, (__force u32)r->rt_src, | ||
| 439 | dst_metric_advmss(&r->dst) + 40, | ||
| 440 | dst_metric(&r->dst, RTAX_WINDOW), | ||
| 441 | (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + | ||
| 442 | dst_metric(&r->dst, RTAX_RTTVAR)), | ||
| 443 | r->rt_key_tos, | ||
| 444 | -1, | ||
| 445 | HHUptod, | ||
| 446 | r->rt_spec_dst, &len); | ||
| 447 | |||
| 448 | seq_printf(seq, "%*s\n", 127 - len, ""); | ||
| 449 | } | ||
| 230 | return 0; | 450 | return 0; |
| 231 | } | 451 | } |
| 232 | 452 | ||
| @@ -239,7 +459,8 @@ static const struct seq_operations rt_cache_seq_ops = { | |||
| 239 | 459 | ||
| 240 | static int rt_cache_seq_open(struct inode *inode, struct file *file) | 460 | static int rt_cache_seq_open(struct inode *inode, struct file *file) |
| 241 | { | 461 | { |
| 242 | return seq_open(file, &rt_cache_seq_ops); | 462 | return seq_open_net(inode, file, &rt_cache_seq_ops, |
| 463 | sizeof(struct rt_cache_iter_state)); | ||
| 243 | } | 464 | } |
| 244 | 465 | ||
| 245 | static const struct file_operations rt_cache_seq_fops = { | 466 | static const struct file_operations rt_cache_seq_fops = { |
| @@ -247,7 +468,7 @@ static const struct file_operations rt_cache_seq_fops = { | |||
| 247 | .open = rt_cache_seq_open, | 468 | .open = rt_cache_seq_open, |
| 248 | .read = seq_read, | 469 | .read = seq_read, |
| 249 | .llseek = seq_lseek, | 470 | .llseek = seq_lseek, |
| 250 | .release = seq_release, | 471 | .release = seq_release_net, |
| 251 | }; | 472 | }; |
| 252 | 473 | ||
| 253 | 474 | ||
| @@ -437,252 +658,791 @@ static inline int ip_rt_proc_init(void) | |||
| 437 | } | 658 | } |
| 438 | #endif /* CONFIG_PROC_FS */ | 659 | #endif /* CONFIG_PROC_FS */ |
| 439 | 660 | ||
| 440 | static inline bool rt_is_expired(const struct rtable *rth) | 661 | static inline void rt_free(struct rtable *rt) |
| 441 | { | 662 | { |
| 442 | return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); | 663 | call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); |
| 443 | } | 664 | } |
| 444 | 665 | ||
| 445 | void rt_cache_flush(struct net *net) | 666 | static inline void rt_drop(struct rtable *rt) |
| 446 | { | 667 | { |
| 447 | rt_genid_bump(net); | 668 | ip_rt_put(rt); |
| 669 | call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); | ||
| 448 | } | 670 | } |
| 449 | 671 | ||
| 450 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, | 672 | static inline int rt_fast_clean(struct rtable *rth) |
| 451 | struct sk_buff *skb, | ||
| 452 | const void *daddr) | ||
| 453 | { | 673 | { |
| 454 | struct net_device *dev = dst->dev; | 674 | /* Kill broadcast/multicast entries very aggresively, if they |
| 455 | const __be32 *pkey = daddr; | 675 | collide in hash table with more useful entries */ |
| 456 | const struct rtable *rt; | 676 | return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && |
| 457 | struct neighbour *n; | 677 | rt_is_input_route(rth) && rth->dst.rt_next; |
| 678 | } | ||
| 458 | 679 | ||
| 459 | rt = (const struct rtable *) dst; | 680 | static inline int rt_valuable(struct rtable *rth) |
| 460 | if (rt->rt_gateway) | 681 | { |
| 461 | pkey = (const __be32 *) &rt->rt_gateway; | 682 | return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || |
| 462 | else if (skb) | 683 | (rth->peer && rth->peer->pmtu_expires); |
| 463 | pkey = &ip_hdr(skb)->daddr; | 684 | } |
| 464 | 685 | ||
| 465 | n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); | 686 | static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) |
| 466 | if (n) | 687 | { |
| 467 | return n; | 688 | unsigned long age; |
| 468 | return neigh_create(&arp_tbl, pkey, dev); | 689 | int ret = 0; |
| 690 | |||
| 691 | if (atomic_read(&rth->dst.__refcnt)) | ||
| 692 | goto out; | ||
| 693 | |||
| 694 | age = jiffies - rth->dst.lastuse; | ||
| 695 | if ((age <= tmo1 && !rt_fast_clean(rth)) || | ||
| 696 | (age <= tmo2 && rt_valuable(rth))) | ||
| 697 | goto out; | ||
| 698 | ret = 1; | ||
| 699 | out: return ret; | ||
| 469 | } | 700 | } |
| 470 | 701 | ||
| 471 | /* | 702 | /* Bits of score are: |
| 472 | * Peer allocation may fail only in serious out-of-memory conditions. However | 703 | * 31: very valuable |
| 473 | * we still can generate some output. | 704 | * 30: not quite useless |
| 474 | * Random ID selection looks a bit dangerous because we have no chances to | 705 | * 29..0: usage counter |
| 475 | * select ID being unique in a reasonable period of time. | ||
| 476 | * But broken packet identifier may be better than no packet at all. | ||
| 477 | */ | 706 | */ |
| 478 | static void ip_select_fb_ident(struct iphdr *iph) | 707 | static inline u32 rt_score(struct rtable *rt) |
| 479 | { | 708 | { |
| 480 | static DEFINE_SPINLOCK(ip_fb_id_lock); | 709 | u32 score = jiffies - rt->dst.lastuse; |
| 481 | static u32 ip_fallback_id; | ||
| 482 | u32 salt; | ||
| 483 | 710 | ||
| 484 | spin_lock_bh(&ip_fb_id_lock); | 711 | score = ~score & ~(3<<30); |
| 485 | salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); | 712 | |
| 486 | iph->id = htons(salt & 0xFFFF); | 713 | if (rt_valuable(rt)) |
| 487 | ip_fallback_id = salt; | 714 | score |= (1<<31); |
| 488 | spin_unlock_bh(&ip_fb_id_lock); | 715 | |
| 716 | if (rt_is_output_route(rt) || | ||
| 717 | !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) | ||
| 718 | score |= (1<<30); | ||
| 719 | |||
| 720 | return score; | ||
| 489 | } | 721 | } |
| 490 | 722 | ||
| 491 | void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) | 723 | static inline bool rt_caching(const struct net *net) |
| 492 | { | 724 | { |
| 493 | struct net *net = dev_net(dst->dev); | 725 | return net->ipv4.current_rt_cache_rebuild_count <= |
| 494 | struct inet_peer *peer; | 726 | net->ipv4.sysctl_rt_cache_rebuild_count; |
| 727 | } | ||
| 495 | 728 | ||
| 496 | peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); | 729 | static inline bool compare_hash_inputs(const struct rtable *rt1, |
| 497 | if (peer) { | 730 | const struct rtable *rt2) |
| 498 | iph->id = htons(inet_getid(peer, more)); | 731 | { |
| 499 | inet_putpeer(peer); | 732 | return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | |
| 500 | return; | 733 | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | |
| 501 | } | 734 | (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); |
| 735 | } | ||
| 502 | 736 | ||
| 503 | ip_select_fb_ident(iph); | 737 | static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) |
| 738 | { | ||
| 739 | return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | | ||
| 740 | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | | ||
| 741 | (rt1->rt_mark ^ rt2->rt_mark) | | ||
| 742 | (rt1->rt_key_tos ^ rt2->rt_key_tos) | | ||
| 743 | (rt1->rt_route_iif ^ rt2->rt_route_iif) | | ||
| 744 | (rt1->rt_oif ^ rt2->rt_oif)) == 0; | ||
| 504 | } | 745 | } |
| 505 | EXPORT_SYMBOL(__ip_select_ident); | ||
| 506 | 746 | ||
| 507 | static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, | 747 | static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) |
| 508 | const struct iphdr *iph, | 748 | { |
| 509 | int oif, u8 tos, | 749 | return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); |
| 510 | u8 prot, u32 mark, int flow_flags) | 750 | } |
| 751 | |||
| 752 | static inline int rt_is_expired(struct rtable *rth) | ||
| 753 | { | ||
| 754 | return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); | ||
| 755 | } | ||
| 756 | |||
| 757 | /* | ||
| 758 | * Perform a full scan of hash table and free all entries. | ||
| 759 | * Can be called by a softirq or a process. | ||
| 760 | * In the later case, we want to be reschedule if necessary | ||
| 761 | */ | ||
| 762 | static void rt_do_flush(struct net *net, int process_context) | ||
| 763 | { | ||
| 764 | unsigned int i; | ||
| 765 | struct rtable *rth, *next; | ||
| 766 | |||
| 767 | for (i = 0; i <= rt_hash_mask; i++) { | ||
| 768 | struct rtable __rcu **pprev; | ||
| 769 | struct rtable *list; | ||
| 770 | |||
| 771 | if (process_context && need_resched()) | ||
| 772 | cond_resched(); | ||
| 773 | rth = rcu_dereference_raw(rt_hash_table[i].chain); | ||
| 774 | if (!rth) | ||
| 775 | continue; | ||
| 776 | |||
| 777 | spin_lock_bh(rt_hash_lock_addr(i)); | ||
| 778 | |||
| 779 | list = NULL; | ||
| 780 | pprev = &rt_hash_table[i].chain; | ||
| 781 | rth = rcu_dereference_protected(*pprev, | ||
| 782 | lockdep_is_held(rt_hash_lock_addr(i))); | ||
| 783 | |||
| 784 | while (rth) { | ||
| 785 | next = rcu_dereference_protected(rth->dst.rt_next, | ||
| 786 | lockdep_is_held(rt_hash_lock_addr(i))); | ||
| 787 | |||
| 788 | if (!net || | ||
| 789 | net_eq(dev_net(rth->dst.dev), net)) { | ||
| 790 | rcu_assign_pointer(*pprev, next); | ||
| 791 | rcu_assign_pointer(rth->dst.rt_next, list); | ||
| 792 | list = rth; | ||
| 793 | } else { | ||
| 794 | pprev = &rth->dst.rt_next; | ||
| 795 | } | ||
| 796 | rth = next; | ||
| 797 | } | ||
| 798 | |||
| 799 | spin_unlock_bh(rt_hash_lock_addr(i)); | ||
| 800 | |||
| 801 | for (; list; list = next) { | ||
| 802 | next = rcu_dereference_protected(list->dst.rt_next, 1); | ||
| 803 | rt_free(list); | ||
| 804 | } | ||
| 805 | } | ||
| 806 | } | ||
| 807 | |||
| 808 | /* | ||
| 809 | * While freeing expired entries, we compute average chain length | ||
| 810 | * and standard deviation, using fixed-point arithmetic. | ||
| 811 | * This to have an estimation of rt_chain_length_max | ||
| 812 | * rt_chain_length_max = max(elasticity, AVG + 4*SD) | ||
| 813 | * We use 3 bits for frational part, and 29 (or 61) for magnitude. | ||
| 814 | */ | ||
| 815 | |||
| 816 | #define FRACT_BITS 3 | ||
| 817 | #define ONE (1UL << FRACT_BITS) | ||
| 818 | |||
| 819 | /* | ||
| 820 | * Given a hash chain and an item in this hash chain, | ||
| 821 | * find if a previous entry has the same hash_inputs | ||
| 822 | * (but differs on tos, mark or oif) | ||
| 823 | * Returns 0 if an alias is found. | ||
| 824 | * Returns ONE if rth has no alias before itself. | ||
| 825 | */ | ||
| 826 | static int has_noalias(const struct rtable *head, const struct rtable *rth) | ||
| 511 | { | 827 | { |
| 512 | if (sk) { | 828 | const struct rtable *aux = head; |
| 513 | const struct inet_sock *inet = inet_sk(sk); | ||
| 514 | 829 | ||
| 515 | oif = sk->sk_bound_dev_if; | 830 | while (aux != rth) { |
| 516 | mark = sk->sk_mark; | 831 | if (compare_hash_inputs(aux, rth)) |
| 517 | tos = RT_CONN_FLAGS(sk); | 832 | return 0; |
| 518 | prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; | 833 | aux = rcu_dereference_protected(aux->dst.rt_next, 1); |
| 519 | } | 834 | } |
| 520 | flowi4_init_output(fl4, oif, mark, tos, | 835 | return ONE; |
| 521 | RT_SCOPE_UNIVERSE, prot, | ||
| 522 | flow_flags, | ||
| 523 | iph->daddr, iph->saddr, 0, 0); | ||
| 524 | } | 836 | } |
| 525 | 837 | ||
| 526 | static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, | 838 | static void rt_check_expire(void) |
| 527 | const struct sock *sk) | ||
| 528 | { | 839 | { |
| 529 | const struct iphdr *iph = ip_hdr(skb); | 840 | static unsigned int rover; |
| 530 | int oif = skb->dev->ifindex; | 841 | unsigned int i = rover, goal; |
| 531 | u8 tos = RT_TOS(iph->tos); | 842 | struct rtable *rth; |
| 532 | u8 prot = iph->protocol; | 843 | struct rtable __rcu **rthp; |
| 533 | u32 mark = skb->mark; | 844 | unsigned long samples = 0; |
| 845 | unsigned long sum = 0, sum2 = 0; | ||
| 846 | unsigned long delta; | ||
| 847 | u64 mult; | ||
| 848 | |||
| 849 | delta = jiffies - expires_ljiffies; | ||
| 850 | expires_ljiffies = jiffies; | ||
| 851 | mult = ((u64)delta) << rt_hash_log; | ||
| 852 | if (ip_rt_gc_timeout > 1) | ||
| 853 | do_div(mult, ip_rt_gc_timeout); | ||
| 854 | goal = (unsigned int)mult; | ||
| 855 | if (goal > rt_hash_mask) | ||
| 856 | goal = rt_hash_mask + 1; | ||
| 857 | for (; goal > 0; goal--) { | ||
| 858 | unsigned long tmo = ip_rt_gc_timeout; | ||
| 859 | unsigned long length; | ||
| 860 | |||
| 861 | i = (i + 1) & rt_hash_mask; | ||
| 862 | rthp = &rt_hash_table[i].chain; | ||
| 863 | |||
| 864 | if (need_resched()) | ||
| 865 | cond_resched(); | ||
| 866 | |||
| 867 | samples++; | ||
| 868 | |||
| 869 | if (rcu_dereference_raw(*rthp) == NULL) | ||
| 870 | continue; | ||
| 871 | length = 0; | ||
| 872 | spin_lock_bh(rt_hash_lock_addr(i)); | ||
| 873 | while ((rth = rcu_dereference_protected(*rthp, | ||
| 874 | lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { | ||
| 875 | prefetch(rth->dst.rt_next); | ||
| 876 | if (rt_is_expired(rth)) { | ||
| 877 | *rthp = rth->dst.rt_next; | ||
| 878 | rt_free(rth); | ||
| 879 | continue; | ||
| 880 | } | ||
| 881 | if (rth->dst.expires) { | ||
| 882 | /* Entry is expired even if it is in use */ | ||
| 883 | if (time_before_eq(jiffies, rth->dst.expires)) { | ||
| 884 | nofree: | ||
| 885 | tmo >>= 1; | ||
| 886 | rthp = &rth->dst.rt_next; | ||
| 887 | /* | ||
| 888 | * We only count entries on | ||
| 889 | * a chain with equal hash inputs once | ||
| 890 | * so that entries for different QOS | ||
| 891 | * levels, and other non-hash input | ||
| 892 | * attributes don't unfairly skew | ||
| 893 | * the length computation | ||
| 894 | */ | ||
| 895 | length += has_noalias(rt_hash_table[i].chain, rth); | ||
| 896 | continue; | ||
| 897 | } | ||
| 898 | } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) | ||
| 899 | goto nofree; | ||
| 900 | |||
| 901 | /* Cleanup aged off entries. */ | ||
| 902 | *rthp = rth->dst.rt_next; | ||
| 903 | rt_free(rth); | ||
| 904 | } | ||
| 905 | spin_unlock_bh(rt_hash_lock_addr(i)); | ||
| 906 | sum += length; | ||
| 907 | sum2 += length*length; | ||
| 908 | } | ||
| 909 | if (samples) { | ||
| 910 | unsigned long avg = sum / samples; | ||
| 911 | unsigned long sd = int_sqrt(sum2 / samples - avg*avg); | ||
| 912 | rt_chain_length_max = max_t(unsigned long, | ||
| 913 | ip_rt_gc_elasticity, | ||
| 914 | (avg + 4*sd) >> FRACT_BITS); | ||
| 915 | } | ||
| 916 | rover = i; | ||
| 917 | } | ||
| 534 | 918 | ||
| 535 | __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); | 919 | /* |
| 920 | * rt_worker_func() is run in process context. | ||
| 921 | * we call rt_check_expire() to scan part of the hash table | ||
| 922 | */ | ||
| 923 | static void rt_worker_func(struct work_struct *work) | ||
| 924 | { | ||
| 925 | rt_check_expire(); | ||
| 926 | schedule_delayed_work(&expires_work, ip_rt_gc_interval); | ||
| 536 | } | 927 | } |
| 537 | 928 | ||
| 538 | static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) | 929 | /* |
| 930 | * Perturbation of rt_genid by a small quantity [1..256] | ||
| 931 | * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() | ||
| 932 | * many times (2^24) without giving recent rt_genid. | ||
| 933 | * Jenkins hash is strong enough that litle changes of rt_genid are OK. | ||
| 934 | */ | ||
| 935 | static void rt_cache_invalidate(struct net *net) | ||
| 539 | { | 936 | { |
| 540 | const struct inet_sock *inet = inet_sk(sk); | 937 | unsigned char shuffle; |
| 541 | const struct ip_options_rcu *inet_opt; | ||
| 542 | __be32 daddr = inet->inet_daddr; | ||
| 543 | 938 | ||
| 544 | rcu_read_lock(); | 939 | get_random_bytes(&shuffle, sizeof(shuffle)); |
| 545 | inet_opt = rcu_dereference(inet->inet_opt); | 940 | atomic_add(shuffle + 1U, &net->ipv4.rt_genid); |
| 546 | if (inet_opt && inet_opt->opt.srr) | 941 | redirect_genid++; |
| 547 | daddr = inet_opt->opt.faddr; | 942 | } |
| 548 | flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, | 943 | |
| 549 | RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, | 944 | /* |
| 550 | inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, | 945 | * delay < 0 : invalidate cache (fast : entries will be deleted later) |
| 551 | inet_sk_flowi_flags(sk), | 946 | * delay >= 0 : invalidate & flush cache (can be long) |
| 552 | daddr, inet->inet_saddr, 0, 0); | 947 | */ |
| 553 | rcu_read_unlock(); | 948 | void rt_cache_flush(struct net *net, int delay) |
| 949 | { | ||
| 950 | rt_cache_invalidate(net); | ||
| 951 | if (delay >= 0) | ||
| 952 | rt_do_flush(net, !in_softirq()); | ||
| 554 | } | 953 | } |
| 555 | 954 | ||
| 556 | static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, | 955 | /* Flush previous cache invalidated entries from the cache */ |
| 557 | const struct sk_buff *skb) | 956 | void rt_cache_flush_batch(struct net *net) |
| 558 | { | 957 | { |
| 559 | if (skb) | 958 | rt_do_flush(net, !in_softirq()); |
| 560 | build_skb_flow_key(fl4, skb, sk); | ||
| 561 | else | ||
| 562 | build_sk_flow_key(fl4, sk); | ||
| 563 | } | 959 | } |
| 564 | 960 | ||
| 565 | static inline void rt_free(struct rtable *rt) | 961 | static void rt_emergency_hash_rebuild(struct net *net) |
| 566 | { | 962 | { |
| 567 | call_rcu(&rt->dst.rcu_head, dst_rcu_free); | 963 | if (net_ratelimit()) |
| 964 | printk(KERN_WARNING "Route hash chain too long!\n"); | ||
| 965 | rt_cache_invalidate(net); | ||
| 568 | } | 966 | } |
| 569 | 967 | ||
| 570 | static DEFINE_SPINLOCK(fnhe_lock); | 968 | /* |
| 969 | Short description of GC goals. | ||
| 970 | |||
| 971 | We want to build algorithm, which will keep routing cache | ||
| 972 | at some equilibrium point, when number of aged off entries | ||
| 973 | is kept approximately equal to newly generated ones. | ||
| 974 | |||
| 975 | Current expiration strength is variable "expire". | ||
| 976 | We try to adjust it dynamically, so that if networking | ||
| 977 | is idle expires is large enough to keep enough of warm entries, | ||
| 978 | and when load increases it reduces to limit cache size. | ||
| 979 | */ | ||
| 571 | 980 | ||
| 572 | static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) | 981 | static int rt_garbage_collect(struct dst_ops *ops) |
| 573 | { | 982 | { |
| 574 | struct fib_nh_exception *fnhe, *oldest; | 983 | static unsigned long expire = RT_GC_TIMEOUT; |
| 575 | struct rtable *orig; | 984 | static unsigned long last_gc; |
| 985 | static int rover; | ||
| 986 | static int equilibrium; | ||
| 987 | struct rtable *rth; | ||
| 988 | struct rtable __rcu **rthp; | ||
| 989 | unsigned long now = jiffies; | ||
| 990 | int goal; | ||
| 991 | int entries = dst_entries_get_fast(&ipv4_dst_ops); | ||
| 992 | |||
| 993 | /* | ||
| 994 | * Garbage collection is pretty expensive, | ||
| 995 | * do not make it too frequently. | ||
| 996 | */ | ||
| 997 | |||
| 998 | RT_CACHE_STAT_INC(gc_total); | ||
| 999 | |||
| 1000 | if (now - last_gc < ip_rt_gc_min_interval && | ||
| 1001 | entries < ip_rt_max_size) { | ||
| 1002 | RT_CACHE_STAT_INC(gc_ignored); | ||
| 1003 | goto out; | ||
| 1004 | } | ||
| 1005 | |||
| 1006 | entries = dst_entries_get_slow(&ipv4_dst_ops); | ||
| 1007 | /* Calculate number of entries, which we want to expire now. */ | ||
| 1008 | goal = entries - (ip_rt_gc_elasticity << rt_hash_log); | ||
| 1009 | if (goal <= 0) { | ||
| 1010 | if (equilibrium < ipv4_dst_ops.gc_thresh) | ||
| 1011 | equilibrium = ipv4_dst_ops.gc_thresh; | ||
| 1012 | goal = entries - equilibrium; | ||
| 1013 | if (goal > 0) { | ||
| 1014 | equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); | ||
| 1015 | goal = entries - equilibrium; | ||
| 1016 | } | ||
| 1017 | } else { | ||
| 1018 | /* We are in dangerous area. Try to reduce cache really | ||
| 1019 | * aggressively. | ||
| 1020 | */ | ||
| 1021 | goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); | ||
| 1022 | equilibrium = entries - goal; | ||
| 1023 | } | ||
| 576 | 1024 | ||
| 577 | oldest = rcu_dereference(hash->chain); | 1025 | if (now - last_gc >= ip_rt_gc_min_interval) |
| 578 | for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; | 1026 | last_gc = now; |
| 579 | fnhe = rcu_dereference(fnhe->fnhe_next)) { | 1027 | |
| 580 | if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) | 1028 | if (goal <= 0) { |
| 581 | oldest = fnhe; | 1029 | equilibrium += goal; |
| 1030 | goto work_done; | ||
| 582 | } | 1031 | } |
| 583 | orig = rcu_dereference(oldest->fnhe_rth); | 1032 | |
| 584 | if (orig) { | 1033 | do { |
| 585 | RCU_INIT_POINTER(oldest->fnhe_rth, NULL); | 1034 | int i, k; |
| 586 | rt_free(orig); | 1035 | |
| 1036 | for (i = rt_hash_mask, k = rover; i >= 0; i--) { | ||
| 1037 | unsigned long tmo = expire; | ||
| 1038 | |||
| 1039 | k = (k + 1) & rt_hash_mask; | ||
| 1040 | rthp = &rt_hash_table[k].chain; | ||
| 1041 | spin_lock_bh(rt_hash_lock_addr(k)); | ||
| 1042 | while ((rth = rcu_dereference_protected(*rthp, | ||
| 1043 | lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { | ||
| 1044 | if (!rt_is_expired(rth) && | ||
| 1045 | !rt_may_expire(rth, tmo, expire)) { | ||
| 1046 | tmo >>= 1; | ||
| 1047 | rthp = &rth->dst.rt_next; | ||
| 1048 | continue; | ||
| 1049 | } | ||
| 1050 | *rthp = rth->dst.rt_next; | ||
| 1051 | rt_free(rth); | ||
| 1052 | goal--; | ||
| 1053 | } | ||
| 1054 | spin_unlock_bh(rt_hash_lock_addr(k)); | ||
| 1055 | if (goal <= 0) | ||
| 1056 | break; | ||
| 1057 | } | ||
| 1058 | rover = k; | ||
| 1059 | |||
| 1060 | if (goal <= 0) | ||
| 1061 | goto work_done; | ||
| 1062 | |||
| 1063 | /* Goal is not achieved. We stop process if: | ||
| 1064 | |||
| 1065 | - if expire reduced to zero. Otherwise, expire is halfed. | ||
| 1066 | - if table is not full. | ||
| 1067 | - if we are called from interrupt. | ||
| 1068 | - jiffies check is just fallback/debug loop breaker. | ||
| 1069 | We will not spin here for long time in any case. | ||
| 1070 | */ | ||
| 1071 | |||
| 1072 | RT_CACHE_STAT_INC(gc_goal_miss); | ||
| 1073 | |||
| 1074 | if (expire == 0) | ||
| 1075 | break; | ||
| 1076 | |||
| 1077 | expire >>= 1; | ||
| 1078 | |||
| 1079 | if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) | ||
| 1080 | goto out; | ||
| 1081 | } while (!in_softirq() && time_before_eq(jiffies, now)); | ||
| 1082 | |||
| 1083 | if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) | ||
| 1084 | goto out; | ||
| 1085 | if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) | ||
| 1086 | goto out; | ||
| 1087 | if (net_ratelimit()) | ||
| 1088 | printk(KERN_WARNING "dst cache overflow\n"); | ||
| 1089 | RT_CACHE_STAT_INC(gc_dst_overflow); | ||
| 1090 | return 1; | ||
| 1091 | |||
| 1092 | work_done: | ||
| 1093 | expire += ip_rt_gc_min_interval; | ||
| 1094 | if (expire > ip_rt_gc_timeout || | ||
| 1095 | dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || | ||
| 1096 | dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) | ||
| 1097 | expire = ip_rt_gc_timeout; | ||
| 1098 | out: return 0; | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | /* | ||
| 1102 | * Returns number of entries in a hash chain that have different hash_inputs | ||
| 1103 | */ | ||
| 1104 | static int slow_chain_length(const struct rtable *head) | ||
| 1105 | { | ||
| 1106 | int length = 0; | ||
| 1107 | const struct rtable *rth = head; | ||
| 1108 | |||
| 1109 | while (rth) { | ||
| 1110 | length += has_noalias(head, rth); | ||
| 1111 | rth = rcu_dereference_protected(rth->dst.rt_next, 1); | ||
| 587 | } | 1112 | } |
| 588 | return oldest; | 1113 | return length >> FRACT_BITS; |
| 589 | } | 1114 | } |
| 590 | 1115 | ||
| 591 | static inline u32 fnhe_hashfun(__be32 daddr) | 1116 | static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) |
| 592 | { | 1117 | { |
| 593 | u32 hval; | 1118 | struct neigh_table *tbl = &arp_tbl; |
| 1119 | static const __be32 inaddr_any = 0; | ||
| 1120 | struct net_device *dev = dst->dev; | ||
| 1121 | const __be32 *pkey = daddr; | ||
| 1122 | struct neighbour *n; | ||
| 594 | 1123 | ||
| 595 | hval = (__force u32) daddr; | 1124 | #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) |
| 596 | hval ^= (hval >> 11) ^ (hval >> 22); | 1125 | if (dev->type == ARPHRD_ATM) |
| 1126 | tbl = clip_tbl_hook; | ||
| 1127 | #endif | ||
| 1128 | if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) | ||
| 1129 | pkey = &inaddr_any; | ||
| 597 | 1130 | ||
| 598 | return hval & (FNHE_HASH_SIZE - 1); | 1131 | n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey); |
| 1132 | if (n) | ||
| 1133 | return n; | ||
| 1134 | return neigh_create(tbl, pkey, dev); | ||
| 599 | } | 1135 | } |
| 600 | 1136 | ||
| 601 | static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, | 1137 | static int rt_bind_neighbour(struct rtable *rt) |
| 602 | u32 pmtu, unsigned long expires) | ||
| 603 | { | 1138 | { |
| 604 | struct fnhe_hash_bucket *hash; | 1139 | struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); |
| 605 | struct fib_nh_exception *fnhe; | 1140 | if (IS_ERR(n)) |
| 606 | int depth; | 1141 | return PTR_ERR(n); |
| 607 | u32 hval = fnhe_hashfun(daddr); | 1142 | dst_set_neighbour(&rt->dst, n); |
| 608 | 1143 | ||
| 609 | spin_lock_bh(&fnhe_lock); | 1144 | return 0; |
| 1145 | } | ||
| 1146 | |||
| 1147 | static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, | ||
| 1148 | struct sk_buff *skb, int ifindex) | ||
| 1149 | { | ||
| 1150 | struct rtable *rth, *cand; | ||
| 1151 | struct rtable __rcu **rthp, **candp; | ||
| 1152 | unsigned long now; | ||
| 1153 | u32 min_score; | ||
| 1154 | int chain_length; | ||
| 1155 | int attempts = !in_softirq(); | ||
| 610 | 1156 | ||
| 611 | hash = nh->nh_exceptions; | 1157 | restart: |
| 612 | if (!hash) { | 1158 | chain_length = 0; |
| 613 | hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); | 1159 | min_score = ~(u32)0; |
| 614 | if (!hash) | 1160 | cand = NULL; |
| 615 | goto out_unlock; | 1161 | candp = NULL; |
| 616 | nh->nh_exceptions = hash; | 1162 | now = jiffies; |
| 1163 | |||
| 1164 | if (!rt_caching(dev_net(rt->dst.dev))) { | ||
| 1165 | /* | ||
| 1166 | * If we're not caching, just tell the caller we | ||
| 1167 | * were successful and don't touch the route. The | ||
| 1168 | * caller hold the sole reference to the cache entry, and | ||
| 1169 | * it will be released when the caller is done with it. | ||
| 1170 | * If we drop it here, the callers have no way to resolve routes | ||
| 1171 | * when we're not caching. Instead, just point *rp at rt, so | ||
| 1172 | * the caller gets a single use out of the route | ||
| 1173 | * Note that we do rt_free on this new route entry, so that | ||
| 1174 | * once its refcount hits zero, we are still able to reap it | ||
| 1175 | * (Thanks Alexey) | ||
| 1176 | * Note: To avoid expensive rcu stuff for this uncached dst, | ||
| 1177 | * we set DST_NOCACHE so that dst_release() can free dst without | ||
| 1178 | * waiting a grace period. | ||
| 1179 | */ | ||
| 1180 | |||
| 1181 | rt->dst.flags |= DST_NOCACHE; | ||
| 1182 | if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { | ||
| 1183 | int err = rt_bind_neighbour(rt); | ||
| 1184 | if (err) { | ||
| 1185 | if (net_ratelimit()) | ||
| 1186 | printk(KERN_WARNING | ||
| 1187 | "Neighbour table failure & not caching routes.\n"); | ||
| 1188 | ip_rt_put(rt); | ||
| 1189 | return ERR_PTR(err); | ||
| 1190 | } | ||
| 1191 | } | ||
| 1192 | |||
| 1193 | goto skip_hashing; | ||
| 617 | } | 1194 | } |
| 618 | 1195 | ||
| 619 | hash += hval; | 1196 | rthp = &rt_hash_table[hash].chain; |
| 620 | 1197 | ||
| 621 | depth = 0; | 1198 | spin_lock_bh(rt_hash_lock_addr(hash)); |
| 622 | for (fnhe = rcu_dereference(hash->chain); fnhe; | 1199 | while ((rth = rcu_dereference_protected(*rthp, |
| 623 | fnhe = rcu_dereference(fnhe->fnhe_next)) { | 1200 | lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { |
| 624 | if (fnhe->fnhe_daddr == daddr) | 1201 | if (rt_is_expired(rth)) { |
| 625 | break; | 1202 | *rthp = rth->dst.rt_next; |
| 626 | depth++; | 1203 | rt_free(rth); |
| 1204 | continue; | ||
| 1205 | } | ||
| 1206 | if (compare_keys(rth, rt) && compare_netns(rth, rt)) { | ||
| 1207 | /* Put it first */ | ||
| 1208 | *rthp = rth->dst.rt_next; | ||
| 1209 | /* | ||
| 1210 | * Since lookup is lockfree, the deletion | ||
| 1211 | * must be visible to another weakly ordered CPU before | ||
| 1212 | * the insertion at the start of the hash chain. | ||
| 1213 | */ | ||
| 1214 | rcu_assign_pointer(rth->dst.rt_next, | ||
| 1215 | rt_hash_table[hash].chain); | ||
| 1216 | /* | ||
| 1217 | * Since lookup is lockfree, the update writes | ||
| 1218 | * must be ordered for consistency on SMP. | ||
| 1219 | */ | ||
| 1220 | rcu_assign_pointer(rt_hash_table[hash].chain, rth); | ||
| 1221 | |||
| 1222 | dst_use(&rth->dst, now); | ||
| 1223 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
| 1224 | |||
| 1225 | rt_drop(rt); | ||
| 1226 | if (skb) | ||
| 1227 | skb_dst_set(skb, &rth->dst); | ||
| 1228 | return rth; | ||
| 1229 | } | ||
| 1230 | |||
| 1231 | if (!atomic_read(&rth->dst.__refcnt)) { | ||
| 1232 | u32 score = rt_score(rth); | ||
| 1233 | |||
| 1234 | if (score <= min_score) { | ||
| 1235 | cand = rth; | ||
| 1236 | candp = rthp; | ||
| 1237 | min_score = score; | ||
| 1238 | } | ||
| 1239 | } | ||
| 1240 | |||
| 1241 | chain_length++; | ||
| 1242 | |||
| 1243 | rthp = &rth->dst.rt_next; | ||
| 627 | } | 1244 | } |
| 628 | 1245 | ||
| 629 | if (fnhe) { | 1246 | if (cand) { |
| 630 | if (gw) | 1247 | /* ip_rt_gc_elasticity used to be average length of chain |
| 631 | fnhe->fnhe_gw = gw; | 1248 | * length, when exceeded gc becomes really aggressive. |
| 632 | if (pmtu) { | 1249 | * |
| 633 | fnhe->fnhe_pmtu = pmtu; | 1250 | * The second limit is less certain. At the moment it allows |
| 634 | fnhe->fnhe_expires = expires; | 1251 | * only 2 entries per bucket. We will see. |
| 1252 | */ | ||
| 1253 | if (chain_length > ip_rt_gc_elasticity) { | ||
| 1254 | *candp = cand->dst.rt_next; | ||
| 1255 | rt_free(cand); | ||
| 635 | } | 1256 | } |
| 636 | } else { | 1257 | } else { |
| 637 | if (depth > FNHE_RECLAIM_DEPTH) | 1258 | if (chain_length > rt_chain_length_max && |
| 638 | fnhe = fnhe_oldest(hash); | 1259 | slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { |
| 639 | else { | 1260 | struct net *net = dev_net(rt->dst.dev); |
| 640 | fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); | 1261 | int num = ++net->ipv4.current_rt_cache_rebuild_count; |
| 641 | if (!fnhe) | 1262 | if (!rt_caching(net)) { |
| 642 | goto out_unlock; | 1263 | printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", |
| 643 | 1264 | rt->dst.dev->name, num); | |
| 644 | fnhe->fnhe_next = hash->chain; | 1265 | } |
| 645 | rcu_assign_pointer(hash->chain, fnhe); | 1266 | rt_emergency_hash_rebuild(net); |
| 1267 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
| 1268 | |||
| 1269 | hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, | ||
| 1270 | ifindex, rt_genid(net)); | ||
| 1271 | goto restart; | ||
| 646 | } | 1272 | } |
| 647 | fnhe->fnhe_daddr = daddr; | ||
| 648 | fnhe->fnhe_gw = gw; | ||
| 649 | fnhe->fnhe_pmtu = pmtu; | ||
| 650 | fnhe->fnhe_expires = expires; | ||
| 651 | } | 1273 | } |
| 652 | 1274 | ||
| 653 | fnhe->fnhe_stamp = jiffies; | 1275 | /* Try to bind route to arp only if it is output |
| 1276 | route or unicast forwarding path. | ||
| 1277 | */ | ||
| 1278 | if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { | ||
| 1279 | int err = rt_bind_neighbour(rt); | ||
| 1280 | if (err) { | ||
| 1281 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
| 1282 | |||
| 1283 | if (err != -ENOBUFS) { | ||
| 1284 | rt_drop(rt); | ||
| 1285 | return ERR_PTR(err); | ||
| 1286 | } | ||
| 654 | 1287 | ||
| 655 | out_unlock: | 1288 | /* Neighbour tables are full and nothing |
| 656 | spin_unlock_bh(&fnhe_lock); | 1289 | can be released. Try to shrink route cache, |
| 657 | return; | 1290 | it is most likely it holds some neighbour records. |
| 1291 | */ | ||
| 1292 | if (attempts-- > 0) { | ||
| 1293 | int saved_elasticity = ip_rt_gc_elasticity; | ||
| 1294 | int saved_int = ip_rt_gc_min_interval; | ||
| 1295 | ip_rt_gc_elasticity = 1; | ||
| 1296 | ip_rt_gc_min_interval = 0; | ||
| 1297 | rt_garbage_collect(&ipv4_dst_ops); | ||
| 1298 | ip_rt_gc_min_interval = saved_int; | ||
| 1299 | ip_rt_gc_elasticity = saved_elasticity; | ||
| 1300 | goto restart; | ||
| 1301 | } | ||
| 1302 | |||
| 1303 | if (net_ratelimit()) | ||
| 1304 | printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); | ||
| 1305 | rt_drop(rt); | ||
| 1306 | return ERR_PTR(-ENOBUFS); | ||
| 1307 | } | ||
| 1308 | } | ||
| 1309 | |||
| 1310 | rt->dst.rt_next = rt_hash_table[hash].chain; | ||
| 1311 | |||
| 1312 | /* | ||
| 1313 | * Since lookup is lockfree, we must make sure | ||
| 1314 | * previous writes to rt are committed to memory | ||
| 1315 | * before making rt visible to other CPUS. | ||
| 1316 | */ | ||
| 1317 | rcu_assign_pointer(rt_hash_table[hash].chain, rt); | ||
| 1318 | |||
| 1319 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
| 1320 | |||
| 1321 | skip_hashing: | ||
| 1322 | if (skb) | ||
| 1323 | skb_dst_set(skb, &rt->dst); | ||
| 1324 | return rt; | ||
| 658 | } | 1325 | } |
| 659 | 1326 | ||
| 660 | static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, | 1327 | static atomic_t __rt_peer_genid = ATOMIC_INIT(0); |
| 661 | bool kill_route) | 1328 | |
| 1329 | static u32 rt_peer_genid(void) | ||
| 662 | { | 1330 | { |
| 663 | __be32 new_gw = icmp_hdr(skb)->un.gateway; | 1331 | return atomic_read(&__rt_peer_genid); |
| 664 | __be32 old_gw = ip_hdr(skb)->saddr; | 1332 | } |
| 665 | struct net_device *dev = skb->dev; | ||
| 666 | struct in_device *in_dev; | ||
| 667 | struct fib_result res; | ||
| 668 | struct neighbour *n; | ||
| 669 | struct net *net; | ||
| 670 | 1333 | ||
| 671 | switch (icmp_hdr(skb)->code & 7) { | 1334 | void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) |
| 672 | case ICMP_REDIR_NET: | 1335 | { |
| 673 | case ICMP_REDIR_NETTOS: | 1336 | struct inet_peer *peer; |
| 674 | case ICMP_REDIR_HOST: | ||
| 675 | case ICMP_REDIR_HOSTTOS: | ||
| 676 | break; | ||
| 677 | 1337 | ||
| 678 | default: | 1338 | peer = inet_getpeer_v4(daddr, create); |
| 679 | return; | 1339 | |
| 1340 | if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) | ||
| 1341 | inet_putpeer(peer); | ||
| 1342 | else | ||
| 1343 | rt->rt_peer_genid = rt_peer_genid(); | ||
| 1344 | } | ||
| 1345 | |||
| 1346 | /* | ||
| 1347 | * Peer allocation may fail only in serious out-of-memory conditions. However | ||
| 1348 | * we still can generate some output. | ||
| 1349 | * Random ID selection looks a bit dangerous because we have no chances to | ||
| 1350 | * select ID being unique in a reasonable period of time. | ||
| 1351 | * But broken packet identifier may be better than no packet at all. | ||
| 1352 | */ | ||
| 1353 | static void ip_select_fb_ident(struct iphdr *iph) | ||
| 1354 | { | ||
| 1355 | static DEFINE_SPINLOCK(ip_fb_id_lock); | ||
| 1356 | static u32 ip_fallback_id; | ||
| 1357 | u32 salt; | ||
| 1358 | |||
| 1359 | spin_lock_bh(&ip_fb_id_lock); | ||
| 1360 | salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); | ||
| 1361 | iph->id = htons(salt & 0xFFFF); | ||
| 1362 | ip_fallback_id = salt; | ||
| 1363 | spin_unlock_bh(&ip_fb_id_lock); | ||
| 1364 | } | ||
| 1365 | |||
| 1366 | void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) | ||
| 1367 | { | ||
| 1368 | struct rtable *rt = (struct rtable *) dst; | ||
| 1369 | |||
| 1370 | if (rt && !(rt->dst.flags & DST_NOPEER)) { | ||
| 1371 | if (rt->peer == NULL) | ||
| 1372 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
| 1373 | |||
| 1374 | /* If peer is attached to destination, it is never detached, | ||
| 1375 | so that we need not to grab a lock to dereference it. | ||
| 1376 | */ | ||
| 1377 | if (rt->peer) { | ||
| 1378 | iph->id = htons(inet_getid(rt->peer, more)); | ||
| 1379 | return; | ||
| 1380 | } | ||
| 1381 | } else if (!rt) | ||
| 1382 | printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", | ||
| 1383 | __builtin_return_address(0)); | ||
| 1384 | |||
| 1385 | ip_select_fb_ident(iph); | ||
| 1386 | } | ||
| 1387 | EXPORT_SYMBOL(__ip_select_ident); | ||
| 1388 | |||
| 1389 | static void rt_del(unsigned hash, struct rtable *rt) | ||
| 1390 | { | ||
| 1391 | struct rtable __rcu **rthp; | ||
| 1392 | struct rtable *aux; | ||
| 1393 | |||
| 1394 | rthp = &rt_hash_table[hash].chain; | ||
| 1395 | spin_lock_bh(rt_hash_lock_addr(hash)); | ||
| 1396 | ip_rt_put(rt); | ||
| 1397 | while ((aux = rcu_dereference_protected(*rthp, | ||
| 1398 | lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { | ||
| 1399 | if (aux == rt || rt_is_expired(aux)) { | ||
| 1400 | *rthp = aux->dst.rt_next; | ||
| 1401 | rt_free(aux); | ||
| 1402 | continue; | ||
| 1403 | } | ||
| 1404 | rthp = &aux->dst.rt_next; | ||
| 680 | } | 1405 | } |
| 1406 | spin_unlock_bh(rt_hash_lock_addr(hash)); | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) | ||
| 1410 | { | ||
| 1411 | struct rtable *rt = (struct rtable *) dst; | ||
| 1412 | __be32 orig_gw = rt->rt_gateway; | ||
| 1413 | struct neighbour *n, *old_n; | ||
| 1414 | |||
| 1415 | dst_confirm(&rt->dst); | ||
| 681 | 1416 | ||
| 682 | if (rt->rt_gateway != old_gw) | 1417 | rt->rt_gateway = peer->redirect_learned.a4; |
| 1418 | |||
| 1419 | n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); | ||
| 1420 | if (IS_ERR(n)) { | ||
| 1421 | rt->rt_gateway = orig_gw; | ||
| 683 | return; | 1422 | return; |
| 1423 | } | ||
| 1424 | old_n = xchg(&rt->dst._neighbour, n); | ||
| 1425 | if (old_n) | ||
| 1426 | neigh_release(old_n); | ||
| 1427 | if (!(n->nud_state & NUD_VALID)) { | ||
| 1428 | neigh_event_send(n, NULL); | ||
| 1429 | } else { | ||
| 1430 | rt->rt_flags |= RTCF_REDIRECTED; | ||
| 1431 | call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); | ||
| 1432 | } | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | /* called in rcu_read_lock() section */ | ||
| 1436 | void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, | ||
| 1437 | __be32 saddr, struct net_device *dev) | ||
| 1438 | { | ||
| 1439 | int s, i; | ||
| 1440 | struct in_device *in_dev = __in_dev_get_rcu(dev); | ||
| 1441 | __be32 skeys[2] = { saddr, 0 }; | ||
| 1442 | int ikeys[2] = { dev->ifindex, 0 }; | ||
| 1443 | struct inet_peer *peer; | ||
| 1444 | struct net *net; | ||
| 684 | 1445 | ||
| 685 | in_dev = __in_dev_get_rcu(dev); | ||
| 686 | if (!in_dev) | 1446 | if (!in_dev) |
| 687 | return; | 1447 | return; |
| 688 | 1448 | ||
| @@ -702,50 +1462,74 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow | |||
| 702 | goto reject_redirect; | 1462 | goto reject_redirect; |
| 703 | } | 1463 | } |
| 704 | 1464 | ||
| 705 | n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); | 1465 | for (s = 0; s < 2; s++) { |
| 706 | if (n) { | 1466 | for (i = 0; i < 2; i++) { |
| 707 | if (!(n->nud_state & NUD_VALID)) { | 1467 | unsigned int hash; |
| 708 | neigh_event_send(n, NULL); | 1468 | struct rtable __rcu **rthp; |
| 709 | } else { | 1469 | struct rtable *rt; |
| 710 | if (fib_lookup(net, fl4, &res) == 0) { | 1470 | |
| 711 | struct fib_nh *nh = &FIB_RES_NH(res); | 1471 | hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); |
| 712 | 1472 | ||
| 713 | update_or_create_fnhe(nh, fl4->daddr, new_gw, | 1473 | rthp = &rt_hash_table[hash].chain; |
| 714 | 0, 0); | 1474 | |
| 1475 | while ((rt = rcu_dereference(*rthp)) != NULL) { | ||
| 1476 | rthp = &rt->dst.rt_next; | ||
| 1477 | |||
| 1478 | if (rt->rt_key_dst != daddr || | ||
| 1479 | rt->rt_key_src != skeys[s] || | ||
| 1480 | rt->rt_oif != ikeys[i] || | ||
| 1481 | rt_is_input_route(rt) || | ||
| 1482 | rt_is_expired(rt) || | ||
| 1483 | !net_eq(dev_net(rt->dst.dev), net) || | ||
| 1484 | rt->dst.error || | ||
| 1485 | rt->dst.dev != dev || | ||
| 1486 | rt->rt_gateway != old_gw) | ||
| 1487 | continue; | ||
| 1488 | |||
| 1489 | if (!rt->peer) | ||
| 1490 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
| 1491 | |||
| 1492 | peer = rt->peer; | ||
| 1493 | if (peer) { | ||
| 1494 | if (peer->redirect_learned.a4 != new_gw || | ||
| 1495 | peer->redirect_genid != redirect_genid) { | ||
| 1496 | peer->redirect_learned.a4 = new_gw; | ||
| 1497 | peer->redirect_genid = redirect_genid; | ||
| 1498 | atomic_inc(&__rt_peer_genid); | ||
| 1499 | } | ||
| 1500 | check_peer_redir(&rt->dst, peer); | ||
| 1501 | } | ||
| 715 | } | 1502 | } |
| 716 | if (kill_route) | ||
| 717 | rt->dst.obsolete = DST_OBSOLETE_KILL; | ||
| 718 | call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); | ||
| 719 | } | 1503 | } |
| 720 | neigh_release(n); | ||
| 721 | } | 1504 | } |
| 722 | return; | 1505 | return; |
| 723 | 1506 | ||
| 724 | reject_redirect: | 1507 | reject_redirect: |
| 725 | #ifdef CONFIG_IP_ROUTE_VERBOSE | 1508 | #ifdef CONFIG_IP_ROUTE_VERBOSE |
| 726 | if (IN_DEV_LOG_MARTIANS(in_dev)) { | 1509 | if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) |
| 727 | const struct iphdr *iph = (const struct iphdr *) skb->data; | 1510 | printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" |
| 728 | __be32 daddr = iph->daddr; | 1511 | " Advised path = %pI4 -> %pI4\n", |
| 729 | __be32 saddr = iph->saddr; | 1512 | &old_gw, dev->name, &new_gw, |
| 730 | 1513 | &saddr, &daddr); | |
| 731 | net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" | ||
| 732 | " Advised path = %pI4 -> %pI4\n", | ||
| 733 | &old_gw, dev->name, &new_gw, | ||
| 734 | &saddr, &daddr); | ||
| 735 | } | ||
| 736 | #endif | 1514 | #endif |
| 737 | ; | 1515 | ; |
| 738 | } | 1516 | } |
| 739 | 1517 | ||
| 740 | static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) | 1518 | static bool peer_pmtu_expired(struct inet_peer *peer) |
| 741 | { | 1519 | { |
| 742 | struct rtable *rt; | 1520 | unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); |
| 743 | struct flowi4 fl4; | ||
| 744 | 1521 | ||
| 745 | rt = (struct rtable *) dst; | 1522 | return orig && |
| 1523 | time_after_eq(jiffies, orig) && | ||
| 1524 | cmpxchg(&peer->pmtu_expires, orig, 0) == orig; | ||
| 1525 | } | ||
| 1526 | |||
| 1527 | static bool peer_pmtu_cleaned(struct inet_peer *peer) | ||
| 1528 | { | ||
| 1529 | unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); | ||
| 746 | 1530 | ||
| 747 | ip_rt_build_flow_key(&fl4, sk, skb); | 1531 | return orig && |
| 748 | __ip_do_redirect(rt, skb, &fl4, true); | 1532 | cmpxchg(&peer->pmtu_expires, orig, 0) == orig; |
| 749 | } | 1533 | } |
| 750 | 1534 | ||
| 751 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) | 1535 | static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) |
| @@ -757,10 +1541,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) | |||
| 757 | if (dst->obsolete > 0) { | 1541 | if (dst->obsolete > 0) { |
| 758 | ip_rt_put(rt); | 1542 | ip_rt_put(rt); |
| 759 | ret = NULL; | 1543 | ret = NULL; |
| 760 | } else if ((rt->rt_flags & RTCF_REDIRECTED) || | 1544 | } else if (rt->rt_flags & RTCF_REDIRECTED) { |
| 761 | rt->dst.expires) { | 1545 | unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, |
| 762 | ip_rt_put(rt); | 1546 | rt->rt_oif, |
| 1547 | rt_genid(dev_net(dst->dev))); | ||
| 1548 | rt_del(hash, rt); | ||
| 763 | ret = NULL; | 1549 | ret = NULL; |
| 1550 | } else if (rt->peer && peer_pmtu_expired(rt->peer)) { | ||
| 1551 | dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); | ||
| 764 | } | 1552 | } |
| 765 | } | 1553 | } |
| 766 | return ret; | 1554 | return ret; |
| @@ -787,7 +1575,6 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
| 787 | struct rtable *rt = skb_rtable(skb); | 1575 | struct rtable *rt = skb_rtable(skb); |
| 788 | struct in_device *in_dev; | 1576 | struct in_device *in_dev; |
| 789 | struct inet_peer *peer; | 1577 | struct inet_peer *peer; |
| 790 | struct net *net; | ||
| 791 | int log_martians; | 1578 | int log_martians; |
| 792 | 1579 | ||
| 793 | rcu_read_lock(); | 1580 | rcu_read_lock(); |
| @@ -799,11 +1586,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
| 799 | log_martians = IN_DEV_LOG_MARTIANS(in_dev); | 1586 | log_martians = IN_DEV_LOG_MARTIANS(in_dev); |
| 800 | rcu_read_unlock(); | 1587 | rcu_read_unlock(); |
| 801 | 1588 | ||
| 802 | net = dev_net(rt->dst.dev); | 1589 | if (!rt->peer) |
| 803 | peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); | 1590 | rt_bind_peer(rt, rt->rt_dst, 1); |
| 1591 | peer = rt->peer; | ||
| 804 | if (!peer) { | 1592 | if (!peer) { |
| 805 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, | 1593 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); |
| 806 | rt_nexthop(rt, ip_hdr(skb)->daddr)); | ||
| 807 | return; | 1594 | return; |
| 808 | } | 1595 | } |
| 809 | 1596 | ||
| @@ -818,7 +1605,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
| 818 | */ | 1605 | */ |
| 819 | if (peer->rate_tokens >= ip_rt_redirect_number) { | 1606 | if (peer->rate_tokens >= ip_rt_redirect_number) { |
| 820 | peer->rate_last = jiffies; | 1607 | peer->rate_last = jiffies; |
| 821 | goto out_put_peer; | 1608 | return; |
| 822 | } | 1609 | } |
| 823 | 1610 | ||
| 824 | /* Check for load limit; set rate_last to the latest sent | 1611 | /* Check for load limit; set rate_last to the latest sent |
| @@ -828,47 +1615,28 @@ void ip_rt_send_redirect(struct sk_buff *skb) | |||
| 828 | time_after(jiffies, | 1615 | time_after(jiffies, |
| 829 | (peer->rate_last + | 1616 | (peer->rate_last + |
| 830 | (ip_rt_redirect_load << peer->rate_tokens)))) { | 1617 | (ip_rt_redirect_load << peer->rate_tokens)))) { |
| 831 | __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); | 1618 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); |
| 832 | |||
| 833 | icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); | ||
| 834 | peer->rate_last = jiffies; | 1619 | peer->rate_last = jiffies; |
| 835 | ++peer->rate_tokens; | 1620 | ++peer->rate_tokens; |
| 836 | #ifdef CONFIG_IP_ROUTE_VERBOSE | 1621 | #ifdef CONFIG_IP_ROUTE_VERBOSE |
| 837 | if (log_martians && | 1622 | if (log_martians && |
| 838 | peer->rate_tokens == ip_rt_redirect_number) | 1623 | peer->rate_tokens == ip_rt_redirect_number && |
| 839 | net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", | 1624 | net_ratelimit()) |
| 840 | &ip_hdr(skb)->saddr, inet_iif(skb), | 1625 | printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", |
| 841 | &ip_hdr(skb)->daddr, &gw); | 1626 | &ip_hdr(skb)->saddr, rt->rt_iif, |
| 1627 | &rt->rt_dst, &rt->rt_gateway); | ||
| 842 | #endif | 1628 | #endif |
| 843 | } | 1629 | } |
| 844 | out_put_peer: | ||
| 845 | inet_putpeer(peer); | ||
| 846 | } | 1630 | } |
| 847 | 1631 | ||
| 848 | static int ip_error(struct sk_buff *skb) | 1632 | static int ip_error(struct sk_buff *skb) |
| 849 | { | 1633 | { |
| 850 | struct in_device *in_dev = __in_dev_get_rcu(skb->dev); | ||
| 851 | struct rtable *rt = skb_rtable(skb); | 1634 | struct rtable *rt = skb_rtable(skb); |
| 852 | struct inet_peer *peer; | 1635 | struct inet_peer *peer; |
| 853 | unsigned long now; | 1636 | unsigned long now; |
| 854 | struct net *net; | ||
| 855 | bool send; | 1637 | bool send; |
| 856 | int code; | 1638 | int code; |
| 857 | 1639 | ||
| 858 | net = dev_net(rt->dst.dev); | ||
| 859 | if (!IN_DEV_FORWARD(in_dev)) { | ||
| 860 | switch (rt->dst.error) { | ||
| 861 | case EHOSTUNREACH: | ||
| 862 | IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); | ||
| 863 | break; | ||
| 864 | |||
| 865 | case ENETUNREACH: | ||
| 866 | IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); | ||
| 867 | break; | ||
| 868 | } | ||
| 869 | goto out; | ||
| 870 | } | ||
| 871 | |||
| 872 | switch (rt->dst.error) { | 1640 | switch (rt->dst.error) { |
| 873 | case EINVAL: | 1641 | case EINVAL: |
| 874 | default: | 1642 | default: |
| @@ -878,14 +1646,17 @@ static int ip_error(struct sk_buff *skb) | |||
| 878 | break; | 1646 | break; |
| 879 | case ENETUNREACH: | 1647 | case ENETUNREACH: |
| 880 | code = ICMP_NET_UNREACH; | 1648 | code = ICMP_NET_UNREACH; |
| 881 | IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); | 1649 | IP_INC_STATS_BH(dev_net(rt->dst.dev), |
| 1650 | IPSTATS_MIB_INNOROUTES); | ||
| 882 | break; | 1651 | break; |
| 883 | case EACCES: | 1652 | case EACCES: |
| 884 | code = ICMP_PKT_FILTERED; | 1653 | code = ICMP_PKT_FILTERED; |
| 885 | break; | 1654 | break; |
| 886 | } | 1655 | } |
| 887 | 1656 | ||
| 888 | peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); | 1657 | if (!rt->peer) |
| 1658 | rt_bind_peer(rt, rt->rt_dst, 1); | ||
| 1659 | peer = rt->peer; | ||
| 889 | 1660 | ||
| 890 | send = true; | 1661 | send = true; |
| 891 | if (peer) { | 1662 | if (peer) { |
| @@ -898,7 +1669,6 @@ static int ip_error(struct sk_buff *skb) | |||
| 898 | peer->rate_tokens -= ip_rt_error_cost; | 1669 | peer->rate_tokens -= ip_rt_error_cost; |
| 899 | else | 1670 | else |
| 900 | send = false; | 1671 | send = false; |
| 901 | inet_putpeer(peer); | ||
| 902 | } | 1672 | } |
| 903 | if (send) | 1673 | if (send) |
| 904 | icmp_send(skb, ICMP_DEST_UNREACH, code, 0); | 1674 | icmp_send(skb, ICMP_DEST_UNREACH, code, 0); |
| @@ -907,125 +1677,165 @@ out: kfree_skb(skb); | |||
| 907 | return 0; | 1677 | return 0; |
| 908 | } | 1678 | } |
| 909 | 1679 | ||
| 910 | static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) | 1680 | /* |
| 1681 | * The last two values are not from the RFC but | ||
| 1682 | * are needed for AMPRnet AX.25 paths. | ||
| 1683 | */ | ||
| 1684 | |||
| 1685 | static const unsigned short mtu_plateau[] = | ||
| 1686 | {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; | ||
| 1687 | |||
| 1688 | static inline unsigned short guess_mtu(unsigned short old_mtu) | ||
| 911 | { | 1689 | { |
| 912 | struct dst_entry *dst = &rt->dst; | 1690 | int i; |
| 913 | struct fib_result res; | ||
| 914 | 1691 | ||
| 915 | if (dst->dev->mtu < mtu) | 1692 | for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) |
| 916 | return; | 1693 | if (old_mtu > mtu_plateau[i]) |
| 1694 | return mtu_plateau[i]; | ||
| 1695 | return 68; | ||
| 1696 | } | ||
| 917 | 1697 | ||
| 918 | if (mtu < ip_rt_min_pmtu) | 1698 | unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, |
| 919 | mtu = ip_rt_min_pmtu; | 1699 | unsigned short new_mtu, |
| 1700 | struct net_device *dev) | ||
| 1701 | { | ||
| 1702 | unsigned short old_mtu = ntohs(iph->tot_len); | ||
| 1703 | unsigned short est_mtu = 0; | ||
| 1704 | struct inet_peer *peer; | ||
| 920 | 1705 | ||
| 921 | if (!rt->rt_pmtu) { | 1706 | peer = inet_getpeer_v4(iph->daddr, 1); |
| 922 | dst->obsolete = DST_OBSOLETE_KILL; | 1707 | if (peer) { |
| 923 | } else { | 1708 | unsigned short mtu = new_mtu; |
| 924 | rt->rt_pmtu = mtu; | ||
| 925 | dst->expires = max(1UL, jiffies + ip_rt_mtu_expires); | ||
| 926 | } | ||
| 927 | 1709 | ||
| 928 | rcu_read_lock(); | 1710 | if (new_mtu < 68 || new_mtu >= old_mtu) { |
| 929 | if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { | 1711 | /* BSD 4.2 derived systems incorrectly adjust |
| 930 | struct fib_nh *nh = &FIB_RES_NH(res); | 1712 | * tot_len by the IP header length, and report |
| 1713 | * a zero MTU in the ICMP message. | ||
| 1714 | */ | ||
| 1715 | if (mtu == 0 && | ||
| 1716 | old_mtu >= 68 + (iph->ihl << 2)) | ||
| 1717 | old_mtu -= iph->ihl << 2; | ||
| 1718 | mtu = guess_mtu(old_mtu); | ||
| 1719 | } | ||
| 1720 | |||
| 1721 | if (mtu < ip_rt_min_pmtu) | ||
| 1722 | mtu = ip_rt_min_pmtu; | ||
| 1723 | if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { | ||
| 1724 | unsigned long pmtu_expires; | ||
| 931 | 1725 | ||
| 932 | update_or_create_fnhe(nh, fl4->daddr, 0, mtu, | 1726 | pmtu_expires = jiffies + ip_rt_mtu_expires; |
| 933 | jiffies + ip_rt_mtu_expires); | 1727 | if (!pmtu_expires) |
| 1728 | pmtu_expires = 1UL; | ||
| 1729 | |||
| 1730 | est_mtu = mtu; | ||
| 1731 | peer->pmtu_learned = mtu; | ||
| 1732 | peer->pmtu_expires = pmtu_expires; | ||
| 1733 | atomic_inc(&__rt_peer_genid); | ||
| 1734 | } | ||
| 1735 | |||
| 1736 | inet_putpeer(peer); | ||
| 934 | } | 1737 | } |
| 935 | rcu_read_unlock(); | 1738 | return est_mtu ? : new_mtu; |
| 936 | } | 1739 | } |
| 937 | 1740 | ||
| 938 | static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, | 1741 | static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) |
| 939 | struct sk_buff *skb, u32 mtu) | ||
| 940 | { | 1742 | { |
| 941 | struct rtable *rt = (struct rtable *) dst; | 1743 | unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); |
| 942 | struct flowi4 fl4; | ||
| 943 | 1744 | ||
| 944 | ip_rt_build_flow_key(&fl4, sk, skb); | 1745 | if (!expires) |
| 945 | __ip_rt_update_pmtu(rt, &fl4, mtu); | 1746 | return; |
| 1747 | if (time_before(jiffies, expires)) { | ||
| 1748 | u32 orig_dst_mtu = dst_mtu(dst); | ||
| 1749 | if (peer->pmtu_learned < orig_dst_mtu) { | ||
| 1750 | if (!peer->pmtu_orig) | ||
| 1751 | peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); | ||
| 1752 | dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); | ||
| 1753 | } | ||
| 1754 | } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) | ||
| 1755 | dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); | ||
| 946 | } | 1756 | } |
| 947 | 1757 | ||
| 948 | void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, | 1758 | static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) |
| 949 | int oif, u32 mark, u8 protocol, int flow_flags) | ||
| 950 | { | 1759 | { |
| 951 | const struct iphdr *iph = (const struct iphdr *) skb->data; | 1760 | struct rtable *rt = (struct rtable *) dst; |
| 952 | struct flowi4 fl4; | 1761 | struct inet_peer *peer; |
| 953 | struct rtable *rt; | ||
| 954 | 1762 | ||
| 955 | __build_flow_key(&fl4, NULL, iph, oif, | 1763 | dst_confirm(dst); |
| 956 | RT_TOS(iph->tos), protocol, mark, flow_flags); | ||
| 957 | rt = __ip_route_output_key(net, &fl4); | ||
| 958 | if (!IS_ERR(rt)) { | ||
| 959 | __ip_rt_update_pmtu(rt, &fl4, mtu); | ||
| 960 | ip_rt_put(rt); | ||
| 961 | } | ||
| 962 | } | ||
| 963 | EXPORT_SYMBOL_GPL(ipv4_update_pmtu); | ||
| 964 | 1764 | ||
| 965 | void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) | 1765 | if (!rt->peer) |
| 966 | { | 1766 | rt_bind_peer(rt, rt->rt_dst, 1); |
| 967 | const struct iphdr *iph = (const struct iphdr *) skb->data; | 1767 | peer = rt->peer; |
| 968 | struct flowi4 fl4; | 1768 | if (peer) { |
| 969 | struct rtable *rt; | 1769 | unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); |
| 970 | 1770 | ||
| 971 | __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); | 1771 | if (mtu < ip_rt_min_pmtu) |
| 972 | rt = __ip_route_output_key(sock_net(sk), &fl4); | 1772 | mtu = ip_rt_min_pmtu; |
| 973 | if (!IS_ERR(rt)) { | 1773 | if (!pmtu_expires || mtu < peer->pmtu_learned) { |
| 974 | __ip_rt_update_pmtu(rt, &fl4, mtu); | ||
| 975 | ip_rt_put(rt); | ||
| 976 | } | ||
| 977 | } | ||
| 978 | EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); | ||
| 979 | 1774 | ||
| 980 | void ipv4_redirect(struct sk_buff *skb, struct net *net, | 1775 | pmtu_expires = jiffies + ip_rt_mtu_expires; |
| 981 | int oif, u32 mark, u8 protocol, int flow_flags) | 1776 | if (!pmtu_expires) |
| 982 | { | 1777 | pmtu_expires = 1UL; |
| 983 | const struct iphdr *iph = (const struct iphdr *) skb->data; | ||
| 984 | struct flowi4 fl4; | ||
| 985 | struct rtable *rt; | ||
| 986 | 1778 | ||
| 987 | __build_flow_key(&fl4, NULL, iph, oif, | 1779 | peer->pmtu_learned = mtu; |
| 988 | RT_TOS(iph->tos), protocol, mark, flow_flags); | 1780 | peer->pmtu_expires = pmtu_expires; |
| 989 | rt = __ip_route_output_key(net, &fl4); | 1781 | |
| 990 | if (!IS_ERR(rt)) { | 1782 | atomic_inc(&__rt_peer_genid); |
| 991 | __ip_do_redirect(rt, skb, &fl4, false); | 1783 | rt->rt_peer_genid = rt_peer_genid(); |
| 992 | ip_rt_put(rt); | 1784 | } |
| 1785 | check_peer_pmtu(dst, peer); | ||
| 993 | } | 1786 | } |
| 994 | } | 1787 | } |
| 995 | EXPORT_SYMBOL_GPL(ipv4_redirect); | ||
| 996 | 1788 | ||
| 997 | void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) | 1789 | |
| 1790 | static void ipv4_validate_peer(struct rtable *rt) | ||
| 998 | { | 1791 | { |
| 999 | const struct iphdr *iph = (const struct iphdr *) skb->data; | 1792 | if (rt->rt_peer_genid != rt_peer_genid()) { |
| 1000 | struct flowi4 fl4; | 1793 | struct inet_peer *peer; |
| 1001 | struct rtable *rt; | 1794 | |
| 1795 | if (!rt->peer) | ||
| 1796 | rt_bind_peer(rt, rt->rt_dst, 0); | ||
| 1002 | 1797 | ||
| 1003 | __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); | 1798 | peer = rt->peer; |
| 1004 | rt = __ip_route_output_key(sock_net(sk), &fl4); | 1799 | if (peer) { |
| 1005 | if (!IS_ERR(rt)) { | 1800 | check_peer_pmtu(&rt->dst, peer); |
| 1006 | __ip_do_redirect(rt, skb, &fl4, false); | 1801 | |
| 1007 | ip_rt_put(rt); | 1802 | if (peer->redirect_genid != redirect_genid) |
| 1803 | peer->redirect_learned.a4 = 0; | ||
| 1804 | if (peer->redirect_learned.a4 && | ||
| 1805 | peer->redirect_learned.a4 != rt->rt_gateway) | ||
| 1806 | check_peer_redir(&rt->dst, peer); | ||
| 1807 | } | ||
| 1808 | |||
| 1809 | rt->rt_peer_genid = rt_peer_genid(); | ||
| 1008 | } | 1810 | } |
| 1009 | } | 1811 | } |
| 1010 | EXPORT_SYMBOL_GPL(ipv4_sk_redirect); | ||
| 1011 | 1812 | ||
| 1012 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) | 1813 | static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) |
| 1013 | { | 1814 | { |
| 1014 | struct rtable *rt = (struct rtable *) dst; | 1815 | struct rtable *rt = (struct rtable *) dst; |
| 1015 | 1816 | ||
| 1016 | /* All IPV4 dsts are created with ->obsolete set to the value | 1817 | if (rt_is_expired(rt)) |
| 1017 | * DST_OBSOLETE_FORCE_CHK which forces validation calls down | ||
| 1018 | * into this function always. | ||
| 1019 | * | ||
| 1020 | * When a PMTU/redirect information update invalidates a | ||
| 1021 | * route, this is indicated by setting obsolete to | ||
| 1022 | * DST_OBSOLETE_KILL. | ||
| 1023 | */ | ||
| 1024 | if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt)) | ||
| 1025 | return NULL; | 1818 | return NULL; |
| 1819 | ipv4_validate_peer(rt); | ||
| 1026 | return dst; | 1820 | return dst; |
| 1027 | } | 1821 | } |
| 1028 | 1822 | ||
| 1823 | static void ipv4_dst_destroy(struct dst_entry *dst) | ||
| 1824 | { | ||
| 1825 | struct rtable *rt = (struct rtable *) dst; | ||
| 1826 | struct inet_peer *peer = rt->peer; | ||
| 1827 | |||
| 1828 | if (rt->fi) { | ||
| 1829 | fib_info_put(rt->fi); | ||
| 1830 | rt->fi = NULL; | ||
| 1831 | } | ||
| 1832 | if (peer) { | ||
| 1833 | rt->peer = NULL; | ||
| 1834 | inet_putpeer(peer); | ||
| 1835 | } | ||
| 1836 | } | ||
| 1837 | |||
| 1838 | |||
| 1029 | static void ipv4_link_failure(struct sk_buff *skb) | 1839 | static void ipv4_link_failure(struct sk_buff *skb) |
| 1030 | { | 1840 | { |
| 1031 | struct rtable *rt; | 1841 | struct rtable *rt; |
| @@ -1033,15 +1843,15 @@ static void ipv4_link_failure(struct sk_buff *skb) | |||
| 1033 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); | 1843 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); |
| 1034 | 1844 | ||
| 1035 | rt = skb_rtable(skb); | 1845 | rt = skb_rtable(skb); |
| 1036 | if (rt) | 1846 | if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) |
| 1037 | dst_set_expires(&rt->dst, 0); | 1847 | dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); |
| 1038 | } | 1848 | } |
| 1039 | 1849 | ||
| 1040 | static int ip_rt_bug(struct sk_buff *skb) | 1850 | static int ip_rt_bug(struct sk_buff *skb) |
| 1041 | { | 1851 | { |
| 1042 | pr_debug("%s: %pI4 -> %pI4, %s\n", | 1852 | printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", |
| 1043 | __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, | 1853 | &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, |
| 1044 | skb->dev ? skb->dev->name : "?"); | 1854 | skb->dev ? skb->dev->name : "?"); |
| 1045 | kfree_skb(skb); | 1855 | kfree_skb(skb); |
| 1046 | WARN_ON(1); | 1856 | WARN_ON(1); |
| 1047 | return 0; | 1857 | return 0; |
| @@ -1081,9 +1891,8 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) | |||
| 1081 | if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) | 1891 | if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) |
| 1082 | src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); | 1892 | src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); |
| 1083 | else | 1893 | else |
| 1084 | src = inet_select_addr(rt->dst.dev, | 1894 | src = inet_select_addr(rt->dst.dev, rt->rt_gateway, |
| 1085 | rt_nexthop(rt, iph->daddr), | 1895 | RT_SCOPE_UNIVERSE); |
| 1086 | RT_SCOPE_UNIVERSE); | ||
| 1087 | rcu_read_unlock(); | 1896 | rcu_read_unlock(); |
| 1088 | } | 1897 | } |
| 1089 | memcpy(addr, &src, 4); | 1898 | memcpy(addr, &src, 4); |
| @@ -1112,21 +1921,14 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) | |||
| 1112 | return advmss; | 1921 | return advmss; |
| 1113 | } | 1922 | } |
| 1114 | 1923 | ||
| 1115 | static unsigned int ipv4_mtu(const struct dst_entry *dst) | 1924 | static unsigned int ipv4_default_mtu(const struct dst_entry *dst) |
| 1116 | { | 1925 | { |
| 1117 | const struct rtable *rt = (const struct rtable *) dst; | 1926 | unsigned int mtu = dst->dev->mtu; |
| 1118 | unsigned int mtu = rt->rt_pmtu; | ||
| 1119 | |||
| 1120 | if (!mtu || time_after_eq(jiffies, rt->dst.expires)) | ||
| 1121 | mtu = dst_metric_raw(dst, RTAX_MTU); | ||
| 1122 | |||
| 1123 | if (mtu && rt_is_output_route(rt)) | ||
| 1124 | return mtu; | ||
| 1125 | |||
| 1126 | mtu = dst->dev->mtu; | ||
| 1127 | 1927 | ||
| 1128 | if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { | 1928 | if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { |
| 1129 | if (rt->rt_uses_gateway && mtu > 576) | 1929 | const struct rtable *rt = (const struct rtable *) dst; |
| 1930 | |||
| 1931 | if (rt->rt_gateway != rt->rt_dst && mtu > 576) | ||
| 1130 | mtu = 576; | 1932 | mtu = 576; |
| 1131 | } | 1933 | } |
| 1132 | 1934 | ||
| @@ -1136,184 +1938,77 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) | |||
| 1136 | return mtu; | 1938 | return mtu; |
| 1137 | } | 1939 | } |
| 1138 | 1940 | ||
| 1139 | static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) | 1941 | static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, |
| 1942 | struct fib_info *fi) | ||
| 1140 | { | 1943 | { |
| 1141 | struct fnhe_hash_bucket *hash = nh->nh_exceptions; | 1944 | struct inet_peer *peer; |
| 1142 | struct fib_nh_exception *fnhe; | 1945 | int create = 0; |
| 1143 | u32 hval; | ||
| 1144 | |||
| 1145 | if (!hash) | ||
| 1146 | return NULL; | ||
| 1147 | |||
| 1148 | hval = fnhe_hashfun(daddr); | ||
| 1149 | |||
| 1150 | for (fnhe = rcu_dereference(hash[hval].chain); fnhe; | ||
| 1151 | fnhe = rcu_dereference(fnhe->fnhe_next)) { | ||
| 1152 | if (fnhe->fnhe_daddr == daddr) | ||
| 1153 | return fnhe; | ||
| 1154 | } | ||
| 1155 | return NULL; | ||
| 1156 | } | ||
| 1157 | |||
| 1158 | static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, | ||
| 1159 | __be32 daddr) | ||
| 1160 | { | ||
| 1161 | bool ret = false; | ||
| 1162 | |||
| 1163 | spin_lock_bh(&fnhe_lock); | ||
| 1164 | 1946 | ||
| 1165 | if (daddr == fnhe->fnhe_daddr) { | 1947 | /* If a peer entry exists for this destination, we must hook |
| 1166 | struct rtable *orig = rcu_dereference(fnhe->fnhe_rth); | 1948 | * it up in order to get at cached metrics. |
| 1167 | if (orig && rt_is_expired(orig)) { | 1949 | */ |
| 1168 | fnhe->fnhe_gw = 0; | 1950 | if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) |
| 1169 | fnhe->fnhe_pmtu = 0; | 1951 | create = 1; |
| 1170 | fnhe->fnhe_expires = 0; | ||
| 1171 | } | ||
| 1172 | if (fnhe->fnhe_pmtu) { | ||
| 1173 | unsigned long expires = fnhe->fnhe_expires; | ||
| 1174 | unsigned long diff = expires - jiffies; | ||
| 1175 | 1952 | ||
| 1176 | if (time_before(jiffies, expires)) { | 1953 | rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); |
| 1177 | rt->rt_pmtu = fnhe->fnhe_pmtu; | 1954 | if (peer) { |
| 1178 | dst_set_expires(&rt->dst, diff); | 1955 | rt->rt_peer_genid = rt_peer_genid(); |
| 1179 | } | 1956 | if (inet_metrics_new(peer)) |
| 1180 | } | 1957 | memcpy(peer->metrics, fi->fib_metrics, |
| 1181 | if (fnhe->fnhe_gw) { | 1958 | sizeof(u32) * RTAX_MAX); |
| 1959 | dst_init_metrics(&rt->dst, peer->metrics, false); | ||
| 1960 | |||
| 1961 | check_peer_pmtu(&rt->dst, peer); | ||
| 1962 | if (peer->redirect_genid != redirect_genid) | ||
| 1963 | peer->redirect_learned.a4 = 0; | ||
| 1964 | if (peer->redirect_learned.a4 && | ||
| 1965 | peer->redirect_learned.a4 != rt->rt_gateway) { | ||
| 1966 | rt->rt_gateway = peer->redirect_learned.a4; | ||
| 1182 | rt->rt_flags |= RTCF_REDIRECTED; | 1967 | rt->rt_flags |= RTCF_REDIRECTED; |
| 1183 | rt->rt_gateway = fnhe->fnhe_gw; | 1968 | } |
| 1184 | rt->rt_uses_gateway = 1; | ||
| 1185 | } else if (!rt->rt_gateway) | ||
| 1186 | rt->rt_gateway = daddr; | ||
| 1187 | |||
| 1188 | rcu_assign_pointer(fnhe->fnhe_rth, rt); | ||
| 1189 | if (orig) | ||
| 1190 | rt_free(orig); | ||
| 1191 | |||
| 1192 | fnhe->fnhe_stamp = jiffies; | ||
| 1193 | ret = true; | ||
| 1194 | } | ||
| 1195 | spin_unlock_bh(&fnhe_lock); | ||
| 1196 | |||
| 1197 | return ret; | ||
| 1198 | } | ||
| 1199 | |||
| 1200 | static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) | ||
| 1201 | { | ||
| 1202 | struct rtable *orig, *prev, **p; | ||
| 1203 | bool ret = true; | ||
| 1204 | |||
| 1205 | if (rt_is_input_route(rt)) { | ||
| 1206 | p = (struct rtable **)&nh->nh_rth_input; | ||
| 1207 | } else { | 1969 | } else { |
| 1208 | p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); | 1970 | if (fi->fib_metrics != (u32 *) dst_default_metrics) { |
| 1209 | } | 1971 | rt->fi = fi; |
| 1210 | orig = *p; | 1972 | atomic_inc(&fi->fib_clntref); |
| 1211 | |||
| 1212 | prev = cmpxchg(p, orig, rt); | ||
| 1213 | if (prev == orig) { | ||
| 1214 | if (orig) | ||
| 1215 | rt_free(orig); | ||
| 1216 | } else | ||
| 1217 | ret = false; | ||
| 1218 | |||
| 1219 | return ret; | ||
| 1220 | } | ||
| 1221 | |||
| 1222 | static DEFINE_SPINLOCK(rt_uncached_lock); | ||
| 1223 | static LIST_HEAD(rt_uncached_list); | ||
| 1224 | |||
| 1225 | static void rt_add_uncached_list(struct rtable *rt) | ||
| 1226 | { | ||
| 1227 | spin_lock_bh(&rt_uncached_lock); | ||
| 1228 | list_add_tail(&rt->rt_uncached, &rt_uncached_list); | ||
| 1229 | spin_unlock_bh(&rt_uncached_lock); | ||
| 1230 | } | ||
| 1231 | |||
| 1232 | static void ipv4_dst_destroy(struct dst_entry *dst) | ||
| 1233 | { | ||
| 1234 | struct rtable *rt = (struct rtable *) dst; | ||
| 1235 | |||
| 1236 | if (!list_empty(&rt->rt_uncached)) { | ||
| 1237 | spin_lock_bh(&rt_uncached_lock); | ||
| 1238 | list_del(&rt->rt_uncached); | ||
| 1239 | spin_unlock_bh(&rt_uncached_lock); | ||
| 1240 | } | ||
| 1241 | } | ||
| 1242 | |||
| 1243 | void rt_flush_dev(struct net_device *dev) | ||
| 1244 | { | ||
| 1245 | if (!list_empty(&rt_uncached_list)) { | ||
| 1246 | struct net *net = dev_net(dev); | ||
| 1247 | struct rtable *rt; | ||
| 1248 | |||
| 1249 | spin_lock_bh(&rt_uncached_lock); | ||
| 1250 | list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { | ||
| 1251 | if (rt->dst.dev != dev) | ||
| 1252 | continue; | ||
| 1253 | rt->dst.dev = net->loopback_dev; | ||
| 1254 | dev_hold(rt->dst.dev); | ||
| 1255 | dev_put(dev); | ||
| 1256 | } | 1973 | } |
| 1257 | spin_unlock_bh(&rt_uncached_lock); | 1974 | dst_init_metrics(&rt->dst, fi->fib_metrics, true); |
| 1258 | } | 1975 | } |
| 1259 | } | 1976 | } |
| 1260 | 1977 | ||
| 1261 | static bool rt_cache_valid(const struct rtable *rt) | 1978 | static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, |
| 1262 | { | ||
| 1263 | return rt && | ||
| 1264 | rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && | ||
| 1265 | !rt_is_expired(rt); | ||
| 1266 | } | ||
| 1267 | |||
| 1268 | static void rt_set_nexthop(struct rtable *rt, __be32 daddr, | ||
| 1269 | const struct fib_result *res, | 1979 | const struct fib_result *res, |
| 1270 | struct fib_nh_exception *fnhe, | ||
| 1271 | struct fib_info *fi, u16 type, u32 itag) | 1980 | struct fib_info *fi, u16 type, u32 itag) |
| 1272 | { | 1981 | { |
| 1273 | bool cached = false; | 1982 | struct dst_entry *dst = &rt->dst; |
| 1274 | 1983 | ||
| 1275 | if (fi) { | 1984 | if (fi) { |
| 1276 | struct fib_nh *nh = &FIB_RES_NH(*res); | 1985 | if (FIB_RES_GW(*res) && |
| 1277 | 1986 | FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) | |
| 1278 | if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { | 1987 | rt->rt_gateway = FIB_RES_GW(*res); |
| 1279 | rt->rt_gateway = nh->nh_gw; | 1988 | rt_init_metrics(rt, fl4, fi); |
| 1280 | rt->rt_uses_gateway = 1; | ||
| 1281 | } | ||
| 1282 | dst_init_metrics(&rt->dst, fi->fib_metrics, true); | ||
| 1283 | #ifdef CONFIG_IP_ROUTE_CLASSID | 1989 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 1284 | rt->dst.tclassid = nh->nh_tclassid; | 1990 | dst->tclassid = FIB_RES_NH(*res).nh_tclassid; |
| 1285 | #endif | 1991 | #endif |
| 1286 | if (unlikely(fnhe)) | 1992 | } |
| 1287 | cached = rt_bind_exception(rt, fnhe, daddr); | 1993 | |
| 1288 | else if (!(rt->dst.flags & DST_NOCACHE)) | 1994 | if (dst_mtu(dst) > IP_MAX_MTU) |
| 1289 | cached = rt_cache_route(nh, rt); | 1995 | dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); |
| 1290 | if (unlikely(!cached)) { | 1996 | if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) |
| 1291 | /* Routes we intend to cache in nexthop exception or | 1997 | dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); |
| 1292 | * FIB nexthop have the DST_NOCACHE bit clear. | ||
| 1293 | * However, if we are unsuccessful at storing this | ||
| 1294 | * route into the cache we really need to set it. | ||
| 1295 | */ | ||
| 1296 | rt->dst.flags |= DST_NOCACHE; | ||
| 1297 | if (!rt->rt_gateway) | ||
| 1298 | rt->rt_gateway = daddr; | ||
| 1299 | rt_add_uncached_list(rt); | ||
| 1300 | } | ||
| 1301 | } else | ||
| 1302 | rt_add_uncached_list(rt); | ||
| 1303 | 1998 | ||
| 1304 | #ifdef CONFIG_IP_ROUTE_CLASSID | 1999 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 1305 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 2000 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
| 1306 | set_class_tag(rt, res->tclassid); | 2001 | set_class_tag(rt, fib_rules_tclass(res)); |
| 1307 | #endif | 2002 | #endif |
| 1308 | set_class_tag(rt, itag); | 2003 | set_class_tag(rt, itag); |
| 1309 | #endif | 2004 | #endif |
| 1310 | } | 2005 | } |
| 1311 | 2006 | ||
| 1312 | static struct rtable *rt_dst_alloc(struct net_device *dev, | 2007 | static struct rtable *rt_dst_alloc(struct net_device *dev, |
| 1313 | bool nopolicy, bool noxfrm, bool will_cache) | 2008 | bool nopolicy, bool noxfrm) |
| 1314 | { | 2009 | { |
| 1315 | return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, | 2010 | return dst_alloc(&ipv4_dst_ops, dev, 1, -1, |
| 1316 | (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | | 2011 | DST_HOST | |
| 1317 | (nopolicy ? DST_NOPOLICY : 0) | | 2012 | (nopolicy ? DST_NOPOLICY : 0) | |
| 1318 | (noxfrm ? DST_NOXFRM : 0)); | 2013 | (noxfrm ? DST_NOXFRM : 0)); |
| 1319 | } | 2014 | } |
| @@ -1322,7 +2017,9 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, | |||
| 1322 | static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | 2017 | static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, |
| 1323 | u8 tos, struct net_device *dev, int our) | 2018 | u8 tos, struct net_device *dev, int our) |
| 1324 | { | 2019 | { |
| 2020 | unsigned int hash; | ||
| 1325 | struct rtable *rth; | 2021 | struct rtable *rth; |
| 2022 | __be32 spec_dst; | ||
| 1326 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 2023 | struct in_device *in_dev = __in_dev_get_rcu(dev); |
| 1327 | u32 itag = 0; | 2024 | u32 itag = 0; |
| 1328 | int err; | 2025 | int err; |
| @@ -1333,24 +2030,21 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1333 | return -EINVAL; | 2030 | return -EINVAL; |
| 1334 | 2031 | ||
| 1335 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || | 2032 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || |
| 1336 | skb->protocol != htons(ETH_P_IP)) | 2033 | ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) |
| 1337 | goto e_inval; | 2034 | goto e_inval; |
| 1338 | 2035 | ||
| 1339 | if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) | ||
| 1340 | if (ipv4_is_loopback(saddr)) | ||
| 1341 | goto e_inval; | ||
| 1342 | |||
| 1343 | if (ipv4_is_zeronet(saddr)) { | 2036 | if (ipv4_is_zeronet(saddr)) { |
| 1344 | if (!ipv4_is_local_multicast(daddr)) | 2037 | if (!ipv4_is_local_multicast(daddr)) |
| 1345 | goto e_inval; | 2038 | goto e_inval; |
| 2039 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); | ||
| 1346 | } else { | 2040 | } else { |
| 1347 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, | 2041 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, |
| 1348 | in_dev, &itag); | 2042 | &itag); |
| 1349 | if (err < 0) | 2043 | if (err < 0) |
| 1350 | goto e_err; | 2044 | goto e_err; |
| 1351 | } | 2045 | } |
| 1352 | rth = rt_dst_alloc(dev_net(dev)->loopback_dev, | 2046 | rth = rt_dst_alloc(init_net.loopback_dev, |
| 1353 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); | 2047 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false); |
| 1354 | if (!rth) | 2048 | if (!rth) |
| 1355 | goto e_nobufs; | 2049 | goto e_nobufs; |
| 1356 | 2050 | ||
| @@ -1359,15 +2053,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1359 | #endif | 2053 | #endif |
| 1360 | rth->dst.output = ip_rt_bug; | 2054 | rth->dst.output = ip_rt_bug; |
| 1361 | 2055 | ||
| 2056 | rth->rt_key_dst = daddr; | ||
| 2057 | rth->rt_key_src = saddr; | ||
| 1362 | rth->rt_genid = rt_genid(dev_net(dev)); | 2058 | rth->rt_genid = rt_genid(dev_net(dev)); |
| 1363 | rth->rt_flags = RTCF_MULTICAST; | 2059 | rth->rt_flags = RTCF_MULTICAST; |
| 1364 | rth->rt_type = RTN_MULTICAST; | 2060 | rth->rt_type = RTN_MULTICAST; |
| 1365 | rth->rt_is_input= 1; | 2061 | rth->rt_key_tos = tos; |
| 1366 | rth->rt_iif = 0; | 2062 | rth->rt_dst = daddr; |
| 1367 | rth->rt_pmtu = 0; | 2063 | rth->rt_src = saddr; |
| 1368 | rth->rt_gateway = 0; | 2064 | rth->rt_route_iif = dev->ifindex; |
| 1369 | rth->rt_uses_gateway = 0; | 2065 | rth->rt_iif = dev->ifindex; |
| 1370 | INIT_LIST_HEAD(&rth->rt_uncached); | 2066 | rth->rt_oif = 0; |
| 2067 | rth->rt_mark = skb->mark; | ||
| 2068 | rth->rt_gateway = daddr; | ||
| 2069 | rth->rt_spec_dst= spec_dst; | ||
| 2070 | rth->rt_peer_genid = 0; | ||
| 2071 | rth->peer = NULL; | ||
| 2072 | rth->fi = NULL; | ||
| 1371 | if (our) { | 2073 | if (our) { |
| 1372 | rth->dst.input= ip_local_deliver; | 2074 | rth->dst.input= ip_local_deliver; |
| 1373 | rth->rt_flags |= RTCF_LOCAL; | 2075 | rth->rt_flags |= RTCF_LOCAL; |
| @@ -1379,8 +2081,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1379 | #endif | 2081 | #endif |
| 1380 | RT_CACHE_STAT_INC(in_slow_mc); | 2082 | RT_CACHE_STAT_INC(in_slow_mc); |
| 1381 | 2083 | ||
| 1382 | skb_dst_set(skb, &rth->dst); | 2084 | hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); |
| 1383 | return 0; | 2085 | rth = rt_intern_hash(hash, rth, skb, dev->ifindex); |
| 2086 | return IS_ERR(rth) ? PTR_ERR(rth) : 0; | ||
| 1384 | 2087 | ||
| 1385 | e_nobufs: | 2088 | e_nobufs: |
| 1386 | return -ENOBUFS; | 2089 | return -ENOBUFS; |
| @@ -1404,13 +2107,18 @@ static void ip_handle_martian_source(struct net_device *dev, | |||
| 1404 | * RFC1812 recommendation, if source is martian, | 2107 | * RFC1812 recommendation, if source is martian, |
| 1405 | * the only hint is MAC header. | 2108 | * the only hint is MAC header. |
| 1406 | */ | 2109 | */ |
| 1407 | pr_warn("martian source %pI4 from %pI4, on dev %s\n", | 2110 | printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", |
| 1408 | &daddr, &saddr, dev->name); | 2111 | &daddr, &saddr, dev->name); |
| 1409 | if (dev->hard_header_len && skb_mac_header_was_set(skb)) { | 2112 | if (dev->hard_header_len && skb_mac_header_was_set(skb)) { |
| 1410 | print_hex_dump(KERN_WARNING, "ll header: ", | 2113 | int i; |
| 1411 | DUMP_PREFIX_OFFSET, 16, 1, | 2114 | const unsigned char *p = skb_mac_header(skb); |
| 1412 | skb_mac_header(skb), | 2115 | printk(KERN_WARNING "ll header: "); |
| 1413 | dev->hard_header_len, true); | 2116 | for (i = 0; i < dev->hard_header_len; i++, p++) { |
| 2117 | printk("%02x", *p); | ||
| 2118 | if (i < (dev->hard_header_len - 1)) | ||
| 2119 | printk(":"); | ||
| 2120 | } | ||
| 2121 | printk("\n"); | ||
| 1414 | } | 2122 | } |
| 1415 | } | 2123 | } |
| 1416 | #endif | 2124 | #endif |
| @@ -1420,24 +2128,28 @@ static void ip_handle_martian_source(struct net_device *dev, | |||
| 1420 | static int __mkroute_input(struct sk_buff *skb, | 2128 | static int __mkroute_input(struct sk_buff *skb, |
| 1421 | const struct fib_result *res, | 2129 | const struct fib_result *res, |
| 1422 | struct in_device *in_dev, | 2130 | struct in_device *in_dev, |
| 1423 | __be32 daddr, __be32 saddr, u32 tos) | 2131 | __be32 daddr, __be32 saddr, u32 tos, |
| 2132 | struct rtable **result) | ||
| 1424 | { | 2133 | { |
| 1425 | struct rtable *rth; | 2134 | struct rtable *rth; |
| 1426 | int err; | 2135 | int err; |
| 1427 | struct in_device *out_dev; | 2136 | struct in_device *out_dev; |
| 1428 | unsigned int flags = 0; | 2137 | unsigned int flags = 0; |
| 1429 | bool do_cache; | 2138 | __be32 spec_dst; |
| 1430 | u32 itag; | 2139 | u32 itag; |
| 1431 | 2140 | ||
| 1432 | /* get a working reference to the output device */ | 2141 | /* get a working reference to the output device */ |
| 1433 | out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); | 2142 | out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); |
| 1434 | if (out_dev == NULL) { | 2143 | if (out_dev == NULL) { |
| 1435 | net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); | 2144 | if (net_ratelimit()) |
| 2145 | printk(KERN_CRIT "Bug in ip_route_input" \ | ||
| 2146 | "_slow(). Please, report\n"); | ||
| 1436 | return -EINVAL; | 2147 | return -EINVAL; |
| 1437 | } | 2148 | } |
| 1438 | 2149 | ||
| 2150 | |||
| 1439 | err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), | 2151 | err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), |
| 1440 | in_dev->dev, in_dev, &itag); | 2152 | in_dev->dev, &spec_dst, &itag); |
| 1441 | if (err < 0) { | 2153 | if (err < 0) { |
| 1442 | ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, | 2154 | ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, |
| 1443 | saddr); | 2155 | saddr); |
| @@ -1445,13 +2157,13 @@ static int __mkroute_input(struct sk_buff *skb, | |||
| 1445 | goto cleanup; | 2157 | goto cleanup; |
| 1446 | } | 2158 | } |
| 1447 | 2159 | ||
| 1448 | do_cache = res->fi && !itag; | 2160 | if (err) |
| 1449 | if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && | 2161 | flags |= RTCF_DIRECTSRC; |
| 2162 | |||
| 2163 | if (out_dev == in_dev && err && | ||
| 1450 | (IN_DEV_SHARED_MEDIA(out_dev) || | 2164 | (IN_DEV_SHARED_MEDIA(out_dev) || |
| 1451 | inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { | 2165 | inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) |
| 1452 | flags |= RTCF_DOREDIRECT; | 2166 | flags |= RTCF_DOREDIRECT; |
| 1453 | do_cache = false; | ||
| 1454 | } | ||
| 1455 | 2167 | ||
| 1456 | if (skb->protocol != htons(ETH_P_IP)) { | 2168 | if (skb->protocol != htons(ETH_P_IP)) { |
| 1457 | /* Not IP (i.e. ARP). Do not create route, if it is | 2169 | /* Not IP (i.e. ARP). Do not create route, if it is |
| @@ -1468,38 +2180,38 @@ static int __mkroute_input(struct sk_buff *skb, | |||
| 1468 | } | 2180 | } |
| 1469 | } | 2181 | } |
| 1470 | 2182 | ||
| 1471 | if (do_cache) { | ||
| 1472 | rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); | ||
| 1473 | if (rt_cache_valid(rth)) { | ||
| 1474 | skb_dst_set_noref(skb, &rth->dst); | ||
| 1475 | goto out; | ||
| 1476 | } | ||
| 1477 | } | ||
| 1478 | |||
| 1479 | rth = rt_dst_alloc(out_dev->dev, | 2183 | rth = rt_dst_alloc(out_dev->dev, |
| 1480 | IN_DEV_CONF_GET(in_dev, NOPOLICY), | 2184 | IN_DEV_CONF_GET(in_dev, NOPOLICY), |
| 1481 | IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); | 2185 | IN_DEV_CONF_GET(out_dev, NOXFRM)); |
| 1482 | if (!rth) { | 2186 | if (!rth) { |
| 1483 | err = -ENOBUFS; | 2187 | err = -ENOBUFS; |
| 1484 | goto cleanup; | 2188 | goto cleanup; |
| 1485 | } | 2189 | } |
| 1486 | 2190 | ||
| 2191 | rth->rt_key_dst = daddr; | ||
| 2192 | rth->rt_key_src = saddr; | ||
| 1487 | rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); | 2193 | rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); |
| 1488 | rth->rt_flags = flags; | 2194 | rth->rt_flags = flags; |
| 1489 | rth->rt_type = res->type; | 2195 | rth->rt_type = res->type; |
| 1490 | rth->rt_is_input = 1; | 2196 | rth->rt_key_tos = tos; |
| 1491 | rth->rt_iif = 0; | 2197 | rth->rt_dst = daddr; |
| 1492 | rth->rt_pmtu = 0; | 2198 | rth->rt_src = saddr; |
| 1493 | rth->rt_gateway = 0; | 2199 | rth->rt_route_iif = in_dev->dev->ifindex; |
| 1494 | rth->rt_uses_gateway = 0; | 2200 | rth->rt_iif = in_dev->dev->ifindex; |
| 1495 | INIT_LIST_HEAD(&rth->rt_uncached); | 2201 | rth->rt_oif = 0; |
| 2202 | rth->rt_mark = skb->mark; | ||
| 2203 | rth->rt_gateway = daddr; | ||
| 2204 | rth->rt_spec_dst= spec_dst; | ||
| 2205 | rth->rt_peer_genid = 0; | ||
| 2206 | rth->peer = NULL; | ||
| 2207 | rth->fi = NULL; | ||
| 1496 | 2208 | ||
| 1497 | rth->dst.input = ip_forward; | 2209 | rth->dst.input = ip_forward; |
| 1498 | rth->dst.output = ip_output; | 2210 | rth->dst.output = ip_output; |
| 1499 | 2211 | ||
| 1500 | rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); | 2212 | rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); |
| 1501 | skb_dst_set(skb, &rth->dst); | 2213 | |
| 1502 | out: | 2214 | *result = rth; |
| 1503 | err = 0; | 2215 | err = 0; |
| 1504 | cleanup: | 2216 | cleanup: |
| 1505 | return err; | 2217 | return err; |
| @@ -1511,13 +2223,27 @@ static int ip_mkroute_input(struct sk_buff *skb, | |||
| 1511 | struct in_device *in_dev, | 2223 | struct in_device *in_dev, |
| 1512 | __be32 daddr, __be32 saddr, u32 tos) | 2224 | __be32 daddr, __be32 saddr, u32 tos) |
| 1513 | { | 2225 | { |
| 2226 | struct rtable* rth = NULL; | ||
| 2227 | int err; | ||
| 2228 | unsigned hash; | ||
| 2229 | |||
| 1514 | #ifdef CONFIG_IP_ROUTE_MULTIPATH | 2230 | #ifdef CONFIG_IP_ROUTE_MULTIPATH |
| 1515 | if (res->fi && res->fi->fib_nhs > 1) | 2231 | if (res->fi && res->fi->fib_nhs > 1) |
| 1516 | fib_select_multipath(res); | 2232 | fib_select_multipath(res); |
| 1517 | #endif | 2233 | #endif |
| 1518 | 2234 | ||
| 1519 | /* create a routing cache entry */ | 2235 | /* create a routing cache entry */ |
| 1520 | return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); | 2236 | err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); |
| 2237 | if (err) | ||
| 2238 | return err; | ||
| 2239 | |||
| 2240 | /* put it into the cache */ | ||
| 2241 | hash = rt_hash(daddr, saddr, fl4->flowi4_iif, | ||
| 2242 | rt_genid(dev_net(rth->dst.dev))); | ||
| 2243 | rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); | ||
| 2244 | if (IS_ERR(rth)) | ||
| 2245 | return PTR_ERR(rth); | ||
| 2246 | return 0; | ||
| 1521 | } | 2247 | } |
| 1522 | 2248 | ||
| 1523 | /* | 2249 | /* |
| @@ -1537,12 +2263,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1537 | struct fib_result res; | 2263 | struct fib_result res; |
| 1538 | struct in_device *in_dev = __in_dev_get_rcu(dev); | 2264 | struct in_device *in_dev = __in_dev_get_rcu(dev); |
| 1539 | struct flowi4 fl4; | 2265 | struct flowi4 fl4; |
| 1540 | unsigned int flags = 0; | 2266 | unsigned flags = 0; |
| 1541 | u32 itag = 0; | 2267 | u32 itag = 0; |
| 1542 | struct rtable *rth; | 2268 | struct rtable * rth; |
| 2269 | unsigned hash; | ||
| 2270 | __be32 spec_dst; | ||
| 1543 | int err = -EINVAL; | 2271 | int err = -EINVAL; |
| 1544 | struct net *net = dev_net(dev); | 2272 | struct net * net = dev_net(dev); |
| 1545 | bool do_cache; | ||
| 1546 | 2273 | ||
| 1547 | /* IP on this device is disabled. */ | 2274 | /* IP on this device is disabled. */ |
| 1548 | 2275 | ||
| @@ -1553,10 +2280,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1553 | by fib_lookup. | 2280 | by fib_lookup. |
| 1554 | */ | 2281 | */ |
| 1555 | 2282 | ||
| 1556 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) | 2283 | if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || |
| 2284 | ipv4_is_loopback(saddr)) | ||
| 1557 | goto martian_source; | 2285 | goto martian_source; |
| 1558 | 2286 | ||
| 1559 | res.fi = NULL; | ||
| 1560 | if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) | 2287 | if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) |
| 1561 | goto brd_input; | 2288 | goto brd_input; |
| 1562 | 2289 | ||
| @@ -1566,20 +2293,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1566 | if (ipv4_is_zeronet(saddr)) | 2293 | if (ipv4_is_zeronet(saddr)) |
| 1567 | goto martian_source; | 2294 | goto martian_source; |
| 1568 | 2295 | ||
| 1569 | if (ipv4_is_zeronet(daddr)) | 2296 | if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) |
| 1570 | goto martian_destination; | 2297 | goto martian_destination; |
| 1571 | 2298 | ||
| 1572 | /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), | ||
| 1573 | * and call it once if daddr or/and saddr are loopback addresses | ||
| 1574 | */ | ||
| 1575 | if (ipv4_is_loopback(daddr)) { | ||
| 1576 | if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) | ||
| 1577 | goto martian_destination; | ||
| 1578 | } else if (ipv4_is_loopback(saddr)) { | ||
| 1579 | if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) | ||
| 1580 | goto martian_source; | ||
| 1581 | } | ||
| 1582 | |||
| 1583 | /* | 2299 | /* |
| 1584 | * Now we are ready to route packet. | 2300 | * Now we are ready to route packet. |
| 1585 | */ | 2301 | */ |
| @@ -1591,8 +2307,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1591 | fl4.daddr = daddr; | 2307 | fl4.daddr = daddr; |
| 1592 | fl4.saddr = saddr; | 2308 | fl4.saddr = saddr; |
| 1593 | err = fib_lookup(net, &fl4, &res); | 2309 | err = fib_lookup(net, &fl4, &res); |
| 1594 | if (err != 0) | 2310 | if (err != 0) { |
| 2311 | if (!IN_DEV_FORWARD(in_dev)) | ||
| 2312 | goto e_hostunreach; | ||
| 1595 | goto no_route; | 2313 | goto no_route; |
| 2314 | } | ||
| 1596 | 2315 | ||
| 1597 | RT_CACHE_STAT_INC(in_slow_tot); | 2316 | RT_CACHE_STAT_INC(in_slow_tot); |
| 1598 | 2317 | ||
| @@ -1601,15 +2320,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1601 | 2320 | ||
| 1602 | if (res.type == RTN_LOCAL) { | 2321 | if (res.type == RTN_LOCAL) { |
| 1603 | err = fib_validate_source(skb, saddr, daddr, tos, | 2322 | err = fib_validate_source(skb, saddr, daddr, tos, |
| 1604 | LOOPBACK_IFINDEX, | 2323 | net->loopback_dev->ifindex, |
| 1605 | dev, in_dev, &itag); | 2324 | dev, &spec_dst, &itag); |
| 1606 | if (err < 0) | 2325 | if (err < 0) |
| 1607 | goto martian_source_keep_err; | 2326 | goto martian_source_keep_err; |
| 2327 | if (err) | ||
| 2328 | flags |= RTCF_DIRECTSRC; | ||
| 2329 | spec_dst = daddr; | ||
| 1608 | goto local_input; | 2330 | goto local_input; |
| 1609 | } | 2331 | } |
| 1610 | 2332 | ||
| 1611 | if (!IN_DEV_FORWARD(in_dev)) | 2333 | if (!IN_DEV_FORWARD(in_dev)) |
| 1612 | goto no_route; | 2334 | goto e_hostunreach; |
| 1613 | if (res.type != RTN_UNICAST) | 2335 | if (res.type != RTN_UNICAST) |
| 1614 | goto martian_destination; | 2336 | goto martian_destination; |
| 1615 | 2337 | ||
| @@ -1620,32 +2342,23 @@ brd_input: | |||
| 1620 | if (skb->protocol != htons(ETH_P_IP)) | 2342 | if (skb->protocol != htons(ETH_P_IP)) |
| 1621 | goto e_inval; | 2343 | goto e_inval; |
| 1622 | 2344 | ||
| 1623 | if (!ipv4_is_zeronet(saddr)) { | 2345 | if (ipv4_is_zeronet(saddr)) |
| 1624 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, | 2346 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); |
| 1625 | in_dev, &itag); | 2347 | else { |
| 2348 | err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, | ||
| 2349 | &itag); | ||
| 1626 | if (err < 0) | 2350 | if (err < 0) |
| 1627 | goto martian_source_keep_err; | 2351 | goto martian_source_keep_err; |
| 2352 | if (err) | ||
| 2353 | flags |= RTCF_DIRECTSRC; | ||
| 1628 | } | 2354 | } |
| 1629 | flags |= RTCF_BROADCAST; | 2355 | flags |= RTCF_BROADCAST; |
| 1630 | res.type = RTN_BROADCAST; | 2356 | res.type = RTN_BROADCAST; |
| 1631 | RT_CACHE_STAT_INC(in_brd); | 2357 | RT_CACHE_STAT_INC(in_brd); |
| 1632 | 2358 | ||
| 1633 | local_input: | 2359 | local_input: |
| 1634 | do_cache = false; | ||
| 1635 | if (res.fi) { | ||
| 1636 | if (!itag) { | ||
| 1637 | rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); | ||
| 1638 | if (rt_cache_valid(rth)) { | ||
| 1639 | skb_dst_set_noref(skb, &rth->dst); | ||
| 1640 | err = 0; | ||
| 1641 | goto out; | ||
| 1642 | } | ||
| 1643 | do_cache = true; | ||
| 1644 | } | ||
| 1645 | } | ||
| 1646 | |||
| 1647 | rth = rt_dst_alloc(net->loopback_dev, | 2360 | rth = rt_dst_alloc(net->loopback_dev, |
| 1648 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); | 2361 | IN_DEV_CONF_GET(in_dev, NOPOLICY), false); |
| 1649 | if (!rth) | 2362 | if (!rth) |
| 1650 | goto e_nobufs; | 2363 | goto e_nobufs; |
| 1651 | 2364 | ||
| @@ -1655,28 +2368,41 @@ local_input: | |||
| 1655 | rth->dst.tclassid = itag; | 2368 | rth->dst.tclassid = itag; |
| 1656 | #endif | 2369 | #endif |
| 1657 | 2370 | ||
| 2371 | rth->rt_key_dst = daddr; | ||
| 2372 | rth->rt_key_src = saddr; | ||
| 1658 | rth->rt_genid = rt_genid(net); | 2373 | rth->rt_genid = rt_genid(net); |
| 1659 | rth->rt_flags = flags|RTCF_LOCAL; | 2374 | rth->rt_flags = flags|RTCF_LOCAL; |
| 1660 | rth->rt_type = res.type; | 2375 | rth->rt_type = res.type; |
| 1661 | rth->rt_is_input = 1; | 2376 | rth->rt_key_tos = tos; |
| 1662 | rth->rt_iif = 0; | 2377 | rth->rt_dst = daddr; |
| 1663 | rth->rt_pmtu = 0; | 2378 | rth->rt_src = saddr; |
| 1664 | rth->rt_gateway = 0; | 2379 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 1665 | rth->rt_uses_gateway = 0; | 2380 | rth->dst.tclassid = itag; |
| 1666 | INIT_LIST_HEAD(&rth->rt_uncached); | 2381 | #endif |
| 2382 | rth->rt_route_iif = dev->ifindex; | ||
| 2383 | rth->rt_iif = dev->ifindex; | ||
| 2384 | rth->rt_oif = 0; | ||
| 2385 | rth->rt_mark = skb->mark; | ||
| 2386 | rth->rt_gateway = daddr; | ||
| 2387 | rth->rt_spec_dst= spec_dst; | ||
| 2388 | rth->rt_peer_genid = 0; | ||
| 2389 | rth->peer = NULL; | ||
| 2390 | rth->fi = NULL; | ||
| 1667 | if (res.type == RTN_UNREACHABLE) { | 2391 | if (res.type == RTN_UNREACHABLE) { |
| 1668 | rth->dst.input= ip_error; | 2392 | rth->dst.input= ip_error; |
| 1669 | rth->dst.error= -err; | 2393 | rth->dst.error= -err; |
| 1670 | rth->rt_flags &= ~RTCF_LOCAL; | 2394 | rth->rt_flags &= ~RTCF_LOCAL; |
| 1671 | } | 2395 | } |
| 1672 | if (do_cache) | 2396 | hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); |
| 1673 | rt_cache_route(&FIB_RES_NH(res), rth); | 2397 | rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); |
| 1674 | skb_dst_set(skb, &rth->dst); | ||
| 1675 | err = 0; | 2398 | err = 0; |
| 2399 | if (IS_ERR(rth)) | ||
| 2400 | err = PTR_ERR(rth); | ||
| 1676 | goto out; | 2401 | goto out; |
| 1677 | 2402 | ||
| 1678 | no_route: | 2403 | no_route: |
| 1679 | RT_CACHE_STAT_INC(in_no_route); | 2404 | RT_CACHE_STAT_INC(in_no_route); |
| 2405 | spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); | ||
| 1680 | res.type = RTN_UNREACHABLE; | 2406 | res.type = RTN_UNREACHABLE; |
| 1681 | if (err == -ESRCH) | 2407 | if (err == -ESRCH) |
| 1682 | err = -ENETUNREACH; | 2408 | err = -ENETUNREACH; |
| @@ -1688,11 +2414,15 @@ no_route: | |||
| 1688 | martian_destination: | 2414 | martian_destination: |
| 1689 | RT_CACHE_STAT_INC(in_martian_dst); | 2415 | RT_CACHE_STAT_INC(in_martian_dst); |
| 1690 | #ifdef CONFIG_IP_ROUTE_VERBOSE | 2416 | #ifdef CONFIG_IP_ROUTE_VERBOSE |
| 1691 | if (IN_DEV_LOG_MARTIANS(in_dev)) | 2417 | if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) |
| 1692 | net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", | 2418 | printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", |
| 1693 | &daddr, &saddr, dev->name); | 2419 | &daddr, &saddr, dev->name); |
| 1694 | #endif | 2420 | #endif |
| 1695 | 2421 | ||
| 2422 | e_hostunreach: | ||
| 2423 | err = -EHOSTUNREACH; | ||
| 2424 | goto out; | ||
| 2425 | |||
| 1696 | e_inval: | 2426 | e_inval: |
| 1697 | err = -EINVAL; | 2427 | err = -EINVAL; |
| 1698 | goto out; | 2428 | goto out; |
| @@ -1708,13 +2438,50 @@ martian_source_keep_err: | |||
| 1708 | goto out; | 2438 | goto out; |
| 1709 | } | 2439 | } |
| 1710 | 2440 | ||
| 1711 | int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, | 2441 | int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, |
| 1712 | u8 tos, struct net_device *dev) | 2442 | u8 tos, struct net_device *dev, bool noref) |
| 1713 | { | 2443 | { |
| 2444 | struct rtable * rth; | ||
| 2445 | unsigned hash; | ||
| 2446 | int iif = dev->ifindex; | ||
| 2447 | struct net *net; | ||
| 1714 | int res; | 2448 | int res; |
| 1715 | 2449 | ||
| 2450 | net = dev_net(dev); | ||
| 2451 | |||
| 1716 | rcu_read_lock(); | 2452 | rcu_read_lock(); |
| 1717 | 2453 | ||
| 2454 | if (!rt_caching(net)) | ||
| 2455 | goto skip_cache; | ||
| 2456 | |||
| 2457 | tos &= IPTOS_RT_MASK; | ||
| 2458 | hash = rt_hash(daddr, saddr, iif, rt_genid(net)); | ||
| 2459 | |||
| 2460 | for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; | ||
| 2461 | rth = rcu_dereference(rth->dst.rt_next)) { | ||
| 2462 | if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | | ||
| 2463 | ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | | ||
| 2464 | (rth->rt_route_iif ^ iif) | | ||
| 2465 | (rth->rt_key_tos ^ tos)) == 0 && | ||
| 2466 | rth->rt_mark == skb->mark && | ||
| 2467 | net_eq(dev_net(rth->dst.dev), net) && | ||
| 2468 | !rt_is_expired(rth)) { | ||
| 2469 | ipv4_validate_peer(rth); | ||
| 2470 | if (noref) { | ||
| 2471 | dst_use_noref(&rth->dst, jiffies); | ||
| 2472 | skb_dst_set_noref(skb, &rth->dst); | ||
| 2473 | } else { | ||
| 2474 | dst_use(&rth->dst, jiffies); | ||
| 2475 | skb_dst_set(skb, &rth->dst); | ||
| 2476 | } | ||
| 2477 | RT_CACHE_STAT_INC(in_hit); | ||
| 2478 | rcu_read_unlock(); | ||
| 2479 | return 0; | ||
| 2480 | } | ||
| 2481 | RT_CACHE_STAT_INC(in_hlist_search); | ||
| 2482 | } | ||
| 2483 | |||
| 2484 | skip_cache: | ||
| 1718 | /* Multicast recognition logic is moved from route cache to here. | 2485 | /* Multicast recognition logic is moved from route cache to here. |
| 1719 | The problem was that too many Ethernet cards have broken/missing | 2486 | The problem was that too many Ethernet cards have broken/missing |
| 1720 | hardware multicast filters :-( As result the host on multicasting | 2487 | hardware multicast filters :-( As result the host on multicasting |
| @@ -1752,29 +2519,24 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, | |||
| 1752 | rcu_read_unlock(); | 2519 | rcu_read_unlock(); |
| 1753 | return res; | 2520 | return res; |
| 1754 | } | 2521 | } |
| 1755 | EXPORT_SYMBOL(ip_route_input_noref); | 2522 | EXPORT_SYMBOL(ip_route_input_common); |
| 1756 | 2523 | ||
| 1757 | /* called with rcu_read_lock() */ | 2524 | /* called with rcu_read_lock() */ |
| 1758 | static struct rtable *__mkroute_output(const struct fib_result *res, | 2525 | static struct rtable *__mkroute_output(const struct fib_result *res, |
| 1759 | const struct flowi4 *fl4, int orig_oif, | 2526 | const struct flowi4 *fl4, |
| 2527 | __be32 orig_daddr, __be32 orig_saddr, | ||
| 2528 | int orig_oif, __u8 orig_rtos, | ||
| 1760 | struct net_device *dev_out, | 2529 | struct net_device *dev_out, |
| 1761 | unsigned int flags) | 2530 | unsigned int flags) |
| 1762 | { | 2531 | { |
| 1763 | struct fib_info *fi = res->fi; | 2532 | struct fib_info *fi = res->fi; |
| 1764 | struct fib_nh_exception *fnhe; | ||
| 1765 | struct in_device *in_dev; | 2533 | struct in_device *in_dev; |
| 1766 | u16 type = res->type; | 2534 | u16 type = res->type; |
| 1767 | struct rtable *rth; | 2535 | struct rtable *rth; |
| 1768 | bool do_cache; | ||
| 1769 | 2536 | ||
| 1770 | in_dev = __in_dev_get_rcu(dev_out); | 2537 | if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) |
| 1771 | if (!in_dev) | ||
| 1772 | return ERR_PTR(-EINVAL); | 2538 | return ERR_PTR(-EINVAL); |
| 1773 | 2539 | ||
| 1774 | if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) | ||
| 1775 | if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) | ||
| 1776 | return ERR_PTR(-EINVAL); | ||
| 1777 | |||
| 1778 | if (ipv4_is_lbcast(fl4->daddr)) | 2540 | if (ipv4_is_lbcast(fl4->daddr)) |
| 1779 | type = RTN_BROADCAST; | 2541 | type = RTN_BROADCAST; |
| 1780 | else if (ipv4_is_multicast(fl4->daddr)) | 2542 | else if (ipv4_is_multicast(fl4->daddr)) |
| @@ -1785,7 +2547,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
| 1785 | if (dev_out->flags & IFF_LOOPBACK) | 2547 | if (dev_out->flags & IFF_LOOPBACK) |
| 1786 | flags |= RTCF_LOCAL; | 2548 | flags |= RTCF_LOCAL; |
| 1787 | 2549 | ||
| 1788 | do_cache = true; | 2550 | in_dev = __in_dev_get_rcu(dev_out); |
| 2551 | if (!in_dev) | ||
| 2552 | return ERR_PTR(-EINVAL); | ||
| 2553 | |||
| 1789 | if (type == RTN_BROADCAST) { | 2554 | if (type == RTN_BROADCAST) { |
| 1790 | flags |= RTCF_BROADCAST | RTCF_LOCAL; | 2555 | flags |= RTCF_BROADCAST | RTCF_LOCAL; |
| 1791 | fi = NULL; | 2556 | fi = NULL; |
| @@ -1794,8 +2559,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
| 1794 | if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, | 2559 | if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, |
| 1795 | fl4->flowi4_proto)) | 2560 | fl4->flowi4_proto)) |
| 1796 | flags &= ~RTCF_LOCAL; | 2561 | flags &= ~RTCF_LOCAL; |
| 1797 | else | ||
| 1798 | do_cache = false; | ||
| 1799 | /* If multicast route do not exist use | 2562 | /* If multicast route do not exist use |
| 1800 | * default one, but do not gateway in this case. | 2563 | * default one, but do not gateway in this case. |
| 1801 | * Yes, it is hack. | 2564 | * Yes, it is hack. |
| @@ -1804,57 +2567,40 @@ static struct rtable *__mkroute_output(const struct fib_result *res, | |||
| 1804 | fi = NULL; | 2567 | fi = NULL; |
| 1805 | } | 2568 | } |
| 1806 | 2569 | ||
| 1807 | fnhe = NULL; | ||
| 1808 | do_cache &= fi != NULL; | ||
| 1809 | if (do_cache) { | ||
| 1810 | struct rtable __rcu **prth; | ||
| 1811 | struct fib_nh *nh = &FIB_RES_NH(*res); | ||
| 1812 | |||
| 1813 | fnhe = find_exception(nh, fl4->daddr); | ||
| 1814 | if (fnhe) | ||
| 1815 | prth = &fnhe->fnhe_rth; | ||
| 1816 | else { | ||
| 1817 | if (unlikely(fl4->flowi4_flags & | ||
| 1818 | FLOWI_FLAG_KNOWN_NH && | ||
| 1819 | !(nh->nh_gw && | ||
| 1820 | nh->nh_scope == RT_SCOPE_LINK))) { | ||
| 1821 | do_cache = false; | ||
| 1822 | goto add; | ||
| 1823 | } | ||
| 1824 | prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); | ||
| 1825 | } | ||
| 1826 | rth = rcu_dereference(*prth); | ||
| 1827 | if (rt_cache_valid(rth)) { | ||
| 1828 | dst_hold(&rth->dst); | ||
| 1829 | return rth; | ||
| 1830 | } | ||
| 1831 | } | ||
| 1832 | |||
| 1833 | add: | ||
| 1834 | rth = rt_dst_alloc(dev_out, | 2570 | rth = rt_dst_alloc(dev_out, |
| 1835 | IN_DEV_CONF_GET(in_dev, NOPOLICY), | 2571 | IN_DEV_CONF_GET(in_dev, NOPOLICY), |
| 1836 | IN_DEV_CONF_GET(in_dev, NOXFRM), | 2572 | IN_DEV_CONF_GET(in_dev, NOXFRM)); |
| 1837 | do_cache); | ||
| 1838 | if (!rth) | 2573 | if (!rth) |
| 1839 | return ERR_PTR(-ENOBUFS); | 2574 | return ERR_PTR(-ENOBUFS); |
| 1840 | 2575 | ||
| 1841 | rth->dst.output = ip_output; | 2576 | rth->dst.output = ip_output; |
| 1842 | 2577 | ||
| 2578 | rth->rt_key_dst = orig_daddr; | ||
| 2579 | rth->rt_key_src = orig_saddr; | ||
| 1843 | rth->rt_genid = rt_genid(dev_net(dev_out)); | 2580 | rth->rt_genid = rt_genid(dev_net(dev_out)); |
| 1844 | rth->rt_flags = flags; | 2581 | rth->rt_flags = flags; |
| 1845 | rth->rt_type = type; | 2582 | rth->rt_type = type; |
| 1846 | rth->rt_is_input = 0; | 2583 | rth->rt_key_tos = orig_rtos; |
| 1847 | rth->rt_iif = orig_oif ? : 0; | 2584 | rth->rt_dst = fl4->daddr; |
| 1848 | rth->rt_pmtu = 0; | 2585 | rth->rt_src = fl4->saddr; |
| 1849 | rth->rt_gateway = 0; | 2586 | rth->rt_route_iif = 0; |
| 1850 | rth->rt_uses_gateway = 0; | 2587 | rth->rt_iif = orig_oif ? : dev_out->ifindex; |
| 1851 | INIT_LIST_HEAD(&rth->rt_uncached); | 2588 | rth->rt_oif = orig_oif; |
| 2589 | rth->rt_mark = fl4->flowi4_mark; | ||
| 2590 | rth->rt_gateway = fl4->daddr; | ||
| 2591 | rth->rt_spec_dst= fl4->saddr; | ||
| 2592 | rth->rt_peer_genid = 0; | ||
| 2593 | rth->peer = NULL; | ||
| 2594 | rth->fi = NULL; | ||
| 1852 | 2595 | ||
| 1853 | RT_CACHE_STAT_INC(out_slow_tot); | 2596 | RT_CACHE_STAT_INC(out_slow_tot); |
| 1854 | 2597 | ||
| 1855 | if (flags & RTCF_LOCAL) | 2598 | if (flags & RTCF_LOCAL) { |
| 1856 | rth->dst.input = ip_local_deliver; | 2599 | rth->dst.input = ip_local_deliver; |
| 2600 | rth->rt_spec_dst = fl4->daddr; | ||
| 2601 | } | ||
| 1857 | if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { | 2602 | if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { |
| 2603 | rth->rt_spec_dst = fl4->saddr; | ||
| 1858 | if (flags & RTCF_LOCAL && | 2604 | if (flags & RTCF_LOCAL && |
| 1859 | !(dev_out->flags & IFF_LOOPBACK)) { | 2605 | !(dev_out->flags & IFF_LOOPBACK)) { |
| 1860 | rth->dst.output = ip_mc_output; | 2606 | rth->dst.output = ip_mc_output; |
| @@ -1871,31 +2617,37 @@ add: | |||
| 1871 | #endif | 2617 | #endif |
| 1872 | } | 2618 | } |
| 1873 | 2619 | ||
| 1874 | rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); | 2620 | rt_set_nexthop(rth, fl4, res, fi, type, 0); |
| 1875 | 2621 | ||
| 1876 | return rth; | 2622 | return rth; |
| 1877 | } | 2623 | } |
| 1878 | 2624 | ||
| 1879 | /* | 2625 | /* |
| 1880 | * Major route resolver routine. | 2626 | * Major route resolver routine. |
| 2627 | * called with rcu_read_lock(); | ||
| 1881 | */ | 2628 | */ |
| 1882 | 2629 | ||
| 1883 | struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) | 2630 | static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) |
| 1884 | { | 2631 | { |
| 1885 | struct net_device *dev_out = NULL; | 2632 | struct net_device *dev_out = NULL; |
| 1886 | __u8 tos = RT_FL_TOS(fl4); | 2633 | __u8 tos = RT_FL_TOS(fl4); |
| 1887 | unsigned int flags = 0; | 2634 | unsigned int flags = 0; |
| 1888 | struct fib_result res; | 2635 | struct fib_result res; |
| 1889 | struct rtable *rth; | 2636 | struct rtable *rth; |
| 2637 | __be32 orig_daddr; | ||
| 2638 | __be32 orig_saddr; | ||
| 1890 | int orig_oif; | 2639 | int orig_oif; |
| 1891 | 2640 | ||
| 1892 | res.tclassid = 0; | ||
| 1893 | res.fi = NULL; | 2641 | res.fi = NULL; |
| 1894 | res.table = NULL; | 2642 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
| 2643 | res.r = NULL; | ||
| 2644 | #endif | ||
| 1895 | 2645 | ||
| 2646 | orig_daddr = fl4->daddr; | ||
| 2647 | orig_saddr = fl4->saddr; | ||
| 1896 | orig_oif = fl4->flowi4_oif; | 2648 | orig_oif = fl4->flowi4_oif; |
| 1897 | 2649 | ||
| 1898 | fl4->flowi4_iif = LOOPBACK_IFINDEX; | 2650 | fl4->flowi4_iif = net->loopback_dev->ifindex; |
| 1899 | fl4->flowi4_tos = tos & IPTOS_RT_MASK; | 2651 | fl4->flowi4_tos = tos & IPTOS_RT_MASK; |
| 1900 | fl4->flowi4_scope = ((tos & RTO_ONLINK) ? | 2652 | fl4->flowi4_scope = ((tos & RTO_ONLINK) ? |
| 1901 | RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); | 2653 | RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); |
| @@ -1984,7 +2736,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) | |||
| 1984 | if (!fl4->daddr) | 2736 | if (!fl4->daddr) |
| 1985 | fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); | 2737 | fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); |
| 1986 | dev_out = net->loopback_dev; | 2738 | dev_out = net->loopback_dev; |
| 1987 | fl4->flowi4_oif = LOOPBACK_IFINDEX; | 2739 | fl4->flowi4_oif = net->loopback_dev->ifindex; |
| 1988 | res.type = RTN_LOCAL; | 2740 | res.type = RTN_LOCAL; |
| 1989 | flags |= RTCF_LOCAL; | 2741 | flags |= RTCF_LOCAL; |
| 1990 | goto make_route; | 2742 | goto make_route; |
| @@ -1992,7 +2744,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) | |||
| 1992 | 2744 | ||
| 1993 | if (fib_lookup(net, fl4, &res)) { | 2745 | if (fib_lookup(net, fl4, &res)) { |
| 1994 | res.fi = NULL; | 2746 | res.fi = NULL; |
| 1995 | res.table = NULL; | ||
| 1996 | if (fl4->flowi4_oif) { | 2747 | if (fl4->flowi4_oif) { |
| 1997 | /* Apparently, routing tables are wrong. Assume, | 2748 | /* Apparently, routing tables are wrong. Assume, |
| 1998 | that the destination is on link. | 2749 | that the destination is on link. |
| @@ -2031,6 +2782,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) | |||
| 2031 | } | 2782 | } |
| 2032 | dev_out = net->loopback_dev; | 2783 | dev_out = net->loopback_dev; |
| 2033 | fl4->flowi4_oif = dev_out->ifindex; | 2784 | fl4->flowi4_oif = dev_out->ifindex; |
| 2785 | res.fi = NULL; | ||
| 2034 | flags |= RTCF_LOCAL; | 2786 | flags |= RTCF_LOCAL; |
| 2035 | goto make_route; | 2787 | goto make_route; |
| 2036 | } | 2788 | } |
| @@ -2053,33 +2805,73 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) | |||
| 2053 | 2805 | ||
| 2054 | 2806 | ||
| 2055 | make_route: | 2807 | make_route: |
| 2056 | rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); | 2808 | rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, |
| 2809 | tos, dev_out, flags); | ||
| 2810 | if (!IS_ERR(rth)) { | ||
| 2811 | unsigned int hash; | ||
| 2812 | |||
| 2813 | hash = rt_hash(orig_daddr, orig_saddr, orig_oif, | ||
| 2814 | rt_genid(dev_net(dev_out))); | ||
| 2815 | rth = rt_intern_hash(hash, rth, NULL, orig_oif); | ||
| 2816 | } | ||
| 2057 | 2817 | ||
| 2058 | out: | 2818 | out: |
| 2059 | rcu_read_unlock(); | 2819 | rcu_read_unlock(); |
| 2060 | return rth; | 2820 | return rth; |
| 2061 | } | 2821 | } |
| 2062 | EXPORT_SYMBOL_GPL(__ip_route_output_key); | ||
| 2063 | 2822 | ||
| 2064 | static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) | 2823 | struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) |
| 2065 | { | 2824 | { |
| 2066 | return NULL; | 2825 | struct rtable *rth; |
| 2826 | unsigned int hash; | ||
| 2827 | |||
| 2828 | if (!rt_caching(net)) | ||
| 2829 | goto slow_output; | ||
| 2830 | |||
| 2831 | hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); | ||
| 2832 | |||
| 2833 | rcu_read_lock_bh(); | ||
| 2834 | for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; | ||
| 2835 | rth = rcu_dereference_bh(rth->dst.rt_next)) { | ||
| 2836 | if (rth->rt_key_dst == flp4->daddr && | ||
| 2837 | rth->rt_key_src == flp4->saddr && | ||
| 2838 | rt_is_output_route(rth) && | ||
| 2839 | rth->rt_oif == flp4->flowi4_oif && | ||
| 2840 | rth->rt_mark == flp4->flowi4_mark && | ||
| 2841 | !((rth->rt_key_tos ^ flp4->flowi4_tos) & | ||
| 2842 | (IPTOS_RT_MASK | RTO_ONLINK)) && | ||
| 2843 | net_eq(dev_net(rth->dst.dev), net) && | ||
| 2844 | !rt_is_expired(rth)) { | ||
| 2845 | ipv4_validate_peer(rth); | ||
| 2846 | dst_use(&rth->dst, jiffies); | ||
| 2847 | RT_CACHE_STAT_INC(out_hit); | ||
| 2848 | rcu_read_unlock_bh(); | ||
| 2849 | if (!flp4->saddr) | ||
| 2850 | flp4->saddr = rth->rt_src; | ||
| 2851 | if (!flp4->daddr) | ||
| 2852 | flp4->daddr = rth->rt_dst; | ||
| 2853 | return rth; | ||
| 2854 | } | ||
| 2855 | RT_CACHE_STAT_INC(out_hlist_search); | ||
| 2856 | } | ||
| 2857 | rcu_read_unlock_bh(); | ||
| 2858 | |||
| 2859 | slow_output: | ||
| 2860 | return ip_route_output_slow(net, flp4); | ||
| 2067 | } | 2861 | } |
| 2862 | EXPORT_SYMBOL_GPL(__ip_route_output_key); | ||
| 2068 | 2863 | ||
| 2069 | static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) | 2864 | static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) |
| 2070 | { | 2865 | { |
| 2071 | unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); | 2866 | return NULL; |
| 2072 | |||
| 2073 | return mtu ? : dst->dev->mtu; | ||
| 2074 | } | 2867 | } |
| 2075 | 2868 | ||
| 2076 | static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, | 2869 | static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) |
| 2077 | struct sk_buff *skb, u32 mtu) | ||
| 2078 | { | 2870 | { |
| 2871 | return 0; | ||
| 2079 | } | 2872 | } |
| 2080 | 2873 | ||
| 2081 | static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, | 2874 | static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) |
| 2082 | struct sk_buff *skb) | ||
| 2083 | { | 2875 | { |
| 2084 | } | 2876 | } |
| 2085 | 2877 | ||
| @@ -2092,43 +2884,53 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, | |||
| 2092 | static struct dst_ops ipv4_dst_blackhole_ops = { | 2884 | static struct dst_ops ipv4_dst_blackhole_ops = { |
| 2093 | .family = AF_INET, | 2885 | .family = AF_INET, |
| 2094 | .protocol = cpu_to_be16(ETH_P_IP), | 2886 | .protocol = cpu_to_be16(ETH_P_IP), |
| 2887 | .destroy = ipv4_dst_destroy, | ||
| 2095 | .check = ipv4_blackhole_dst_check, | 2888 | .check = ipv4_blackhole_dst_check, |
| 2096 | .mtu = ipv4_blackhole_mtu, | 2889 | .default_mtu = ipv4_blackhole_default_mtu, |
| 2097 | .default_advmss = ipv4_default_advmss, | 2890 | .default_advmss = ipv4_default_advmss, |
| 2098 | .update_pmtu = ipv4_rt_blackhole_update_pmtu, | 2891 | .update_pmtu = ipv4_rt_blackhole_update_pmtu, |
| 2099 | .redirect = ipv4_rt_blackhole_redirect, | ||
| 2100 | .cow_metrics = ipv4_rt_blackhole_cow_metrics, | 2892 | .cow_metrics = ipv4_rt_blackhole_cow_metrics, |
| 2101 | .neigh_lookup = ipv4_neigh_lookup, | 2893 | .neigh_lookup = ipv4_neigh_lookup, |
| 2102 | }; | 2894 | }; |
| 2103 | 2895 | ||
| 2104 | struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) | 2896 | struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) |
| 2105 | { | 2897 | { |
| 2898 | struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); | ||
| 2106 | struct rtable *ort = (struct rtable *) dst_orig; | 2899 | struct rtable *ort = (struct rtable *) dst_orig; |
| 2107 | struct rtable *rt; | ||
| 2108 | 2900 | ||
| 2109 | rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); | ||
| 2110 | if (rt) { | 2901 | if (rt) { |
| 2111 | struct dst_entry *new = &rt->dst; | 2902 | struct dst_entry *new = &rt->dst; |
| 2112 | 2903 | ||
| 2113 | new->__use = 1; | 2904 | new->__use = 1; |
| 2114 | new->input = dst_discard; | 2905 | new->input = dst_discard; |
| 2115 | new->output = dst_discard; | 2906 | new->output = dst_discard; |
| 2907 | dst_copy_metrics(new, &ort->dst); | ||
| 2116 | 2908 | ||
| 2117 | new->dev = ort->dst.dev; | 2909 | new->dev = ort->dst.dev; |
| 2118 | if (new->dev) | 2910 | if (new->dev) |
| 2119 | dev_hold(new->dev); | 2911 | dev_hold(new->dev); |
| 2120 | 2912 | ||
| 2121 | rt->rt_is_input = ort->rt_is_input; | 2913 | rt->rt_key_dst = ort->rt_key_dst; |
| 2914 | rt->rt_key_src = ort->rt_key_src; | ||
| 2915 | rt->rt_key_tos = ort->rt_key_tos; | ||
| 2916 | rt->rt_route_iif = ort->rt_route_iif; | ||
| 2122 | rt->rt_iif = ort->rt_iif; | 2917 | rt->rt_iif = ort->rt_iif; |
| 2123 | rt->rt_pmtu = ort->rt_pmtu; | 2918 | rt->rt_oif = ort->rt_oif; |
| 2919 | rt->rt_mark = ort->rt_mark; | ||
| 2124 | 2920 | ||
| 2125 | rt->rt_genid = rt_genid(net); | 2921 | rt->rt_genid = rt_genid(net); |
| 2126 | rt->rt_flags = ort->rt_flags; | 2922 | rt->rt_flags = ort->rt_flags; |
| 2127 | rt->rt_type = ort->rt_type; | 2923 | rt->rt_type = ort->rt_type; |
| 2924 | rt->rt_dst = ort->rt_dst; | ||
| 2925 | rt->rt_src = ort->rt_src; | ||
| 2128 | rt->rt_gateway = ort->rt_gateway; | 2926 | rt->rt_gateway = ort->rt_gateway; |
| 2129 | rt->rt_uses_gateway = ort->rt_uses_gateway; | 2927 | rt->rt_spec_dst = ort->rt_spec_dst; |
| 2130 | 2928 | rt->peer = ort->peer; | |
| 2131 | INIT_LIST_HEAD(&rt->rt_uncached); | 2929 | if (rt->peer) |
| 2930 | atomic_inc(&rt->peer->refcnt); | ||
| 2931 | rt->fi = ort->fi; | ||
| 2932 | if (rt->fi) | ||
| 2933 | atomic_inc(&rt->fi->fib_clntref); | ||
| 2132 | 2934 | ||
| 2133 | dst_free(new); | 2935 | dst_free(new); |
| 2134 | } | 2936 | } |
| @@ -2155,18 +2957,18 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, | |||
| 2155 | } | 2957 | } |
| 2156 | EXPORT_SYMBOL_GPL(ip_route_output_flow); | 2958 | EXPORT_SYMBOL_GPL(ip_route_output_flow); |
| 2157 | 2959 | ||
| 2158 | static int rt_fill_info(struct net *net, __be32 dst, __be32 src, | 2960 | static int rt_fill_info(struct net *net, |
| 2159 | struct flowi4 *fl4, struct sk_buff *skb, u32 portid, | 2961 | struct sk_buff *skb, u32 pid, u32 seq, int event, |
| 2160 | u32 seq, int event, int nowait, unsigned int flags) | 2962 | int nowait, unsigned int flags) |
| 2161 | { | 2963 | { |
| 2162 | struct rtable *rt = skb_rtable(skb); | 2964 | struct rtable *rt = skb_rtable(skb); |
| 2163 | struct rtmsg *r; | 2965 | struct rtmsg *r; |
| 2164 | struct nlmsghdr *nlh; | 2966 | struct nlmsghdr *nlh; |
| 2165 | unsigned long expires = 0; | 2967 | long expires = 0; |
| 2166 | u32 error; | 2968 | const struct inet_peer *peer = rt->peer; |
| 2167 | u32 metrics[RTAX_MAX]; | 2969 | u32 id = 0, ts = 0, tsage = 0, error; |
| 2168 | 2970 | ||
| 2169 | nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); | 2971 | nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); |
| 2170 | if (nlh == NULL) | 2972 | if (nlh == NULL) |
| 2171 | return -EMSGSIZE; | 2973 | return -EMSGSIZE; |
| 2172 | 2974 | ||
| @@ -2174,10 +2976,9 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, | |||
| 2174 | r->rtm_family = AF_INET; | 2976 | r->rtm_family = AF_INET; |
| 2175 | r->rtm_dst_len = 32; | 2977 | r->rtm_dst_len = 32; |
| 2176 | r->rtm_src_len = 0; | 2978 | r->rtm_src_len = 0; |
| 2177 | r->rtm_tos = fl4->flowi4_tos; | 2979 | r->rtm_tos = rt->rt_key_tos; |
| 2178 | r->rtm_table = RT_TABLE_MAIN; | 2980 | r->rtm_table = RT_TABLE_MAIN; |
| 2179 | if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) | 2981 | NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); |
| 2180 | goto nla_put_failure; | ||
| 2181 | r->rtm_type = rt->rt_type; | 2982 | r->rtm_type = rt->rt_type; |
| 2182 | r->rtm_scope = RT_SCOPE_UNIVERSE; | 2983 | r->rtm_scope = RT_SCOPE_UNIVERSE; |
| 2183 | r->rtm_protocol = RTPROT_UNSPEC; | 2984 | r->rtm_protocol = RTPROT_UNSPEC; |
| @@ -2185,58 +2986,53 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, | |||
| 2185 | if (rt->rt_flags & RTCF_NOTIFY) | 2986 | if (rt->rt_flags & RTCF_NOTIFY) |
| 2186 | r->rtm_flags |= RTM_F_NOTIFY; | 2987 | r->rtm_flags |= RTM_F_NOTIFY; |
| 2187 | 2988 | ||
| 2188 | if (nla_put_be32(skb, RTA_DST, dst)) | 2989 | NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); |
| 2189 | goto nla_put_failure; | 2990 | |
| 2190 | if (src) { | 2991 | if (rt->rt_key_src) { |
| 2191 | r->rtm_src_len = 32; | 2992 | r->rtm_src_len = 32; |
| 2192 | if (nla_put_be32(skb, RTA_SRC, src)) | 2993 | NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); |
| 2193 | goto nla_put_failure; | ||
| 2194 | } | 2994 | } |
| 2195 | if (rt->dst.dev && | 2995 | if (rt->dst.dev) |
| 2196 | nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) | 2996 | NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); |
| 2197 | goto nla_put_failure; | ||
| 2198 | #ifdef CONFIG_IP_ROUTE_CLASSID | 2997 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 2199 | if (rt->dst.tclassid && | 2998 | if (rt->dst.tclassid) |
| 2200 | nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) | 2999 | NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); |
| 2201 | goto nla_put_failure; | ||
| 2202 | #endif | 3000 | #endif |
| 2203 | if (!rt_is_input_route(rt) && | 3001 | if (rt_is_input_route(rt)) |
| 2204 | fl4->saddr != src) { | 3002 | NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); |
| 2205 | if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) | 3003 | else if (rt->rt_src != rt->rt_key_src) |
| 2206 | goto nla_put_failure; | 3004 | NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); |
| 2207 | } | ||
| 2208 | if (rt->rt_uses_gateway && | ||
| 2209 | nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) | ||
| 2210 | goto nla_put_failure; | ||
| 2211 | |||
| 2212 | expires = rt->dst.expires; | ||
| 2213 | if (expires) { | ||
| 2214 | unsigned long now = jiffies; | ||
| 2215 | 3005 | ||
| 2216 | if (time_before(now, expires)) | 3006 | if (rt->rt_dst != rt->rt_gateway) |
| 2217 | expires -= now; | 3007 | NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); |
| 2218 | else | ||
| 2219 | expires = 0; | ||
| 2220 | } | ||
| 2221 | 3008 | ||
| 2222 | memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); | 3009 | if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) |
| 2223 | if (rt->rt_pmtu && expires) | ||
| 2224 | metrics[RTAX_MTU - 1] = rt->rt_pmtu; | ||
| 2225 | if (rtnetlink_put_metrics(skb, metrics) < 0) | ||
| 2226 | goto nla_put_failure; | 3010 | goto nla_put_failure; |
| 2227 | 3011 | ||
| 2228 | if (fl4->flowi4_mark && | 3012 | if (rt->rt_mark) |
| 2229 | nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) | 3013 | NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); |
| 2230 | goto nla_put_failure; | ||
| 2231 | 3014 | ||
| 2232 | error = rt->dst.error; | 3015 | error = rt->dst.error; |
| 3016 | if (peer) { | ||
| 3017 | inet_peer_refcheck(rt->peer); | ||
| 3018 | id = atomic_read(&peer->ip_id_count) & 0xffff; | ||
| 3019 | if (peer->tcp_ts_stamp) { | ||
| 3020 | ts = peer->tcp_ts; | ||
| 3021 | tsage = get_seconds() - peer->tcp_ts_stamp; | ||
| 3022 | } | ||
| 3023 | expires = ACCESS_ONCE(peer->pmtu_expires); | ||
| 3024 | if (expires) | ||
| 3025 | expires -= jiffies; | ||
| 3026 | } | ||
| 2233 | 3027 | ||
| 2234 | if (rt_is_input_route(rt)) { | 3028 | if (rt_is_input_route(rt)) { |
| 2235 | #ifdef CONFIG_IP_MROUTE | 3029 | #ifdef CONFIG_IP_MROUTE |
| 3030 | __be32 dst = rt->rt_dst; | ||
| 3031 | |||
| 2236 | if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && | 3032 | if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && |
| 2237 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { | 3033 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { |
| 2238 | int err = ipmr_get_route(net, skb, | 3034 | int err = ipmr_get_route(net, skb, |
| 2239 | fl4->saddr, fl4->daddr, | 3035 | rt->rt_src, rt->rt_dst, |
| 2240 | r, nowait); | 3036 | r, nowait); |
| 2241 | if (err <= 0) { | 3037 | if (err <= 0) { |
| 2242 | if (!nowait) { | 3038 | if (!nowait) { |
| @@ -2251,11 +3047,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, | |||
| 2251 | } | 3047 | } |
| 2252 | } else | 3048 | } else |
| 2253 | #endif | 3049 | #endif |
| 2254 | if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) | 3050 | NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); |
| 2255 | goto nla_put_failure; | ||
| 2256 | } | 3051 | } |
| 2257 | 3052 | ||
| 2258 | if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) | 3053 | if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, |
| 3054 | expires, error) < 0) | ||
| 2259 | goto nla_put_failure; | 3055 | goto nla_put_failure; |
| 2260 | 3056 | ||
| 2261 | return nlmsg_end(skb, nlh); | 3057 | return nlmsg_end(skb, nlh); |
| @@ -2265,13 +3061,12 @@ nla_put_failure: | |||
| 2265 | return -EMSGSIZE; | 3061 | return -EMSGSIZE; |
| 2266 | } | 3062 | } |
| 2267 | 3063 | ||
| 2268 | static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) | 3064 | static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) |
| 2269 | { | 3065 | { |
| 2270 | struct net *net = sock_net(in_skb->sk); | 3066 | struct net *net = sock_net(in_skb->sk); |
| 2271 | struct rtmsg *rtm; | 3067 | struct rtmsg *rtm; |
| 2272 | struct nlattr *tb[RTA_MAX+1]; | 3068 | struct nlattr *tb[RTA_MAX+1]; |
| 2273 | struct rtable *rt = NULL; | 3069 | struct rtable *rt = NULL; |
| 2274 | struct flowi4 fl4; | ||
| 2275 | __be32 dst = 0; | 3070 | __be32 dst = 0; |
| 2276 | __be32 src = 0; | 3071 | __be32 src = 0; |
| 2277 | u32 iif; | 3072 | u32 iif; |
| @@ -2306,13 +3101,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void | |||
| 2306 | iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; | 3101 | iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; |
| 2307 | mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; | 3102 | mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; |
| 2308 | 3103 | ||
| 2309 | memset(&fl4, 0, sizeof(fl4)); | ||
| 2310 | fl4.daddr = dst; | ||
| 2311 | fl4.saddr = src; | ||
| 2312 | fl4.flowi4_tos = rtm->rtm_tos; | ||
| 2313 | fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; | ||
| 2314 | fl4.flowi4_mark = mark; | ||
| 2315 | |||
| 2316 | if (iif) { | 3104 | if (iif) { |
| 2317 | struct net_device *dev; | 3105 | struct net_device *dev; |
| 2318 | 3106 | ||
| @@ -2333,6 +3121,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void | |||
| 2333 | if (err == 0 && rt->dst.error) | 3121 | if (err == 0 && rt->dst.error) |
| 2334 | err = -rt->dst.error; | 3122 | err = -rt->dst.error; |
| 2335 | } else { | 3123 | } else { |
| 3124 | struct flowi4 fl4 = { | ||
| 3125 | .daddr = dst, | ||
| 3126 | .saddr = src, | ||
| 3127 | .flowi4_tos = rtm->rtm_tos, | ||
| 3128 | .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, | ||
| 3129 | .flowi4_mark = mark, | ||
| 3130 | }; | ||
| 2336 | rt = ip_route_output_key(net, &fl4); | 3131 | rt = ip_route_output_key(net, &fl4); |
| 2337 | 3132 | ||
| 2338 | err = 0; | 3133 | err = 0; |
| @@ -2347,13 +3142,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void | |||
| 2347 | if (rtm->rtm_flags & RTM_F_NOTIFY) | 3142 | if (rtm->rtm_flags & RTM_F_NOTIFY) |
| 2348 | rt->rt_flags |= RTCF_NOTIFY; | 3143 | rt->rt_flags |= RTCF_NOTIFY; |
| 2349 | 3144 | ||
| 2350 | err = rt_fill_info(net, dst, src, &fl4, skb, | 3145 | err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, |
| 2351 | NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, | ||
| 2352 | RTM_NEWROUTE, 0, 0); | 3146 | RTM_NEWROUTE, 0, 0); |
| 2353 | if (err <= 0) | 3147 | if (err <= 0) |
| 2354 | goto errout_free; | 3148 | goto errout_free; |
| 2355 | 3149 | ||
| 2356 | err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); | 3150 | err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); |
| 2357 | errout: | 3151 | errout: |
| 2358 | return err; | 3152 | return err; |
| 2359 | 3153 | ||
| @@ -2364,12 +3158,49 @@ errout_free: | |||
| 2364 | 3158 | ||
| 2365 | int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) | 3159 | int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) |
| 2366 | { | 3160 | { |
| 3161 | struct rtable *rt; | ||
| 3162 | int h, s_h; | ||
| 3163 | int idx, s_idx; | ||
| 3164 | struct net *net; | ||
| 3165 | |||
| 3166 | net = sock_net(skb->sk); | ||
| 3167 | |||
| 3168 | s_h = cb->args[0]; | ||
| 3169 | if (s_h < 0) | ||
| 3170 | s_h = 0; | ||
| 3171 | s_idx = idx = cb->args[1]; | ||
| 3172 | for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { | ||
| 3173 | if (!rt_hash_table[h].chain) | ||
| 3174 | continue; | ||
| 3175 | rcu_read_lock_bh(); | ||
| 3176 | for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; | ||
| 3177 | rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { | ||
| 3178 | if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) | ||
| 3179 | continue; | ||
| 3180 | if (rt_is_expired(rt)) | ||
| 3181 | continue; | ||
| 3182 | skb_dst_set_noref(skb, &rt->dst); | ||
| 3183 | if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, | ||
| 3184 | cb->nlh->nlmsg_seq, RTM_NEWROUTE, | ||
| 3185 | 1, NLM_F_MULTI) <= 0) { | ||
| 3186 | skb_dst_drop(skb); | ||
| 3187 | rcu_read_unlock_bh(); | ||
| 3188 | goto done; | ||
| 3189 | } | ||
| 3190 | skb_dst_drop(skb); | ||
| 3191 | } | ||
| 3192 | rcu_read_unlock_bh(); | ||
| 3193 | } | ||
| 3194 | |||
| 3195 | done: | ||
| 3196 | cb->args[0] = h; | ||
| 3197 | cb->args[1] = idx; | ||
| 2367 | return skb->len; | 3198 | return skb->len; |
| 2368 | } | 3199 | } |
| 2369 | 3200 | ||
| 2370 | void ip_rt_multicast_event(struct in_device *in_dev) | 3201 | void ip_rt_multicast_event(struct in_device *in_dev) |
| 2371 | { | 3202 | { |
| 2372 | rt_cache_flush(dev_net(in_dev->dev)); | 3203 | rt_cache_flush(dev_net(in_dev->dev), 0); |
| 2373 | } | 3204 | } |
| 2374 | 3205 | ||
| 2375 | #ifdef CONFIG_SYSCTL | 3206 | #ifdef CONFIG_SYSCTL |
| @@ -2378,7 +3209,16 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, | |||
| 2378 | size_t *lenp, loff_t *ppos) | 3209 | size_t *lenp, loff_t *ppos) |
| 2379 | { | 3210 | { |
| 2380 | if (write) { | 3211 | if (write) { |
| 2381 | rt_cache_flush((struct net *)__ctl->extra1); | 3212 | int flush_delay; |
| 3213 | ctl_table ctl; | ||
| 3214 | struct net *net; | ||
| 3215 | |||
| 3216 | memcpy(&ctl, __ctl, sizeof(ctl)); | ||
| 3217 | ctl.data = &flush_delay; | ||
| 3218 | proc_dointvec(&ctl, write, buffer, lenp, ppos); | ||
| 3219 | |||
| 3220 | net = (struct net *)__ctl->extra1; | ||
| 3221 | rt_cache_flush(net, flush_delay); | ||
| 2382 | return 0; | 3222 | return 0; |
| 2383 | } | 3223 | } |
| 2384 | 3224 | ||
| @@ -2431,6 +3271,13 @@ static ctl_table ipv4_route_table[] = { | |||
| 2431 | .proc_handler = proc_dointvec_jiffies, | 3271 | .proc_handler = proc_dointvec_jiffies, |
| 2432 | }, | 3272 | }, |
| 2433 | { | 3273 | { |
| 3274 | .procname = "gc_interval", | ||
| 3275 | .data = &ip_rt_gc_interval, | ||
| 3276 | .maxlen = sizeof(int), | ||
| 3277 | .mode = 0644, | ||
| 3278 | .proc_handler = proc_dointvec_jiffies, | ||
| 3279 | }, | ||
| 3280 | { | ||
| 2434 | .procname = "redirect_load", | 3281 | .procname = "redirect_load", |
| 2435 | .data = &ip_rt_redirect_load, | 3282 | .data = &ip_rt_redirect_load, |
| 2436 | .maxlen = sizeof(int), | 3283 | .maxlen = sizeof(int), |
| @@ -2496,6 +3343,23 @@ static ctl_table ipv4_route_table[] = { | |||
| 2496 | { } | 3343 | { } |
| 2497 | }; | 3344 | }; |
| 2498 | 3345 | ||
| 3346 | static struct ctl_table empty[1]; | ||
| 3347 | |||
| 3348 | static struct ctl_table ipv4_skeleton[] = | ||
| 3349 | { | ||
| 3350 | { .procname = "route", | ||
| 3351 | .mode = 0555, .child = ipv4_route_table}, | ||
| 3352 | { .procname = "neigh", | ||
| 3353 | .mode = 0555, .child = empty}, | ||
| 3354 | { } | ||
| 3355 | }; | ||
| 3356 | |||
| 3357 | static __net_initdata struct ctl_path ipv4_path[] = { | ||
| 3358 | { .procname = "net", }, | ||
| 3359 | { .procname = "ipv4", }, | ||
| 3360 | { }, | ||
| 3361 | }; | ||
| 3362 | |||
| 2499 | static struct ctl_table ipv4_route_flush_table[] = { | 3363 | static struct ctl_table ipv4_route_flush_table[] = { |
| 2500 | { | 3364 | { |
| 2501 | .procname = "flush", | 3365 | .procname = "flush", |
| @@ -2506,6 +3370,13 @@ static struct ctl_table ipv4_route_flush_table[] = { | |||
| 2506 | { }, | 3370 | { }, |
| 2507 | }; | 3371 | }; |
| 2508 | 3372 | ||
| 3373 | static __net_initdata struct ctl_path ipv4_route_path[] = { | ||
| 3374 | { .procname = "net", }, | ||
| 3375 | { .procname = "ipv4", }, | ||
| 3376 | { .procname = "route", }, | ||
| 3377 | { }, | ||
| 3378 | }; | ||
| 3379 | |||
| 2509 | static __net_init int sysctl_route_net_init(struct net *net) | 3380 | static __net_init int sysctl_route_net_init(struct net *net) |
| 2510 | { | 3381 | { |
| 2511 | struct ctl_table *tbl; | 3382 | struct ctl_table *tbl; |
| @@ -2515,14 +3386,11 @@ static __net_init int sysctl_route_net_init(struct net *net) | |||
| 2515 | tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); | 3386 | tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); |
| 2516 | if (tbl == NULL) | 3387 | if (tbl == NULL) |
| 2517 | goto err_dup; | 3388 | goto err_dup; |
| 2518 | |||
| 2519 | /* Don't export sysctls to unprivileged users */ | ||
| 2520 | if (net->user_ns != &init_user_ns) | ||
| 2521 | tbl[0].procname = NULL; | ||
| 2522 | } | 3389 | } |
| 2523 | tbl[0].extra1 = net; | 3390 | tbl[0].extra1 = net; |
| 2524 | 3391 | ||
| 2525 | net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); | 3392 | net->ipv4.route_hdr = |
| 3393 | register_net_sysctl_table(net, ipv4_route_path, tbl); | ||
| 2526 | if (net->ipv4.route_hdr == NULL) | 3394 | if (net->ipv4.route_hdr == NULL) |
| 2527 | goto err_reg; | 3395 | goto err_reg; |
| 2528 | return 0; | 3396 | return 0; |
| @@ -2552,7 +3420,8 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { | |||
| 2552 | 3420 | ||
| 2553 | static __net_init int rt_genid_init(struct net *net) | 3421 | static __net_init int rt_genid_init(struct net *net) |
| 2554 | { | 3422 | { |
| 2555 | atomic_set(&net->rt_genid, 0); | 3423 | get_random_bytes(&net->ipv4.rt_genid, |
| 3424 | sizeof(net->ipv4.rt_genid)); | ||
| 2556 | get_random_bytes(&net->ipv4.dev_addr_genid, | 3425 | get_random_bytes(&net->ipv4.dev_addr_genid, |
| 2557 | sizeof(net->ipv4.dev_addr_genid)); | 3426 | sizeof(net->ipv4.dev_addr_genid)); |
| 2558 | return 0; | 3427 | return 0; |
| @@ -2562,35 +3431,21 @@ static __net_initdata struct pernet_operations rt_genid_ops = { | |||
| 2562 | .init = rt_genid_init, | 3431 | .init = rt_genid_init, |
| 2563 | }; | 3432 | }; |
| 2564 | 3433 | ||
| 2565 | static int __net_init ipv4_inetpeer_init(struct net *net) | ||
| 2566 | { | ||
| 2567 | struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); | ||
| 2568 | |||
| 2569 | if (!bp) | ||
| 2570 | return -ENOMEM; | ||
| 2571 | inet_peer_base_init(bp); | ||
| 2572 | net->ipv4.peers = bp; | ||
| 2573 | return 0; | ||
| 2574 | } | ||
| 2575 | |||
| 2576 | static void __net_exit ipv4_inetpeer_exit(struct net *net) | ||
| 2577 | { | ||
| 2578 | struct inet_peer_base *bp = net->ipv4.peers; | ||
| 2579 | |||
| 2580 | net->ipv4.peers = NULL; | ||
| 2581 | inetpeer_invalidate_tree(bp); | ||
| 2582 | kfree(bp); | ||
| 2583 | } | ||
| 2584 | |||
| 2585 | static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { | ||
| 2586 | .init = ipv4_inetpeer_init, | ||
| 2587 | .exit = ipv4_inetpeer_exit, | ||
| 2588 | }; | ||
| 2589 | 3434 | ||
| 2590 | #ifdef CONFIG_IP_ROUTE_CLASSID | 3435 | #ifdef CONFIG_IP_ROUTE_CLASSID |
| 2591 | struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; | 3436 | struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; |
| 2592 | #endif /* CONFIG_IP_ROUTE_CLASSID */ | 3437 | #endif /* CONFIG_IP_ROUTE_CLASSID */ |
| 2593 | 3438 | ||
| 3439 | static __initdata unsigned long rhash_entries; | ||
| 3440 | static int __init set_rhash_entries(char *str) | ||
| 3441 | { | ||
| 3442 | if (!str) | ||
| 3443 | return 0; | ||
| 3444 | rhash_entries = simple_strtoul(str, &str, 0); | ||
| 3445 | return 1; | ||
| 3446 | } | ||
| 3447 | __setup("rhash_entries=", set_rhash_entries); | ||
| 3448 | |||
| 2594 | int __init ip_rt_init(void) | 3449 | int __init ip_rt_init(void) |
| 2595 | { | 3450 | { |
| 2596 | int rc = 0; | 3451 | int rc = 0; |
| @@ -2613,17 +3468,35 @@ int __init ip_rt_init(void) | |||
| 2613 | if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) | 3468 | if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) |
| 2614 | panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); | 3469 | panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); |
| 2615 | 3470 | ||
| 2616 | ipv4_dst_ops.gc_thresh = ~0; | 3471 | rt_hash_table = (struct rt_hash_bucket *) |
| 2617 | ip_rt_max_size = INT_MAX; | 3472 | alloc_large_system_hash("IP route cache", |
| 3473 | sizeof(struct rt_hash_bucket), | ||
| 3474 | rhash_entries, | ||
| 3475 | (totalram_pages >= 128 * 1024) ? | ||
| 3476 | 15 : 17, | ||
| 3477 | 0, | ||
| 3478 | &rt_hash_log, | ||
| 3479 | &rt_hash_mask, | ||
| 3480 | rhash_entries ? 0 : 512 * 1024); | ||
| 3481 | memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); | ||
| 3482 | rt_hash_lock_init(); | ||
| 3483 | |||
| 3484 | ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); | ||
| 3485 | ip_rt_max_size = (rt_hash_mask + 1) * 16; | ||
| 2618 | 3486 | ||
| 2619 | devinet_init(); | 3487 | devinet_init(); |
| 2620 | ip_fib_init(); | 3488 | ip_fib_init(); |
| 2621 | 3489 | ||
| 3490 | INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); | ||
| 3491 | expires_ljiffies = jiffies; | ||
| 3492 | schedule_delayed_work(&expires_work, | ||
| 3493 | net_random() % ip_rt_gc_interval + ip_rt_gc_interval); | ||
| 3494 | |||
| 2622 | if (ip_rt_proc_init()) | 3495 | if (ip_rt_proc_init()) |
| 2623 | pr_err("Unable to create route proc files\n"); | 3496 | printk(KERN_ERR "Unable to create route proc files\n"); |
| 2624 | #ifdef CONFIG_XFRM | 3497 | #ifdef CONFIG_XFRM |
| 2625 | xfrm_init(); | 3498 | xfrm_init(); |
| 2626 | xfrm4_init(); | 3499 | xfrm4_init(ip_rt_max_size); |
| 2627 | #endif | 3500 | #endif |
| 2628 | rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); | 3501 | rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); |
| 2629 | 3502 | ||
| @@ -2631,7 +3504,6 @@ int __init ip_rt_init(void) | |||
| 2631 | register_pernet_subsys(&sysctl_route_ops); | 3504 | register_pernet_subsys(&sysctl_route_ops); |
| 2632 | #endif | 3505 | #endif |
| 2633 | register_pernet_subsys(&rt_genid_ops); | 3506 | register_pernet_subsys(&rt_genid_ops); |
| 2634 | register_pernet_subsys(&ipv4_inetpeer_ops); | ||
| 2635 | return rc; | 3507 | return rc; |
| 2636 | } | 3508 | } |
| 2637 | 3509 | ||
| @@ -2642,6 +3514,6 @@ int __init ip_rt_init(void) | |||
| 2642 | */ | 3514 | */ |
| 2643 | void __init ip_static_sysctl_init(void) | 3515 | void __init ip_static_sysctl_init(void) |
| 2644 | { | 3516 | { |
| 2645 | register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); | 3517 | register_sysctl_paths(ipv4_path, ipv4_skeleton); |
| 2646 | } | 3518 | } |
| 2647 | #endif | 3519 | #endif |
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b236ef04914..3bc5c8f7c71 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | #include <linux/random.h> | 15 | #include <linux/random.h> |
| 16 | #include <linux/cryptohash.h> | 16 | #include <linux/cryptohash.h> |
| 17 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
| 18 | #include <linux/export.h> | ||
| 19 | #include <net/tcp.h> | 18 | #include <net/tcp.h> |
| 20 | #include <net/route.h> | 19 | #include <net/route.h> |
| 21 | 20 | ||
| @@ -245,7 +244,7 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok) | |||
| 245 | if (!sysctl_tcp_timestamps) | 244 | if (!sysctl_tcp_timestamps) |
| 246 | return false; | 245 | return false; |
| 247 | 246 | ||
| 248 | tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; | 247 | tcp_opt->sack_ok = (options >> 4) & 0x1; |
| 249 | *ecn_ok = (options >> 5) & 1; | 248 | *ecn_ok = (options >> 5) & 1; |
| 250 | if (*ecn_ok && !sysctl_tcp_ecn) | 249 | if (*ecn_ok && !sysctl_tcp_ecn) |
| 251 | return false; | 250 | return false; |
| @@ -266,7 +265,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
| 266 | struct ip_options *opt) | 265 | struct ip_options *opt) |
| 267 | { | 266 | { |
| 268 | struct tcp_options_received tcp_opt; | 267 | struct tcp_options_received tcp_opt; |
| 269 | const u8 *hash_location; | 268 | u8 *hash_location; |
| 270 | struct inet_request_sock *ireq; | 269 | struct inet_request_sock *ireq; |
| 271 | struct tcp_request_sock *treq; | 270 | struct tcp_request_sock *treq; |
| 272 | struct tcp_sock *tp = tcp_sk(sk); | 271 | struct tcp_sock *tp = tcp_sk(sk); |
| @@ -278,7 +277,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
| 278 | struct rtable *rt; | 277 | struct rtable *rt; |
| 279 | __u8 rcv_wscale; | 278 | __u8 rcv_wscale; |
| 280 | bool ecn_ok = false; | 279 | bool ecn_ok = false; |
| 281 | struct flowi4 fl4; | ||
| 282 | 280 | ||
| 283 | if (!sysctl_tcp_syncookies || !th->ack || th->rst) | 281 | if (!sysctl_tcp_syncookies || !th->ack || th->rst) |
| 284 | goto out; | 282 | goto out; |
| @@ -293,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
| 293 | 291 | ||
| 294 | /* check for timestamp cookie support */ | 292 | /* check for timestamp cookie support */ |
| 295 | memset(&tcp_opt, 0, sizeof(tcp_opt)); | 293 | memset(&tcp_opt, 0, sizeof(tcp_opt)); |
| 296 | tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); | 294 | tcp_parse_options(skb, &tcp_opt, &hash_location, 0); |
| 297 | 295 | ||
| 298 | if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) | 296 | if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) |
| 299 | goto out; | 297 | goto out; |
| @@ -319,7 +317,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
| 319 | ireq->tstamp_ok = tcp_opt.saw_tstamp; | 317 | ireq->tstamp_ok = tcp_opt.saw_tstamp; |
| 320 | req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; | 318 | req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; |
| 321 | treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; | 319 | treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; |
| 322 | treq->listener = NULL; | ||
| 323 | 320 | ||
| 324 | /* We throwed the options of the initial SYN away, so we hope | 321 | /* We throwed the options of the initial SYN away, so we hope |
| 325 | * the ACK carries the same options again (see RFC1122 4.2.3.8) | 322 | * the ACK carries the same options again (see RFC1122 4.2.3.8) |
| @@ -340,7 +337,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
| 340 | } | 337 | } |
| 341 | 338 | ||
| 342 | req->expires = 0UL; | 339 | req->expires = 0UL; |
| 343 | req->num_retrans = 0; | 340 | req->retrans = 0; |
| 344 | 341 | ||
| 345 | /* | 342 | /* |
| 346 | * We need to lookup the route here to get at the correct | 343 | * We need to lookup the route here to get at the correct |
| @@ -348,16 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
| 348 | * hasn't changed since we received the original syn, but I see | 345 | * hasn't changed since we received the original syn, but I see |
| 349 | * no easy way to do this. | 346 | * no easy way to do this. |
| 350 | */ | 347 | */ |
| 351 | flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), | 348 | { |
| 352 | RT_SCOPE_UNIVERSE, IPPROTO_TCP, | 349 | struct flowi4 fl4; |
| 353 | inet_sk_flowi_flags(sk), | 350 | |
| 354 | (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, | 351 | flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), |
| 355 | ireq->loc_addr, th->source, th->dest); | 352 | RT_SCOPE_UNIVERSE, IPPROTO_TCP, |
| 356 | security_req_classify_flow(req, flowi4_to_flowi(&fl4)); | 353 | inet_sk_flowi_flags(sk), |
| 357 | rt = ip_route_output_key(sock_net(sk), &fl4); | 354 | (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, |
| 358 | if (IS_ERR(rt)) { | 355 | ireq->loc_addr, th->source, th->dest); |
| 359 | reqsk_free(req); | 356 | security_req_classify_flow(req, flowi4_to_flowi(&fl4)); |
| 360 | goto out; | 357 | rt = ip_route_output_key(sock_net(sk), &fl4); |
| 358 | if (IS_ERR(rt)) { | ||
| 359 | reqsk_free(req); | ||
| 360 | goto out; | ||
| 361 | } | ||
| 361 | } | 362 | } |
| 362 | 363 | ||
| 363 | /* Try to redo what tcp_v4_send_synack did. */ | 364 | /* Try to redo what tcp_v4_send_synack did. */ |
| @@ -371,10 +372,5 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
| 371 | ireq->rcv_wscale = rcv_wscale; | 372 | ireq->rcv_wscale = rcv_wscale; |
| 372 | 373 | ||
| 373 | ret = get_cookie_sock(sk, skb, req, &rt->dst); | 374 | ret = get_cookie_sock(sk, skb, req, &rt->dst); |
| 374 | /* ip_queue_xmit() depends on our flow being setup | ||
| 375 | * Normal sockets get it right from inet_csk_route_child_sock() | ||
| 376 | */ | ||
| 377 | if (ret) | ||
| 378 | inet_sk(ret)->cork.fl.u.ip4 = fl4; | ||
| 379 | out: return ret; | 375 | out: return ret; |
| 380 | } | 376 | } |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d84400b6504..69fd7201129 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
| @@ -14,7 +14,6 @@ | |||
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
| 15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 16 | #include <linux/nsproxy.h> | 16 | #include <linux/nsproxy.h> |
| 17 | #include <linux/swap.h> | ||
| 18 | #include <net/snmp.h> | 17 | #include <net/snmp.h> |
| 19 | #include <net/icmp.h> | 18 | #include <net/icmp.h> |
| 20 | #include <net/ip.h> | 19 | #include <net/ip.h> |
| @@ -24,10 +23,8 @@ | |||
| 24 | #include <net/cipso_ipv4.h> | 23 | #include <net/cipso_ipv4.h> |
| 25 | #include <net/inet_frag.h> | 24 | #include <net/inet_frag.h> |
| 26 | #include <net/ping.h> | 25 | #include <net/ping.h> |
| 27 | #include <net/tcp_memcontrol.h> | ||
| 28 | 26 | ||
| 29 | static int zero; | 27 | static int zero; |
| 30 | static int two = 2; | ||
| 31 | static int tcp_retr1_max = 255; | 28 | static int tcp_retr1_max = 255; |
| 32 | static int ip_local_port_range_min[] = { 1, 1 }; | 29 | static int ip_local_port_range_min[] = { 1, 1 }; |
| 33 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 30 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
| @@ -76,10 +73,10 @@ static int ipv4_local_port_range(ctl_table *table, int write, | |||
| 76 | } | 73 | } |
| 77 | 74 | ||
| 78 | 75 | ||
| 79 | static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) | 76 | void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) |
| 80 | { | 77 | { |
| 81 | kgid_t *data = table->data; | 78 | gid_t *data = table->data; |
| 82 | unsigned int seq; | 79 | unsigned seq; |
| 83 | do { | 80 | do { |
| 84 | seq = read_seqbegin(&sysctl_local_ports.lock); | 81 | seq = read_seqbegin(&sysctl_local_ports.lock); |
| 85 | 82 | ||
| @@ -89,12 +86,12 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low | |||
| 89 | } | 86 | } |
| 90 | 87 | ||
| 91 | /* Update system visible IP port range */ | 88 | /* Update system visible IP port range */ |
| 92 | static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) | 89 | static void set_ping_group_range(struct ctl_table *table, int range[2]) |
| 93 | { | 90 | { |
| 94 | kgid_t *data = table->data; | 91 | gid_t *data = table->data; |
| 95 | write_seqlock(&sysctl_local_ports.lock); | 92 | write_seqlock(&sysctl_local_ports.lock); |
| 96 | data[0] = low; | 93 | data[0] = range[0]; |
| 97 | data[1] = high; | 94 | data[1] = range[1]; |
| 98 | write_sequnlock(&sysctl_local_ports.lock); | 95 | write_sequnlock(&sysctl_local_ports.lock); |
| 99 | } | 96 | } |
| 100 | 97 | ||
| @@ -103,33 +100,21 @@ static int ipv4_ping_group_range(ctl_table *table, int write, | |||
| 103 | void __user *buffer, | 100 | void __user *buffer, |
| 104 | size_t *lenp, loff_t *ppos) | 101 | size_t *lenp, loff_t *ppos) |
| 105 | { | 102 | { |
| 106 | struct user_namespace *user_ns = current_user_ns(); | ||
| 107 | int ret; | 103 | int ret; |
| 108 | gid_t urange[2]; | 104 | gid_t range[2]; |
| 109 | kgid_t low, high; | ||
| 110 | ctl_table tmp = { | 105 | ctl_table tmp = { |
| 111 | .data = &urange, | 106 | .data = &range, |
| 112 | .maxlen = sizeof(urange), | 107 | .maxlen = sizeof(range), |
| 113 | .mode = table->mode, | 108 | .mode = table->mode, |
| 114 | .extra1 = &ip_ping_group_range_min, | 109 | .extra1 = &ip_ping_group_range_min, |
| 115 | .extra2 = &ip_ping_group_range_max, | 110 | .extra2 = &ip_ping_group_range_max, |
| 116 | }; | 111 | }; |
| 117 | 112 | ||
| 118 | inet_get_ping_group_range_table(table, &low, &high); | 113 | inet_get_ping_group_range_table(table, range, range + 1); |
| 119 | urange[0] = from_kgid_munged(user_ns, low); | ||
| 120 | urange[1] = from_kgid_munged(user_ns, high); | ||
| 121 | ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); | 114 | ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); |
| 122 | 115 | ||
| 123 | if (write && ret == 0) { | 116 | if (write && ret == 0) |
| 124 | low = make_kgid(user_ns, urange[0]); | 117 | set_ping_group_range(table, range); |
| 125 | high = make_kgid(user_ns, urange[1]); | ||
| 126 | if (!gid_valid(low) || !gid_valid(high) || | ||
| 127 | (urange[1] < urange[0]) || gid_lt(high, low)) { | ||
| 128 | low = make_kgid(&init_user_ns, 1); | ||
| 129 | high = make_kgid(&init_user_ns, 0); | ||
| 130 | } | ||
| 131 | set_ping_group_range(table, low, high); | ||
| 132 | } | ||
| 133 | 118 | ||
| 134 | return ret; | 119 | return ret; |
| 135 | } | 120 | } |
| @@ -189,90 +174,6 @@ static int proc_allowed_congestion_control(ctl_table *ctl, | |||
| 189 | return ret; | 174 | return ret; |
| 190 | } | 175 | } |
| 191 | 176 | ||
| 192 | static int ipv4_tcp_mem(ctl_table *ctl, int write, | ||
| 193 | void __user *buffer, size_t *lenp, | ||
| 194 | loff_t *ppos) | ||
| 195 | { | ||
| 196 | int ret; | ||
| 197 | unsigned long vec[3]; | ||
| 198 | struct net *net = current->nsproxy->net_ns; | ||
| 199 | #ifdef CONFIG_MEMCG_KMEM | ||
| 200 | struct mem_cgroup *memcg; | ||
| 201 | #endif | ||
| 202 | |||
| 203 | ctl_table tmp = { | ||
| 204 | .data = &vec, | ||
| 205 | .maxlen = sizeof(vec), | ||
| 206 | .mode = ctl->mode, | ||
| 207 | }; | ||
| 208 | |||
| 209 | if (!write) { | ||
| 210 | ctl->data = &net->ipv4.sysctl_tcp_mem; | ||
| 211 | return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos); | ||
| 212 | } | ||
| 213 | |||
| 214 | ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); | ||
| 215 | if (ret) | ||
| 216 | return ret; | ||
| 217 | |||
| 218 | #ifdef CONFIG_MEMCG_KMEM | ||
| 219 | rcu_read_lock(); | ||
| 220 | memcg = mem_cgroup_from_task(current); | ||
| 221 | |||
| 222 | tcp_prot_mem(memcg, vec[0], 0); | ||
| 223 | tcp_prot_mem(memcg, vec[1], 1); | ||
| 224 | tcp_prot_mem(memcg, vec[2], 2); | ||
| 225 | rcu_read_unlock(); | ||
| 226 | #endif | ||
| 227 | |||
| 228 | net->ipv4.sysctl_tcp_mem[0] = vec[0]; | ||
| 229 | net->ipv4.sysctl_tcp_mem[1] = vec[1]; | ||
| 230 | net->ipv4.sysctl_tcp_mem[2] = vec[2]; | ||
| 231 | |||
| 232 | return 0; | ||
| 233 | } | ||
| 234 | |||
| 235 | int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer, | ||
| 236 | size_t *lenp, loff_t *ppos) | ||
| 237 | { | ||
| 238 | ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; | ||
| 239 | struct tcp_fastopen_context *ctxt; | ||
| 240 | int ret; | ||
| 241 | u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ | ||
| 242 | |||
| 243 | tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); | ||
| 244 | if (!tbl.data) | ||
| 245 | return -ENOMEM; | ||
| 246 | |||
| 247 | rcu_read_lock(); | ||
| 248 | ctxt = rcu_dereference(tcp_fastopen_ctx); | ||
| 249 | if (ctxt) | ||
| 250 | memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); | ||
| 251 | else | ||
| 252 | memset(user_key, 0, sizeof(user_key)); | ||
| 253 | rcu_read_unlock(); | ||
| 254 | |||
| 255 | snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", | ||
| 256 | user_key[0], user_key[1], user_key[2], user_key[3]); | ||
| 257 | ret = proc_dostring(&tbl, write, buffer, lenp, ppos); | ||
| 258 | |||
| 259 | if (write && ret == 0) { | ||
| 260 | if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1, | ||
| 261 | user_key + 2, user_key + 3) != 4) { | ||
| 262 | ret = -EINVAL; | ||
| 263 | goto bad_key; | ||
| 264 | } | ||
| 265 | tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); | ||
| 266 | } | ||
| 267 | |||
| 268 | bad_key: | ||
| 269 | pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", | ||
| 270 | user_key[0], user_key[1], user_key[2], user_key[3], | ||
| 271 | (char *)tbl.data, ret); | ||
| 272 | kfree(tbl.data); | ||
| 273 | return ret; | ||
| 274 | } | ||
| 275 | |||
| 276 | static struct ctl_table ipv4_table[] = { | 177 | static struct ctl_table ipv4_table[] = { |
| 277 | { | 178 | { |
| 278 | .procname = "tcp_timestamps", | 179 | .procname = "tcp_timestamps", |
| @@ -354,13 +255,6 @@ static struct ctl_table ipv4_table[] = { | |||
| 354 | .proc_handler = proc_dointvec | 255 | .proc_handler = proc_dointvec |
| 355 | }, | 256 | }, |
| 356 | { | 257 | { |
| 357 | .procname = "ip_early_demux", | ||
| 358 | .data = &sysctl_ip_early_demux, | ||
| 359 | .maxlen = sizeof(int), | ||
| 360 | .mode = 0644, | ||
| 361 | .proc_handler = proc_dointvec | ||
| 362 | }, | ||
| 363 | { | ||
| 364 | .procname = "ip_dynaddr", | 258 | .procname = "ip_dynaddr", |
| 365 | .data = &sysctl_ip_dynaddr, | 259 | .data = &sysctl_ip_dynaddr, |
| 366 | .maxlen = sizeof(int), | 260 | .maxlen = sizeof(int), |
| @@ -420,19 +314,6 @@ static struct ctl_table ipv4_table[] = { | |||
| 420 | }, | 314 | }, |
| 421 | #endif | 315 | #endif |
| 422 | { | 316 | { |
| 423 | .procname = "tcp_fastopen", | ||
| 424 | .data = &sysctl_tcp_fastopen, | ||
| 425 | .maxlen = sizeof(int), | ||
| 426 | .mode = 0644, | ||
| 427 | .proc_handler = proc_dointvec, | ||
| 428 | }, | ||
| 429 | { | ||
| 430 | .procname = "tcp_fastopen_key", | ||
| 431 | .mode = 0600, | ||
| 432 | .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), | ||
| 433 | .proc_handler = proc_tcp_fastopen_key, | ||
| 434 | }, | ||
| 435 | { | ||
| 436 | .procname = "tcp_tw_recycle", | 317 | .procname = "tcp_tw_recycle", |
| 437 | .data = &tcp_death_row.sysctl_tw_recycle, | 318 | .data = &tcp_death_row.sysctl_tw_recycle, |
| 438 | .maxlen = sizeof(int), | 319 | .maxlen = sizeof(int), |
| @@ -552,6 +433,13 @@ static struct ctl_table ipv4_table[] = { | |||
| 552 | .proc_handler = proc_dointvec | 433 | .proc_handler = proc_dointvec |
| 553 | }, | 434 | }, |
| 554 | { | 435 | { |
| 436 | .procname = "tcp_mem", | ||
| 437 | .data = &sysctl_tcp_mem, | ||
| 438 | .maxlen = sizeof(sysctl_tcp_mem), | ||
| 439 | .mode = 0644, | ||
| 440 | .proc_handler = proc_doulongvec_minmax | ||
| 441 | }, | ||
| 442 | { | ||
| 555 | .procname = "tcp_wmem", | 443 | .procname = "tcp_wmem", |
| 556 | .data = &sysctl_tcp_wmem, | 444 | .data = &sysctl_tcp_wmem, |
| 557 | .maxlen = sizeof(sysctl_tcp_wmem), | 445 | .maxlen = sizeof(sysctl_tcp_wmem), |
| @@ -664,20 +552,6 @@ static struct ctl_table ipv4_table[] = { | |||
| 664 | .mode = 0644, | 552 | .mode = 0644, |
| 665 | .proc_handler = proc_dointvec | 553 | .proc_handler = proc_dointvec |
| 666 | }, | 554 | }, |
| 667 | { | ||
| 668 | .procname = "tcp_limit_output_bytes", | ||
| 669 | .data = &sysctl_tcp_limit_output_bytes, | ||
| 670 | .maxlen = sizeof(int), | ||
| 671 | .mode = 0644, | ||
| 672 | .proc_handler = proc_dointvec | ||
| 673 | }, | ||
| 674 | { | ||
| 675 | .procname = "tcp_challenge_ack_limit", | ||
| 676 | .data = &sysctl_tcp_challenge_ack_limit, | ||
| 677 | .maxlen = sizeof(int), | ||
| 678 | .mode = 0644, | ||
| 679 | .proc_handler = proc_dointvec | ||
| 680 | }, | ||
| 681 | #ifdef CONFIG_NET_DMA | 555 | #ifdef CONFIG_NET_DMA |
| 682 | { | 556 | { |
| 683 | .procname = "tcp_dma_copybreak", | 557 | .procname = "tcp_dma_copybreak", |
| @@ -765,15 +639,6 @@ static struct ctl_table ipv4_table[] = { | |||
| 765 | .proc_handler = proc_dointvec | 639 | .proc_handler = proc_dointvec |
| 766 | }, | 640 | }, |
| 767 | { | 641 | { |
| 768 | .procname = "tcp_early_retrans", | ||
| 769 | .data = &sysctl_tcp_early_retrans, | ||
| 770 | .maxlen = sizeof(int), | ||
| 771 | .mode = 0644, | ||
| 772 | .proc_handler = proc_dointvec_minmax, | ||
| 773 | .extra1 = &zero, | ||
| 774 | .extra2 = &two, | ||
| 775 | }, | ||
| 776 | { | ||
| 777 | .procname = "udp_mem", | 642 | .procname = "udp_mem", |
| 778 | .data = &sysctl_udp_mem, | 643 | .data = &sysctl_udp_mem, |
| 779 | .maxlen = sizeof(sysctl_udp_mem), | 644 | .maxlen = sizeof(sysctl_udp_mem), |
| @@ -843,21 +708,29 @@ static struct ctl_table ipv4_net_table[] = { | |||
| 843 | .proc_handler = proc_dointvec | 708 | .proc_handler = proc_dointvec |
| 844 | }, | 709 | }, |
| 845 | { | 710 | { |
| 846 | .procname = "ping_group_range", | 711 | .procname = "rt_cache_rebuild_count", |
| 847 | .data = &init_net.ipv4.sysctl_ping_group_range, | 712 | .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, |
| 848 | .maxlen = sizeof(gid_t)*2, | 713 | .maxlen = sizeof(int), |
| 849 | .mode = 0644, | 714 | .mode = 0644, |
| 850 | .proc_handler = ipv4_ping_group_range, | 715 | .proc_handler = proc_dointvec |
| 851 | }, | 716 | }, |
| 852 | { | 717 | { |
| 853 | .procname = "tcp_mem", | 718 | .procname = "ping_group_range", |
| 854 | .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), | 719 | .data = &init_net.ipv4.sysctl_ping_group_range, |
| 720 | .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), | ||
| 855 | .mode = 0644, | 721 | .mode = 0644, |
| 856 | .proc_handler = ipv4_tcp_mem, | 722 | .proc_handler = ipv4_ping_group_range, |
| 857 | }, | 723 | }, |
| 858 | { } | 724 | { } |
| 859 | }; | 725 | }; |
| 860 | 726 | ||
| 727 | struct ctl_path net_ipv4_ctl_path[] = { | ||
| 728 | { .procname = "net", }, | ||
| 729 | { .procname = "ipv4", }, | ||
| 730 | { }, | ||
| 731 | }; | ||
| 732 | EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); | ||
| 733 | |||
| 861 | static __net_init int ipv4_sysctl_init_net(struct net *net) | 734 | static __net_init int ipv4_sysctl_init_net(struct net *net) |
| 862 | { | 735 | { |
| 863 | struct ctl_table *table; | 736 | struct ctl_table *table; |
| @@ -881,23 +754,23 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) | |||
| 881 | table[5].data = | 754 | table[5].data = |
| 882 | &net->ipv4.sysctl_icmp_ratemask; | 755 | &net->ipv4.sysctl_icmp_ratemask; |
| 883 | table[6].data = | 756 | table[6].data = |
| 757 | &net->ipv4.sysctl_rt_cache_rebuild_count; | ||
| 758 | table[7].data = | ||
| 884 | &net->ipv4.sysctl_ping_group_range; | 759 | &net->ipv4.sysctl_ping_group_range; |
| 885 | 760 | ||
| 886 | /* Don't export sysctls to unprivileged users */ | ||
| 887 | if (net->user_ns != &init_user_ns) | ||
| 888 | table[0].procname = NULL; | ||
| 889 | } | 761 | } |
| 890 | 762 | ||
| 891 | /* | 763 | /* |
| 892 | * Sane defaults - nobody may create ping sockets. | 764 | * Sane defaults - nobody may create ping sockets. |
| 893 | * Boot scripts should set this to distro-specific group. | 765 | * Boot scripts should set this to distro-specific group. |
| 894 | */ | 766 | */ |
| 895 | net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); | 767 | net->ipv4.sysctl_ping_group_range[0] = 1; |
| 896 | net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); | 768 | net->ipv4.sysctl_ping_group_range[1] = 0; |
| 897 | 769 | ||
| 898 | tcp_init_mem(net); | 770 | net->ipv4.sysctl_rt_cache_rebuild_count = 4; |
| 899 | 771 | ||
| 900 | net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); | 772 | net->ipv4.ipv4_hdr = register_net_sysctl_table(net, |
| 773 | net_ipv4_ctl_path, table); | ||
| 901 | if (net->ipv4.ipv4_hdr == NULL) | 774 | if (net->ipv4.ipv4_hdr == NULL) |
| 902 | goto err_reg; | 775 | goto err_reg; |
| 903 | 776 | ||
| @@ -938,12 +811,12 @@ static __init int sysctl_ipv4_init(void) | |||
| 938 | if (!i->procname) | 811 | if (!i->procname) |
| 939 | return -EINVAL; | 812 | return -EINVAL; |
| 940 | 813 | ||
| 941 | hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); | 814 | hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); |
| 942 | if (hdr == NULL) | 815 | if (hdr == NULL) |
| 943 | return -ENOMEM; | 816 | return -ENOMEM; |
| 944 | 817 | ||
| 945 | if (register_pernet_subsys(&ipv4_sysctl_ops)) { | 818 | if (register_pernet_subsys(&ipv4_sysctl_ops)) { |
| 946 | unregister_net_sysctl_table(hdr); | 819 | unregister_sysctl_table(hdr); |
| 947 | return -ENOMEM; | 820 | return -ENOMEM; |
| 948 | } | 821 | } |
| 949 | 822 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2aa69c8ae60..09ced58e6a5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
| @@ -245,8 +245,6 @@ | |||
| 245 | * TCP_CLOSE socket is finished | 245 | * TCP_CLOSE socket is finished |
| 246 | */ | 246 | */ |
| 247 | 247 | ||
| 248 | #define pr_fmt(fmt) "TCP: " fmt | ||
| 249 | |||
| 250 | #include <linux/kernel.h> | 248 | #include <linux/kernel.h> |
| 251 | #include <linux/module.h> | 249 | #include <linux/module.h> |
| 252 | #include <linux/types.h> | 250 | #include <linux/types.h> |
| @@ -268,12 +266,15 @@ | |||
| 268 | #include <linux/crypto.h> | 266 | #include <linux/crypto.h> |
| 269 | #include <linux/time.h> | 267 | #include <linux/time.h> |
| 270 | #include <linux/slab.h> | 268 | #include <linux/slab.h> |
| 269 | #include <linux/uid_stat.h> | ||
| 271 | 270 | ||
| 272 | #include <net/icmp.h> | 271 | #include <net/icmp.h> |
| 273 | #include <net/inet_common.h> | ||
| 274 | #include <net/tcp.h> | 272 | #include <net/tcp.h> |
| 275 | #include <net/xfrm.h> | 273 | #include <net/xfrm.h> |
| 276 | #include <net/ip.h> | 274 | #include <net/ip.h> |
| 275 | #include <net/ip6_route.h> | ||
| 276 | #include <net/ipv6.h> | ||
| 277 | #include <net/transp_v6.h> | ||
| 277 | #include <net/netdma.h> | 278 | #include <net/netdma.h> |
| 278 | #include <net/sock.h> | 279 | #include <net/sock.h> |
| 279 | 280 | ||
| @@ -285,9 +286,11 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | |||
| 285 | struct percpu_counter tcp_orphan_count; | 286 | struct percpu_counter tcp_orphan_count; |
| 286 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 287 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
| 287 | 288 | ||
| 289 | long sysctl_tcp_mem[3] __read_mostly; | ||
| 288 | int sysctl_tcp_wmem[3] __read_mostly; | 290 | int sysctl_tcp_wmem[3] __read_mostly; |
| 289 | int sysctl_tcp_rmem[3] __read_mostly; | 291 | int sysctl_tcp_rmem[3] __read_mostly; |
| 290 | 292 | ||
| 293 | EXPORT_SYMBOL(sysctl_tcp_mem); | ||
| 291 | EXPORT_SYMBOL(sysctl_tcp_rmem); | 294 | EXPORT_SYMBOL(sysctl_tcp_rmem); |
| 292 | EXPORT_SYMBOL(sysctl_tcp_wmem); | 295 | EXPORT_SYMBOL(sysctl_tcp_wmem); |
| 293 | 296 | ||
| @@ -364,72 +367,6 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) | |||
| 364 | return period; | 367 | return period; |
| 365 | } | 368 | } |
| 366 | 369 | ||
| 367 | /* Address-family independent initialization for a tcp_sock. | ||
| 368 | * | ||
| 369 | * NOTE: A lot of things set to zero explicitly by call to | ||
| 370 | * sk_alloc() so need not be done here. | ||
| 371 | */ | ||
| 372 | void tcp_init_sock(struct sock *sk) | ||
| 373 | { | ||
| 374 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 375 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 376 | |||
| 377 | skb_queue_head_init(&tp->out_of_order_queue); | ||
| 378 | tcp_init_xmit_timers(sk); | ||
| 379 | tcp_prequeue_init(tp); | ||
| 380 | INIT_LIST_HEAD(&tp->tsq_node); | ||
| 381 | |||
| 382 | icsk->icsk_rto = TCP_TIMEOUT_INIT; | ||
| 383 | tp->mdev = TCP_TIMEOUT_INIT; | ||
| 384 | |||
| 385 | /* So many TCP implementations out there (incorrectly) count the | ||
| 386 | * initial SYN frame in their delayed-ACK and congestion control | ||
| 387 | * algorithms that we must have the following bandaid to talk | ||
| 388 | * efficiently to them. -DaveM | ||
| 389 | */ | ||
| 390 | tp->snd_cwnd = TCP_INIT_CWND; | ||
| 391 | |||
| 392 | /* See draft-stevens-tcpca-spec-01 for discussion of the | ||
| 393 | * initialization of these values. | ||
| 394 | */ | ||
| 395 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
| 396 | tp->snd_cwnd_clamp = ~0; | ||
| 397 | tp->mss_cache = TCP_MSS_DEFAULT; | ||
| 398 | |||
| 399 | tp->reordering = sysctl_tcp_reordering; | ||
| 400 | tcp_enable_early_retrans(tp); | ||
| 401 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; | ||
| 402 | |||
| 403 | sk->sk_state = TCP_CLOSE; | ||
| 404 | |||
| 405 | sk->sk_write_space = sk_stream_write_space; | ||
| 406 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); | ||
| 407 | |||
| 408 | icsk->icsk_sync_mss = tcp_sync_mss; | ||
| 409 | |||
| 410 | /* TCP Cookie Transactions */ | ||
| 411 | if (sysctl_tcp_cookie_size > 0) { | ||
| 412 | /* Default, cookies without s_data_payload. */ | ||
| 413 | tp->cookie_values = | ||
| 414 | kzalloc(sizeof(*tp->cookie_values), | ||
| 415 | sk->sk_allocation); | ||
| 416 | if (tp->cookie_values != NULL) | ||
| 417 | kref_init(&tp->cookie_values->kref); | ||
| 418 | } | ||
| 419 | /* Presumed zeroed, in order of appearance: | ||
| 420 | * cookie_in_always, cookie_out_never, | ||
| 421 | * s_data_constant, s_data_in, s_data_out | ||
| 422 | */ | ||
| 423 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; | ||
| 424 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; | ||
| 425 | |||
| 426 | local_bh_disable(); | ||
| 427 | sock_update_memcg(sk); | ||
| 428 | sk_sockets_allocated_inc(sk); | ||
| 429 | local_bh_enable(); | ||
| 430 | } | ||
| 431 | EXPORT_SYMBOL(tcp_init_sock); | ||
| 432 | |||
| 433 | /* | 370 | /* |
| 434 | * Wait for a TCP event. | 371 | * Wait for a TCP event. |
| 435 | * | 372 | * |
| @@ -441,7 +378,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
| 441 | { | 378 | { |
| 442 | unsigned int mask; | 379 | unsigned int mask; |
| 443 | struct sock *sk = sock->sk; | 380 | struct sock *sk = sock->sk; |
| 444 | const struct tcp_sock *tp = tcp_sk(sk); | 381 | struct tcp_sock *tp = tcp_sk(sk); |
| 445 | 382 | ||
| 446 | sock_poll_wait(file, sk_sleep(sk), wait); | 383 | sock_poll_wait(file, sk_sleep(sk), wait); |
| 447 | if (sk->sk_state == TCP_LISTEN) | 384 | if (sk->sk_state == TCP_LISTEN) |
| @@ -486,9 +423,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
| 486 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 423 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
| 487 | mask |= POLLIN | POLLRDNORM | POLLRDHUP; | 424 | mask |= POLLIN | POLLRDNORM | POLLRDHUP; |
| 488 | 425 | ||
| 489 | /* Connected or passive Fast Open socket? */ | 426 | /* Connected? */ |
| 490 | if (sk->sk_state != TCP_SYN_SENT && | 427 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
| 491 | (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) { | ||
| 492 | int target = sock_rcvlowat(sk, 0, INT_MAX); | 428 | int target = sock_rcvlowat(sk, 0, INT_MAX); |
| 493 | 429 | ||
| 494 | if (tp->urg_seq == tp->copied_seq && | 430 | if (tp->urg_seq == tp->copied_seq && |
| @@ -536,29 +472,30 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) | |||
| 536 | { | 472 | { |
| 537 | struct tcp_sock *tp = tcp_sk(sk); | 473 | struct tcp_sock *tp = tcp_sk(sk); |
| 538 | int answ; | 474 | int answ; |
| 539 | bool slow; | ||
| 540 | 475 | ||
| 541 | switch (cmd) { | 476 | switch (cmd) { |
| 542 | case SIOCINQ: | 477 | case SIOCINQ: |
| 543 | if (sk->sk_state == TCP_LISTEN) | 478 | if (sk->sk_state == TCP_LISTEN) |
| 544 | return -EINVAL; | 479 | return -EINVAL; |
| 545 | 480 | ||
| 546 | slow = lock_sock_fast(sk); | 481 | lock_sock(sk); |
| 547 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) | 482 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) |
| 548 | answ = 0; | 483 | answ = 0; |
| 549 | else if (sock_flag(sk, SOCK_URGINLINE) || | 484 | else if (sock_flag(sk, SOCK_URGINLINE) || |
| 550 | !tp->urg_data || | 485 | !tp->urg_data || |
| 551 | before(tp->urg_seq, tp->copied_seq) || | 486 | before(tp->urg_seq, tp->copied_seq) || |
| 552 | !before(tp->urg_seq, tp->rcv_nxt)) { | 487 | !before(tp->urg_seq, tp->rcv_nxt)) { |
| 488 | struct sk_buff *skb; | ||
| 553 | 489 | ||
| 554 | answ = tp->rcv_nxt - tp->copied_seq; | 490 | answ = tp->rcv_nxt - tp->copied_seq; |
| 555 | 491 | ||
| 556 | /* Subtract 1, if FIN was received */ | 492 | /* Subtract 1, if FIN is in queue. */ |
| 557 | if (answ && sock_flag(sk, SOCK_DONE)) | 493 | skb = skb_peek_tail(&sk->sk_receive_queue); |
| 558 | answ--; | 494 | if (answ && skb) |
| 495 | answ -= tcp_hdr(skb)->fin; | ||
| 559 | } else | 496 | } else |
| 560 | answ = tp->urg_seq - tp->copied_seq; | 497 | answ = tp->urg_seq - tp->copied_seq; |
| 561 | unlock_sock_fast(sk, slow); | 498 | release_sock(sk); |
| 562 | break; | 499 | break; |
| 563 | case SIOCATMARK: | 500 | case SIOCATMARK: |
| 564 | answ = tp->urg_data && tp->urg_seq == tp->copied_seq; | 501 | answ = tp->urg_data && tp->urg_seq == tp->copied_seq; |
| @@ -591,11 +528,11 @@ EXPORT_SYMBOL(tcp_ioctl); | |||
| 591 | 528 | ||
| 592 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) | 529 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) |
| 593 | { | 530 | { |
| 594 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; | 531 | TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; |
| 595 | tp->pushed_seq = tp->write_seq; | 532 | tp->pushed_seq = tp->write_seq; |
| 596 | } | 533 | } |
| 597 | 534 | ||
| 598 | static inline bool forced_push(const struct tcp_sock *tp) | 535 | static inline int forced_push(struct tcp_sock *tp) |
| 599 | { | 536 | { |
| 600 | return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); | 537 | return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); |
| 601 | } | 538 | } |
| @@ -607,7 +544,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) | |||
| 607 | 544 | ||
| 608 | skb->csum = 0; | 545 | skb->csum = 0; |
| 609 | tcb->seq = tcb->end_seq = tp->write_seq; | 546 | tcb->seq = tcb->end_seq = tp->write_seq; |
| 610 | tcb->tcp_flags = TCPHDR_ACK; | 547 | tcb->flags = TCPHDR_ACK; |
| 611 | tcb->sacked = 0; | 548 | tcb->sacked = 0; |
| 612 | skb_header_release(skb); | 549 | skb_header_release(skb); |
| 613 | tcp_add_write_queue_tail(sk, skb); | 550 | tcp_add_write_queue_tail(sk, skb); |
| @@ -768,12 +705,11 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) | |||
| 768 | skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); | 705 | skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); |
| 769 | if (skb) { | 706 | if (skb) { |
| 770 | if (sk_wmem_schedule(sk, skb->truesize)) { | 707 | if (sk_wmem_schedule(sk, skb->truesize)) { |
| 771 | skb_reserve(skb, sk->sk_prot->max_header); | ||
| 772 | /* | 708 | /* |
| 773 | * Make sure that we have exactly size bytes | 709 | * Make sure that we have exactly size bytes |
| 774 | * available to the caller, no more, no less. | 710 | * available to the caller, no more, no less. |
| 775 | */ | 711 | */ |
| 776 | skb->avail_size = size; | 712 | skb_reserve(skb, skb_tailroom(skb) - size); |
| 777 | return skb; | 713 | return skb; |
| 778 | } | 714 | } |
| 779 | __kfree_skb(skb); | 715 | __kfree_skb(skb); |
| @@ -798,10 +734,6 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, | |||
| 798 | inet_csk(sk)->icsk_ext_hdr_len - | 734 | inet_csk(sk)->icsk_ext_hdr_len - |
| 799 | tp->tcp_header_len); | 735 | tp->tcp_header_len); |
| 800 | 736 | ||
| 801 | /* TSQ : try to have two TSO segments in flight */ | ||
| 802 | xmit_size_goal = min_t(u32, xmit_size_goal, | ||
| 803 | sysctl_tcp_limit_output_bytes >> 1); | ||
| 804 | |||
| 805 | xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); | 737 | xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); |
| 806 | 738 | ||
| 807 | /* We try hard to avoid divides here */ | 739 | /* We try hard to avoid divides here */ |
| @@ -811,9 +743,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, | |||
| 811 | old_size_goal + mss_now > xmit_size_goal)) { | 743 | old_size_goal + mss_now > xmit_size_goal)) { |
| 812 | xmit_size_goal = old_size_goal; | 744 | xmit_size_goal = old_size_goal; |
| 813 | } else { | 745 | } else { |
| 814 | tp->xmit_size_goal_segs = | 746 | tp->xmit_size_goal_segs = xmit_size_goal / mss_now; |
| 815 | min_t(u16, xmit_size_goal / mss_now, | ||
| 816 | sk->sk_gso_max_segs); | ||
| 817 | xmit_size_goal = tp->xmit_size_goal_segs * mss_now; | 747 | xmit_size_goal = tp->xmit_size_goal_segs * mss_now; |
| 818 | } | 748 | } |
| 819 | } | 749 | } |
| @@ -831,8 +761,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) | |||
| 831 | return mss_now; | 761 | return mss_now; |
| 832 | } | 762 | } |
| 833 | 763 | ||
| 834 | static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, | 764 | static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, |
| 835 | size_t size, int flags) | 765 | size_t psize, int flags) |
| 836 | { | 766 | { |
| 837 | struct tcp_sock *tp = tcp_sk(sk); | 767 | struct tcp_sock *tp = tcp_sk(sk); |
| 838 | int mss_now, size_goal; | 768 | int mss_now, size_goal; |
| @@ -840,15 +770,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, | |||
| 840 | ssize_t copied; | 770 | ssize_t copied; |
| 841 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 771 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
| 842 | 772 | ||
| 843 | /* Wait for a connection to finish. One exception is TCP Fast Open | 773 | /* Wait for a connection to finish. */ |
| 844 | * (passive side) where data is allowed to be sent before a connection | 774 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) |
| 845 | * is fully established. | ||
| 846 | */ | ||
| 847 | if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && | ||
| 848 | !tcp_passive_fastopen(sk)) { | ||
| 849 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 775 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
| 850 | goto out_err; | 776 | goto out_err; |
| 851 | } | ||
| 852 | 777 | ||
| 853 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); | 778 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); |
| 854 | 779 | ||
| @@ -859,10 +784,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, | |||
| 859 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) | 784 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) |
| 860 | goto out_err; | 785 | goto out_err; |
| 861 | 786 | ||
| 862 | while (size > 0) { | 787 | while (psize > 0) { |
| 863 | struct sk_buff *skb = tcp_write_queue_tail(sk); | 788 | struct sk_buff *skb = tcp_write_queue_tail(sk); |
| 864 | int copy, i; | 789 | struct page *page = pages[poffset / PAGE_SIZE]; |
| 865 | bool can_coalesce; | 790 | int copy, i, can_coalesce; |
| 791 | int offset = poffset % PAGE_SIZE; | ||
| 792 | int size = min_t(size_t, psize, PAGE_SIZE - offset); | ||
| 866 | 793 | ||
| 867 | if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { | 794 | if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { |
| 868 | new_segment: | 795 | new_segment: |
| @@ -890,7 +817,7 @@ new_segment: | |||
| 890 | goto wait_for_memory; | 817 | goto wait_for_memory; |
| 891 | 818 | ||
| 892 | if (can_coalesce) { | 819 | if (can_coalesce) { |
| 893 | skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); | 820 | skb_shinfo(skb)->frags[i - 1].size += copy; |
| 894 | } else { | 821 | } else { |
| 895 | get_page(page); | 822 | get_page(page); |
| 896 | skb_fill_page_desc(skb, i, page, offset, copy); | 823 | skb_fill_page_desc(skb, i, page, offset, copy); |
| @@ -907,11 +834,11 @@ new_segment: | |||
| 907 | skb_shinfo(skb)->gso_segs = 0; | 834 | skb_shinfo(skb)->gso_segs = 0; |
| 908 | 835 | ||
| 909 | if (!copied) | 836 | if (!copied) |
| 910 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; | 837 | TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; |
| 911 | 838 | ||
| 912 | copied += copy; | 839 | copied += copy; |
| 913 | offset += copy; | 840 | poffset += copy; |
| 914 | if (!(size -= copy)) | 841 | if (!(psize -= copy)) |
| 915 | goto out; | 842 | goto out; |
| 916 | 843 | ||
| 917 | if (skb->len < size_goal || (flags & MSG_OOB)) | 844 | if (skb->len < size_goal || (flags & MSG_OOB)) |
| @@ -927,7 +854,8 @@ new_segment: | |||
| 927 | wait_for_sndbuf: | 854 | wait_for_sndbuf: |
| 928 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 855 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
| 929 | wait_for_memory: | 856 | wait_for_memory: |
| 930 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); | 857 | if (copied) |
| 858 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); | ||
| 931 | 859 | ||
| 932 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) | 860 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) |
| 933 | goto do_error; | 861 | goto do_error; |
| @@ -936,7 +864,7 @@ wait_for_memory: | |||
| 936 | } | 864 | } |
| 937 | 865 | ||
| 938 | out: | 866 | out: |
| 939 | if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) | 867 | if (copied) |
| 940 | tcp_push(sk, flags, mss_now, tp->nonagle); | 868 | tcp_push(sk, flags, mss_now, tp->nonagle); |
| 941 | return copied; | 869 | return copied; |
| 942 | 870 | ||
| @@ -958,24 +886,24 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, | |||
| 958 | flags); | 886 | flags); |
| 959 | 887 | ||
| 960 | lock_sock(sk); | 888 | lock_sock(sk); |
| 961 | res = do_tcp_sendpages(sk, page, offset, size, flags); | 889 | res = do_tcp_sendpages(sk, &page, offset, size, flags); |
| 962 | release_sock(sk); | 890 | release_sock(sk); |
| 963 | return res; | 891 | return res; |
| 964 | } | 892 | } |
| 965 | EXPORT_SYMBOL(tcp_sendpage); | 893 | EXPORT_SYMBOL(tcp_sendpage); |
| 966 | 894 | ||
| 967 | static inline int select_size(const struct sock *sk, bool sg) | 895 | #define TCP_PAGE(sk) (sk->sk_sndmsg_page) |
| 896 | #define TCP_OFF(sk) (sk->sk_sndmsg_off) | ||
| 897 | |||
| 898 | static inline int select_size(struct sock *sk, int sg) | ||
| 968 | { | 899 | { |
| 969 | const struct tcp_sock *tp = tcp_sk(sk); | 900 | struct tcp_sock *tp = tcp_sk(sk); |
| 970 | int tmp = tp->mss_cache; | 901 | int tmp = tp->mss_cache; |
| 971 | 902 | ||
| 972 | if (sg) { | 903 | if (sg) { |
| 973 | if (sk_can_gso(sk)) { | 904 | if (sk_can_gso(sk)) |
| 974 | /* Small frames wont use a full page: | 905 | tmp = 0; |
| 975 | * Payload will immediately follow tcp header. | 906 | else { |
| 976 | */ | ||
| 977 | tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); | ||
| 978 | } else { | ||
| 979 | int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); | 907 | int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); |
| 980 | 908 | ||
| 981 | if (tmp >= pgbreak && | 909 | if (tmp >= pgbreak && |
| @@ -987,86 +915,27 @@ static inline int select_size(const struct sock *sk, bool sg) | |||
| 987 | return tmp; | 915 | return tmp; |
| 988 | } | 916 | } |
| 989 | 917 | ||
| 990 | void tcp_free_fastopen_req(struct tcp_sock *tp) | ||
| 991 | { | ||
| 992 | if (tp->fastopen_req != NULL) { | ||
| 993 | kfree(tp->fastopen_req); | ||
| 994 | tp->fastopen_req = NULL; | ||
| 995 | } | ||
| 996 | } | ||
| 997 | |||
| 998 | static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) | ||
| 999 | { | ||
| 1000 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1001 | int err, flags; | ||
| 1002 | |||
| 1003 | if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) | ||
| 1004 | return -EOPNOTSUPP; | ||
| 1005 | if (tp->fastopen_req != NULL) | ||
| 1006 | return -EALREADY; /* Another Fast Open is in progress */ | ||
| 1007 | |||
| 1008 | tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), | ||
| 1009 | sk->sk_allocation); | ||
| 1010 | if (unlikely(tp->fastopen_req == NULL)) | ||
| 1011 | return -ENOBUFS; | ||
| 1012 | tp->fastopen_req->data = msg; | ||
| 1013 | |||
| 1014 | flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; | ||
| 1015 | err = __inet_stream_connect(sk->sk_socket, msg->msg_name, | ||
| 1016 | msg->msg_namelen, flags); | ||
| 1017 | *size = tp->fastopen_req->copied; | ||
| 1018 | tcp_free_fastopen_req(tp); | ||
| 1019 | return err; | ||
| 1020 | } | ||
| 1021 | |||
| 1022 | int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 918 | int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
| 1023 | size_t size) | 919 | size_t size) |
| 1024 | { | 920 | { |
| 1025 | struct iovec *iov; | 921 | struct iovec *iov; |
| 1026 | struct tcp_sock *tp = tcp_sk(sk); | 922 | struct tcp_sock *tp = tcp_sk(sk); |
| 1027 | struct sk_buff *skb; | 923 | struct sk_buff *skb; |
| 1028 | int iovlen, flags, err, copied = 0; | 924 | int iovlen, flags; |
| 1029 | int mss_now = 0, size_goal, copied_syn = 0, offset = 0; | 925 | int mss_now, size_goal; |
| 1030 | bool sg; | 926 | int sg, err, copied; |
| 1031 | long timeo; | 927 | long timeo; |
| 1032 | 928 | ||
| 1033 | lock_sock(sk); | 929 | lock_sock(sk); |
| 1034 | 930 | ||
| 1035 | flags = msg->msg_flags; | 931 | flags = msg->msg_flags; |
| 1036 | if (flags & MSG_FASTOPEN) { | ||
| 1037 | err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); | ||
| 1038 | if (err == -EINPROGRESS && copied_syn > 0) | ||
| 1039 | goto out; | ||
| 1040 | else if (err) | ||
| 1041 | goto out_err; | ||
| 1042 | offset = copied_syn; | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 932 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
| 1046 | 933 | ||
| 1047 | /* Wait for a connection to finish. One exception is TCP Fast Open | 934 | /* Wait for a connection to finish. */ |
| 1048 | * (passive side) where data is allowed to be sent before a connection | 935 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) |
| 1049 | * is fully established. | ||
| 1050 | */ | ||
| 1051 | if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && | ||
| 1052 | !tcp_passive_fastopen(sk)) { | ||
| 1053 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 936 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
| 1054 | goto do_error; | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | if (unlikely(tp->repair)) { | ||
| 1058 | if (tp->repair_queue == TCP_RECV_QUEUE) { | ||
| 1059 | copied = tcp_send_rcvq(sk, msg, size); | ||
| 1060 | goto out; | ||
| 1061 | } | ||
| 1062 | |||
| 1063 | err = -EINVAL; | ||
| 1064 | if (tp->repair_queue == TCP_NO_QUEUE) | ||
| 1065 | goto out_err; | 937 | goto out_err; |
| 1066 | 938 | ||
| 1067 | /* 'common' sending to sendq */ | ||
| 1068 | } | ||
| 1069 | |||
| 1070 | /* This should be in poll */ | 939 | /* This should be in poll */ |
| 1071 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); | 940 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); |
| 1072 | 941 | ||
| @@ -1081,22 +950,13 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 1081 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) | 950 | if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) |
| 1082 | goto out_err; | 951 | goto out_err; |
| 1083 | 952 | ||
| 1084 | sg = !!(sk->sk_route_caps & NETIF_F_SG); | 953 | sg = sk->sk_route_caps & NETIF_F_SG; |
| 1085 | 954 | ||
| 1086 | while (--iovlen >= 0) { | 955 | while (--iovlen >= 0) { |
| 1087 | size_t seglen = iov->iov_len; | 956 | size_t seglen = iov->iov_len; |
| 1088 | unsigned char __user *from = iov->iov_base; | 957 | unsigned char __user *from = iov->iov_base; |
| 1089 | 958 | ||
| 1090 | iov++; | 959 | iov++; |
| 1091 | if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ | ||
| 1092 | if (offset >= seglen) { | ||
| 1093 | offset -= seglen; | ||
| 1094 | continue; | ||
| 1095 | } | ||
| 1096 | seglen -= offset; | ||
| 1097 | from += offset; | ||
| 1098 | offset = 0; | ||
| 1099 | } | ||
| 1100 | 960 | ||
| 1101 | while (seglen > 0) { | 961 | while (seglen > 0) { |
| 1102 | int copy = 0; | 962 | int copy = 0; |
| @@ -1139,54 +999,86 @@ new_segment: | |||
| 1139 | copy = seglen; | 999 | copy = seglen; |
| 1140 | 1000 | ||
| 1141 | /* Where to copy to? */ | 1001 | /* Where to copy to? */ |
| 1142 | if (skb_availroom(skb) > 0) { | 1002 | if (skb_tailroom(skb) > 0) { |
| 1143 | /* We have some space in skb head. Superb! */ | 1003 | /* We have some space in skb head. Superb! */ |
| 1144 | copy = min_t(int, copy, skb_availroom(skb)); | 1004 | if (copy > skb_tailroom(skb)) |
| 1005 | copy = skb_tailroom(skb); | ||
| 1145 | err = skb_add_data_nocache(sk, skb, from, copy); | 1006 | err = skb_add_data_nocache(sk, skb, from, copy); |
| 1146 | if (err) | 1007 | if (err) |
| 1147 | goto do_fault; | 1008 | goto do_fault; |
| 1148 | } else { | 1009 | } else { |
| 1149 | bool merge = true; | 1010 | int merge = 0; |
| 1150 | int i = skb_shinfo(skb)->nr_frags; | 1011 | int i = skb_shinfo(skb)->nr_frags; |
| 1151 | struct page_frag *pfrag = sk_page_frag(sk); | 1012 | struct page *page = TCP_PAGE(sk); |
| 1152 | 1013 | int off = TCP_OFF(sk); | |
| 1153 | if (!sk_page_frag_refill(sk, pfrag)) | 1014 | |
| 1154 | goto wait_for_memory; | 1015 | if (skb_can_coalesce(skb, i, page, off) && |
| 1155 | 1016 | off != PAGE_SIZE) { | |
| 1156 | if (!skb_can_coalesce(skb, i, pfrag->page, | 1017 | /* We can extend the last page |
| 1157 | pfrag->offset)) { | 1018 | * fragment. */ |
| 1158 | if (i == MAX_SKB_FRAGS || !sg) { | 1019 | merge = 1; |
| 1159 | tcp_mark_push(tp, skb); | 1020 | } else if (i == MAX_SKB_FRAGS || !sg) { |
| 1160 | goto new_segment; | 1021 | /* Need to add new fragment and cannot |
| 1022 | * do this because interface is non-SG, | ||
| 1023 | * or because all the page slots are | ||
| 1024 | * busy. */ | ||
| 1025 | tcp_mark_push(tp, skb); | ||
| 1026 | goto new_segment; | ||
| 1027 | } else if (page) { | ||
| 1028 | if (off == PAGE_SIZE) { | ||
| 1029 | put_page(page); | ||
| 1030 | TCP_PAGE(sk) = page = NULL; | ||
| 1031 | off = 0; | ||
| 1161 | } | 1032 | } |
| 1162 | merge = false; | 1033 | } else |
| 1163 | } | 1034 | off = 0; |
| 1164 | 1035 | ||
| 1165 | copy = min_t(int, copy, pfrag->size - pfrag->offset); | 1036 | if (copy > PAGE_SIZE - off) |
| 1037 | copy = PAGE_SIZE - off; | ||
| 1166 | 1038 | ||
| 1167 | if (!sk_wmem_schedule(sk, copy)) | 1039 | if (!sk_wmem_schedule(sk, copy)) |
| 1168 | goto wait_for_memory; | 1040 | goto wait_for_memory; |
| 1169 | 1041 | ||
| 1042 | if (!page) { | ||
| 1043 | /* Allocate new cache page. */ | ||
| 1044 | if (!(page = sk_stream_alloc_page(sk))) | ||
| 1045 | goto wait_for_memory; | ||
| 1046 | } | ||
| 1047 | |||
| 1048 | /* Time to copy data. We are close to | ||
| 1049 | * the end! */ | ||
| 1170 | err = skb_copy_to_page_nocache(sk, from, skb, | 1050 | err = skb_copy_to_page_nocache(sk, from, skb, |
| 1171 | pfrag->page, | 1051 | page, off, copy); |
| 1172 | pfrag->offset, | 1052 | if (err) { |
| 1173 | copy); | 1053 | /* If this page was new, give it to the |
| 1174 | if (err) | 1054 | * socket so it does not get leaked. |
| 1055 | */ | ||
| 1056 | if (!TCP_PAGE(sk)) { | ||
| 1057 | TCP_PAGE(sk) = page; | ||
| 1058 | TCP_OFF(sk) = 0; | ||
| 1059 | } | ||
| 1175 | goto do_error; | 1060 | goto do_error; |
| 1061 | } | ||
| 1176 | 1062 | ||
| 1177 | /* Update the skb. */ | 1063 | /* Update the skb. */ |
| 1178 | if (merge) { | 1064 | if (merge) { |
| 1179 | skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); | 1065 | skb_shinfo(skb)->frags[i - 1].size += |
| 1066 | copy; | ||
| 1180 | } else { | 1067 | } else { |
| 1181 | skb_fill_page_desc(skb, i, pfrag->page, | 1068 | skb_fill_page_desc(skb, i, page, off, copy); |
| 1182 | pfrag->offset, copy); | 1069 | if (TCP_PAGE(sk)) { |
| 1183 | get_page(pfrag->page); | 1070 | get_page(page); |
| 1071 | } else if (off + copy < PAGE_SIZE) { | ||
| 1072 | get_page(page); | ||
| 1073 | TCP_PAGE(sk) = page; | ||
| 1074 | } | ||
| 1184 | } | 1075 | } |
| 1185 | pfrag->offset += copy; | 1076 | |
| 1077 | TCP_OFF(sk) = off + copy; | ||
| 1186 | } | 1078 | } |
| 1187 | 1079 | ||
| 1188 | if (!copied) | 1080 | if (!copied) |
| 1189 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; | 1081 | TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; |
| 1190 | 1082 | ||
| 1191 | tp->write_seq += copy; | 1083 | tp->write_seq += copy; |
| 1192 | TCP_SKB_CB(skb)->end_seq += copy; | 1084 | TCP_SKB_CB(skb)->end_seq += copy; |
| @@ -1197,7 +1089,7 @@ new_segment: | |||
| 1197 | if ((seglen -= copy) == 0 && iovlen == 0) | 1089 | if ((seglen -= copy) == 0 && iovlen == 0) |
| 1198 | goto out; | 1090 | goto out; |
| 1199 | 1091 | ||
| 1200 | if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) | 1092 | if (skb->len < max || (flags & MSG_OOB)) |
| 1201 | continue; | 1093 | continue; |
| 1202 | 1094 | ||
| 1203 | if (forced_push(tp)) { | 1095 | if (forced_push(tp)) { |
| @@ -1224,7 +1116,10 @@ out: | |||
| 1224 | if (copied) | 1116 | if (copied) |
| 1225 | tcp_push(sk, flags, mss_now, tp->nonagle); | 1117 | tcp_push(sk, flags, mss_now, tp->nonagle); |
| 1226 | release_sock(sk); | 1118 | release_sock(sk); |
| 1227 | return copied + copied_syn; | 1119 | |
| 1120 | if (copied > 0) | ||
| 1121 | uid_stat_tcp_snd(current_uid(), copied); | ||
| 1122 | return copied; | ||
| 1228 | 1123 | ||
| 1229 | do_fault: | 1124 | do_fault: |
| 1230 | if (!skb->len) { | 1125 | if (!skb->len) { |
| @@ -1237,7 +1132,7 @@ do_fault: | |||
| 1237 | } | 1132 | } |
| 1238 | 1133 | ||
| 1239 | do_error: | 1134 | do_error: |
| 1240 | if (copied + copied_syn) | 1135 | if (copied) |
| 1241 | goto out; | 1136 | goto out; |
| 1242 | out_err: | 1137 | out_err: |
| 1243 | err = sk_stream_error(sk, flags, err); | 1138 | err = sk_stream_error(sk, flags, err); |
| @@ -1295,24 +1190,6 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) | |||
| 1295 | return -EAGAIN; | 1190 | return -EAGAIN; |
| 1296 | } | 1191 | } |
| 1297 | 1192 | ||
| 1298 | static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) | ||
| 1299 | { | ||
| 1300 | struct sk_buff *skb; | ||
| 1301 | int copied = 0, err = 0; | ||
| 1302 | |||
| 1303 | /* XXX -- need to support SO_PEEK_OFF */ | ||
| 1304 | |||
| 1305 | skb_queue_walk(&sk->sk_write_queue, skb) { | ||
| 1306 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); | ||
| 1307 | if (err) | ||
| 1308 | break; | ||
| 1309 | |||
| 1310 | copied += skb->len; | ||
| 1311 | } | ||
| 1312 | |||
| 1313 | return err ?: copied; | ||
| 1314 | } | ||
| 1315 | |||
| 1316 | /* Clean up the receive buffer for full frames taken by the user, | 1193 | /* Clean up the receive buffer for full frames taken by the user, |
| 1317 | * then send an ACK if necessary. COPIED is the number of bytes | 1194 | * then send an ACK if necessary. COPIED is the number of bytes |
| 1318 | * tcp_recvmsg has given to the user so far, it speeds up the | 1195 | * tcp_recvmsg has given to the user so far, it speeds up the |
| @@ -1322,13 +1199,15 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) | |||
| 1322 | void tcp_cleanup_rbuf(struct sock *sk, int copied) | 1199 | void tcp_cleanup_rbuf(struct sock *sk, int copied) |
| 1323 | { | 1200 | { |
| 1324 | struct tcp_sock *tp = tcp_sk(sk); | 1201 | struct tcp_sock *tp = tcp_sk(sk); |
| 1325 | bool time_to_ack = false; | 1202 | int time_to_ack = 0; |
| 1326 | 1203 | ||
| 1204 | #if TCP_DEBUG | ||
| 1327 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); | 1205 | struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); |
| 1328 | 1206 | ||
| 1329 | WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), | 1207 | WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), |
| 1330 | "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", | 1208 | "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", |
| 1331 | tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); | 1209 | tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); |
| 1210 | #endif | ||
| 1332 | 1211 | ||
| 1333 | if (inet_csk_ack_scheduled(sk)) { | 1212 | if (inet_csk_ack_scheduled(sk)) { |
| 1334 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1213 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| @@ -1348,7 +1227,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) | |||
| 1348 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && | 1227 | ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && |
| 1349 | !icsk->icsk_ack.pingpong)) && | 1228 | !icsk->icsk_ack.pingpong)) && |
| 1350 | !atomic_read(&sk->sk_rmem_alloc))) | 1229 | !atomic_read(&sk->sk_rmem_alloc))) |
| 1351 | time_to_ack = true; | 1230 | time_to_ack = 1; |
| 1352 | } | 1231 | } |
| 1353 | 1232 | ||
| 1354 | /* We send an ACK if we can now advertise a non-zero window | 1233 | /* We send an ACK if we can now advertise a non-zero window |
| @@ -1370,7 +1249,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) | |||
| 1370 | * "Lots" means "at least twice" here. | 1249 | * "Lots" means "at least twice" here. |
| 1371 | */ | 1250 | */ |
| 1372 | if (new_window && new_window >= 2 * rcv_window_now) | 1251 | if (new_window && new_window >= 2 * rcv_window_now) |
| 1373 | time_to_ack = true; | 1252 | time_to_ack = 1; |
| 1374 | } | 1253 | } |
| 1375 | } | 1254 | } |
| 1376 | if (time_to_ack) | 1255 | if (time_to_ack) |
| @@ -1428,12 +1307,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) | |||
| 1428 | } | 1307 | } |
| 1429 | #endif | 1308 | #endif |
| 1430 | 1309 | ||
| 1431 | static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) | 1310 | static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) |
| 1432 | { | 1311 | { |
| 1433 | struct sk_buff *skb; | 1312 | struct sk_buff *skb; |
| 1434 | u32 offset; | 1313 | u32 offset; |
| 1435 | 1314 | ||
| 1436 | while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { | 1315 | skb_queue_walk(&sk->sk_receive_queue, skb) { |
| 1437 | offset = seq - TCP_SKB_CB(skb)->seq; | 1316 | offset = seq - TCP_SKB_CB(skb)->seq; |
| 1438 | if (tcp_hdr(skb)->syn) | 1317 | if (tcp_hdr(skb)->syn) |
| 1439 | offset--; | 1318 | offset--; |
| @@ -1441,11 +1320,6 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) | |||
| 1441 | *off = offset; | 1320 | *off = offset; |
| 1442 | return skb; | 1321 | return skb; |
| 1443 | } | 1322 | } |
| 1444 | /* This looks weird, but this can happen if TCP collapsing | ||
| 1445 | * splitted a fat GRO packet, while we released socket lock | ||
| 1446 | * in skb_splice_bits() | ||
| 1447 | */ | ||
| 1448 | sk_eat_skb(sk, skb, false); | ||
| 1449 | } | 1323 | } |
| 1450 | return NULL; | 1324 | return NULL; |
| 1451 | } | 1325 | } |
| @@ -1487,7 +1361,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
| 1487 | break; | 1361 | break; |
| 1488 | } | 1362 | } |
| 1489 | used = recv_actor(desc, skb, offset, len); | 1363 | used = recv_actor(desc, skb, offset, len); |
| 1490 | if (used <= 0) { | 1364 | if (used < 0) { |
| 1491 | if (!copied) | 1365 | if (!copied) |
| 1492 | copied = used; | 1366 | copied = used; |
| 1493 | break; | 1367 | break; |
| @@ -1496,26 +1370,22 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
| 1496 | copied += used; | 1370 | copied += used; |
| 1497 | offset += used; | 1371 | offset += used; |
| 1498 | } | 1372 | } |
| 1499 | /* If recv_actor drops the lock (e.g. TCP splice | 1373 | /* |
| 1374 | * If recv_actor drops the lock (e.g. TCP splice | ||
| 1500 | * receive) the skb pointer might be invalid when | 1375 | * receive) the skb pointer might be invalid when |
| 1501 | * getting here: tcp_collapse might have deleted it | 1376 | * getting here: tcp_collapse might have deleted it |
| 1502 | * while aggregating skbs from the socket queue. | 1377 | * while aggregating skbs from the socket queue. |
| 1503 | */ | 1378 | */ |
| 1504 | skb = tcp_recv_skb(sk, seq - 1, &offset); | 1379 | skb = tcp_recv_skb(sk, seq-1, &offset); |
| 1505 | if (!skb) | 1380 | if (!skb || (offset+1 != skb->len)) |
| 1506 | break; | 1381 | break; |
| 1507 | /* TCP coalescing might have appended data to the skb. | ||
| 1508 | * Try to splice more frags | ||
| 1509 | */ | ||
| 1510 | if (offset + 1 != skb->len) | ||
| 1511 | continue; | ||
| 1512 | } | 1382 | } |
| 1513 | if (tcp_hdr(skb)->fin) { | 1383 | if (tcp_hdr(skb)->fin) { |
| 1514 | sk_eat_skb(sk, skb, false); | 1384 | sk_eat_skb(sk, skb, 0); |
| 1515 | ++seq; | 1385 | ++seq; |
| 1516 | break; | 1386 | break; |
| 1517 | } | 1387 | } |
| 1518 | sk_eat_skb(sk, skb, false); | 1388 | sk_eat_skb(sk, skb, 0); |
| 1519 | if (!desc->count) | 1389 | if (!desc->count) |
| 1520 | break; | 1390 | break; |
| 1521 | tp->copied_seq = seq; | 1391 | tp->copied_seq = seq; |
| @@ -1526,9 +1396,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
| 1526 | 1396 | ||
| 1527 | /* Clean up data we have read: This will do ACK frames. */ | 1397 | /* Clean up data we have read: This will do ACK frames. */ |
| 1528 | if (copied > 0) { | 1398 | if (copied > 0) { |
| 1529 | tcp_recv_skb(sk, seq, &offset); | ||
| 1530 | tcp_cleanup_rbuf(sk, copied); | 1399 | tcp_cleanup_rbuf(sk, copied); |
| 1400 | uid_stat_tcp_rcv(current_uid(), copied); | ||
| 1531 | } | 1401 | } |
| 1402 | |||
| 1532 | return copied; | 1403 | return copied; |
| 1533 | } | 1404 | } |
| 1534 | EXPORT_SYMBOL(tcp_read_sock); | 1405 | EXPORT_SYMBOL(tcp_read_sock); |
| @@ -1553,7 +1424,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 1553 | int target; /* Read at least this many bytes */ | 1424 | int target; /* Read at least this many bytes */ |
| 1554 | long timeo; | 1425 | long timeo; |
| 1555 | struct task_struct *user_recv = NULL; | 1426 | struct task_struct *user_recv = NULL; |
| 1556 | bool copied_early = false; | 1427 | int copied_early = 0; |
| 1557 | struct sk_buff *skb; | 1428 | struct sk_buff *skb; |
| 1558 | u32 urg_hole = 0; | 1429 | u32 urg_hole = 0; |
| 1559 | 1430 | ||
| @@ -1569,21 +1440,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 1569 | if (flags & MSG_OOB) | 1440 | if (flags & MSG_OOB) |
| 1570 | goto recv_urg; | 1441 | goto recv_urg; |
| 1571 | 1442 | ||
| 1572 | if (unlikely(tp->repair)) { | ||
| 1573 | err = -EPERM; | ||
| 1574 | if (!(flags & MSG_PEEK)) | ||
| 1575 | goto out; | ||
| 1576 | |||
| 1577 | if (tp->repair_queue == TCP_SEND_QUEUE) | ||
| 1578 | goto recv_sndq; | ||
| 1579 | |||
| 1580 | err = -EINVAL; | ||
| 1581 | if (tp->repair_queue == TCP_NO_QUEUE) | ||
| 1582 | goto out; | ||
| 1583 | |||
| 1584 | /* 'common' recv queue MSG_PEEK-ing */ | ||
| 1585 | } | ||
| 1586 | |||
| 1587 | seq = &tp->copied_seq; | 1443 | seq = &tp->copied_seq; |
| 1588 | if (flags & MSG_PEEK) { | 1444 | if (flags & MSG_PEEK) { |
| 1589 | peek_seq = tp->copied_seq; | 1445 | peek_seq = tp->copied_seq; |
| @@ -1604,7 +1460,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 1604 | if ((available < target) && | 1460 | if ((available < target) && |
| 1605 | (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && | 1461 | (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && |
| 1606 | !sysctl_tcp_low_latency && | 1462 | !sysctl_tcp_low_latency && |
| 1607 | net_dma_find_channel()) { | 1463 | dma_find_channel(DMA_MEMCPY)) { |
| 1608 | preempt_enable_no_resched(); | 1464 | preempt_enable_no_resched(); |
| 1609 | tp->ucopy.pinned_list = | 1465 | tp->ucopy.pinned_list = |
| 1610 | dma_pin_iovec_pages(msg->msg_iov, len); | 1466 | dma_pin_iovec_pages(msg->msg_iov, len); |
| @@ -1745,14 +1601,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 1745 | } | 1601 | } |
| 1746 | 1602 | ||
| 1747 | #ifdef CONFIG_NET_DMA | 1603 | #ifdef CONFIG_NET_DMA |
| 1748 | if (tp->ucopy.dma_chan) { | 1604 | if (tp->ucopy.dma_chan) |
| 1749 | if (tp->rcv_wnd == 0 && | 1605 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); |
| 1750 | !skb_queue_empty(&sk->sk_async_wait_queue)) { | ||
| 1751 | tcp_service_net_dma(sk, true); | ||
| 1752 | tcp_cleanup_rbuf(sk, copied); | ||
| 1753 | } else | ||
| 1754 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); | ||
| 1755 | } | ||
| 1756 | #endif | 1606 | #endif |
| 1757 | if (copied >= target) { | 1607 | if (copied >= target) { |
| 1758 | /* Do not sleep, just process backlog. */ | 1608 | /* Do not sleep, just process backlog. */ |
| @@ -1791,9 +1641,9 @@ do_prequeue: | |||
| 1791 | } | 1641 | } |
| 1792 | if ((flags & MSG_PEEK) && | 1642 | if ((flags & MSG_PEEK) && |
| 1793 | (peek_seq - copied - urg_hole != tp->copied_seq)) { | 1643 | (peek_seq - copied - urg_hole != tp->copied_seq)) { |
| 1794 | net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", | 1644 | if (net_ratelimit()) |
| 1795 | current->comm, | 1645 | printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", |
| 1796 | task_pid_nr(current)); | 1646 | current->comm, task_pid_nr(current)); |
| 1797 | peek_seq = tp->copied_seq; | 1647 | peek_seq = tp->copied_seq; |
| 1798 | } | 1648 | } |
| 1799 | continue; | 1649 | continue; |
| @@ -1825,7 +1675,7 @@ do_prequeue: | |||
| 1825 | if (!(flags & MSG_TRUNC)) { | 1675 | if (!(flags & MSG_TRUNC)) { |
| 1826 | #ifdef CONFIG_NET_DMA | 1676 | #ifdef CONFIG_NET_DMA |
| 1827 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | 1677 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
| 1828 | tp->ucopy.dma_chan = net_dma_find_channel(); | 1678 | tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); |
| 1829 | 1679 | ||
| 1830 | if (tp->ucopy.dma_chan) { | 1680 | if (tp->ucopy.dma_chan) { |
| 1831 | tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( | 1681 | tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( |
| @@ -1835,8 +1685,7 @@ do_prequeue: | |||
| 1835 | 1685 | ||
| 1836 | if (tp->ucopy.dma_cookie < 0) { | 1686 | if (tp->ucopy.dma_cookie < 0) { |
| 1837 | 1687 | ||
| 1838 | pr_alert("%s: dma_cookie < 0\n", | 1688 | printk(KERN_ALERT "dma_cookie < 0\n"); |
| 1839 | __func__); | ||
| 1840 | 1689 | ||
| 1841 | /* Exception. Bailout! */ | 1690 | /* Exception. Bailout! */ |
| 1842 | if (!copied) | 1691 | if (!copied) |
| @@ -1847,7 +1696,7 @@ do_prequeue: | |||
| 1847 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); | 1696 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); |
| 1848 | 1697 | ||
| 1849 | if ((offset + used) == skb->len) | 1698 | if ((offset + used) == skb->len) |
| 1850 | copied_early = true; | 1699 | copied_early = 1; |
| 1851 | 1700 | ||
| 1852 | } else | 1701 | } else |
| 1853 | #endif | 1702 | #endif |
| @@ -1881,7 +1730,7 @@ skip_copy: | |||
| 1881 | goto found_fin_ok; | 1730 | goto found_fin_ok; |
| 1882 | if (!(flags & MSG_PEEK)) { | 1731 | if (!(flags & MSG_PEEK)) { |
| 1883 | sk_eat_skb(sk, skb, copied_early); | 1732 | sk_eat_skb(sk, skb, copied_early); |
| 1884 | copied_early = false; | 1733 | copied_early = 0; |
| 1885 | } | 1734 | } |
| 1886 | continue; | 1735 | continue; |
| 1887 | 1736 | ||
| @@ -1890,7 +1739,7 @@ skip_copy: | |||
| 1890 | ++*seq; | 1739 | ++*seq; |
| 1891 | if (!(flags & MSG_PEEK)) { | 1740 | if (!(flags & MSG_PEEK)) { |
| 1892 | sk_eat_skb(sk, skb, copied_early); | 1741 | sk_eat_skb(sk, skb, copied_early); |
| 1893 | copied_early = false; | 1742 | copied_early = 0; |
| 1894 | } | 1743 | } |
| 1895 | break; | 1744 | break; |
| 1896 | } while (len > 0); | 1745 | } while (len > 0); |
| @@ -1932,6 +1781,9 @@ skip_copy: | |||
| 1932 | tcp_cleanup_rbuf(sk, copied); | 1781 | tcp_cleanup_rbuf(sk, copied); |
| 1933 | 1782 | ||
| 1934 | release_sock(sk); | 1783 | release_sock(sk); |
| 1784 | |||
| 1785 | if (copied > 0) | ||
| 1786 | uid_stat_tcp_rcv(current_uid(), copied); | ||
| 1935 | return copied; | 1787 | return copied; |
| 1936 | 1788 | ||
| 1937 | out: | 1789 | out: |
| @@ -1940,10 +1792,8 @@ out: | |||
| 1940 | 1792 | ||
| 1941 | recv_urg: | 1793 | recv_urg: |
| 1942 | err = tcp_recv_urg(sk, msg, len, flags); | 1794 | err = tcp_recv_urg(sk, msg, len, flags); |
| 1943 | goto out; | 1795 | if (err > 0) |
| 1944 | 1796 | uid_stat_tcp_rcv(current_uid(), err); | |
| 1945 | recv_sndq: | ||
| 1946 | err = tcp_peek_sndq(sk, msg, len); | ||
| 1947 | goto out; | 1797 | goto out; |
| 1948 | } | 1798 | } |
| 1949 | EXPORT_SYMBOL(tcp_recvmsg); | 1799 | EXPORT_SYMBOL(tcp_recvmsg); |
| @@ -2041,20 +1891,6 @@ void tcp_shutdown(struct sock *sk, int how) | |||
| 2041 | } | 1891 | } |
| 2042 | EXPORT_SYMBOL(tcp_shutdown); | 1892 | EXPORT_SYMBOL(tcp_shutdown); |
| 2043 | 1893 | ||
| 2044 | bool tcp_check_oom(struct sock *sk, int shift) | ||
| 2045 | { | ||
| 2046 | bool too_many_orphans, out_of_socket_memory; | ||
| 2047 | |||
| 2048 | too_many_orphans = tcp_too_many_orphans(sk, shift); | ||
| 2049 | out_of_socket_memory = tcp_out_of_memory(sk); | ||
| 2050 | |||
| 2051 | if (too_many_orphans) | ||
| 2052 | net_info_ratelimited("too many orphaned sockets\n"); | ||
| 2053 | if (out_of_socket_memory) | ||
| 2054 | net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); | ||
| 2055 | return too_many_orphans || out_of_socket_memory; | ||
| 2056 | } | ||
| 2057 | |||
| 2058 | void tcp_close(struct sock *sk, long timeout) | 1894 | void tcp_close(struct sock *sk, long timeout) |
| 2059 | { | 1895 | { |
| 2060 | struct sk_buff *skb; | 1896 | struct sk_buff *skb; |
| @@ -2097,9 +1933,7 @@ void tcp_close(struct sock *sk, long timeout) | |||
| 2097 | * advertise a zero window, then kill -9 the FTP client, wheee... | 1933 | * advertise a zero window, then kill -9 the FTP client, wheee... |
| 2098 | * Note: timeout is always zero in such a case. | 1934 | * Note: timeout is always zero in such a case. |
| 2099 | */ | 1935 | */ |
| 2100 | if (unlikely(tcp_sk(sk)->repair)) { | 1936 | if (data_was_unread) { |
| 2101 | sk->sk_prot->disconnect(sk, 0); | ||
| 2102 | } else if (data_was_unread) { | ||
| 2103 | /* Unread data was tossed, zap the connection. */ | 1937 | /* Unread data was tossed, zap the connection. */ |
| 2104 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); | 1938 | NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); |
| 2105 | tcp_set_state(sk, TCP_CLOSE); | 1939 | tcp_set_state(sk, TCP_CLOSE); |
| @@ -2133,10 +1967,6 @@ void tcp_close(struct sock *sk, long timeout) | |||
| 2133 | * they look as CLOSING or LAST_ACK for Linux) | 1967 | * they look as CLOSING or LAST_ACK for Linux) |
| 2134 | * Probably, I missed some more holelets. | 1968 | * Probably, I missed some more holelets. |
| 2135 | * --ANK | 1969 | * --ANK |
| 2136 | * XXX (TFO) - To start off we don't support SYN+ACK+FIN | ||
| 2137 | * in a single packet! (May consider it later but will | ||
| 2138 | * probably need API support or TCP_CORK SYN-ACK until | ||
| 2139 | * data is written and socket is closed.) | ||
| 2140 | */ | 1970 | */ |
| 2141 | tcp_send_fin(sk); | 1971 | tcp_send_fin(sk); |
| 2142 | } | 1972 | } |
| @@ -2200,7 +2030,10 @@ adjudge_to_death: | |||
| 2200 | } | 2030 | } |
| 2201 | if (sk->sk_state != TCP_CLOSE) { | 2031 | if (sk->sk_state != TCP_CLOSE) { |
| 2202 | sk_mem_reclaim(sk); | 2032 | sk_mem_reclaim(sk); |
| 2203 | if (tcp_check_oom(sk, 0)) { | 2033 | if (tcp_too_many_orphans(sk, 0)) { |
| 2034 | if (net_ratelimit()) | ||
| 2035 | printk(KERN_INFO "TCP: too many of orphaned " | ||
| 2036 | "sockets\n"); | ||
| 2204 | tcp_set_state(sk, TCP_CLOSE); | 2037 | tcp_set_state(sk, TCP_CLOSE); |
| 2205 | tcp_send_active_reset(sk, GFP_ATOMIC); | 2038 | tcp_send_active_reset(sk, GFP_ATOMIC); |
| 2206 | NET_INC_STATS_BH(sock_net(sk), | 2039 | NET_INC_STATS_BH(sock_net(sk), |
| @@ -2208,16 +2041,8 @@ adjudge_to_death: | |||
| 2208 | } | 2041 | } |
| 2209 | } | 2042 | } |
| 2210 | 2043 | ||
| 2211 | if (sk->sk_state == TCP_CLOSE) { | 2044 | if (sk->sk_state == TCP_CLOSE) |
| 2212 | struct request_sock *req = tcp_sk(sk)->fastopen_rsk; | ||
| 2213 | /* We could get here with a non-NULL req if the socket is | ||
| 2214 | * aborted (e.g., closed with unread data) before 3WHS | ||
| 2215 | * finishes. | ||
| 2216 | */ | ||
| 2217 | if (req != NULL) | ||
| 2218 | reqsk_fastopen_remove(sk, req, false); | ||
| 2219 | inet_csk_destroy_sock(sk); | 2045 | inet_csk_destroy_sock(sk); |
| 2220 | } | ||
| 2221 | /* Otherwise, socket is reprieved until protocol close. */ | 2046 | /* Otherwise, socket is reprieved until protocol close. */ |
| 2222 | 2047 | ||
| 2223 | out: | 2048 | out: |
| @@ -2229,7 +2054,7 @@ EXPORT_SYMBOL(tcp_close); | |||
| 2229 | 2054 | ||
| 2230 | /* These states need RST on ABORT according to RFC793 */ | 2055 | /* These states need RST on ABORT according to RFC793 */ |
| 2231 | 2056 | ||
| 2232 | static inline bool tcp_need_reset(int state) | 2057 | static inline int tcp_need_reset(int state) |
| 2233 | { | 2058 | { |
| 2234 | return (1 << state) & | 2059 | return (1 << state) & |
| 2235 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | | 2060 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | |
| @@ -2250,8 +2075,6 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
| 2250 | /* ABORT function of RFC793 */ | 2075 | /* ABORT function of RFC793 */ |
| 2251 | if (old_state == TCP_LISTEN) { | 2076 | if (old_state == TCP_LISTEN) { |
| 2252 | inet_csk_listen_stop(sk); | 2077 | inet_csk_listen_stop(sk); |
| 2253 | } else if (unlikely(tp->repair)) { | ||
| 2254 | sk->sk_err = ECONNABORTED; | ||
| 2255 | } else if (tcp_need_reset(old_state) || | 2078 | } else if (tcp_need_reset(old_state) || |
| 2256 | (tp->snd_nxt != tp->write_seq && | 2079 | (tp->snd_nxt != tp->write_seq && |
| 2257 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { | 2080 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { |
| @@ -2303,68 +2126,6 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
| 2303 | } | 2126 | } |
| 2304 | EXPORT_SYMBOL(tcp_disconnect); | 2127 | EXPORT_SYMBOL(tcp_disconnect); |
| 2305 | 2128 | ||
| 2306 | void tcp_sock_destruct(struct sock *sk) | ||
| 2307 | { | ||
| 2308 | inet_sock_destruct(sk); | ||
| 2309 | |||
| 2310 | kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); | ||
| 2311 | } | ||
| 2312 | |||
| 2313 | static inline bool tcp_can_repair_sock(const struct sock *sk) | ||
| 2314 | { | ||
| 2315 | return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && | ||
| 2316 | ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); | ||
| 2317 | } | ||
| 2318 | |||
| 2319 | static int tcp_repair_options_est(struct tcp_sock *tp, | ||
| 2320 | struct tcp_repair_opt __user *optbuf, unsigned int len) | ||
| 2321 | { | ||
| 2322 | struct tcp_repair_opt opt; | ||
| 2323 | |||
| 2324 | while (len >= sizeof(opt)) { | ||
| 2325 | if (copy_from_user(&opt, optbuf, sizeof(opt))) | ||
| 2326 | return -EFAULT; | ||
| 2327 | |||
| 2328 | optbuf++; | ||
| 2329 | len -= sizeof(opt); | ||
| 2330 | |||
| 2331 | switch (opt.opt_code) { | ||
| 2332 | case TCPOPT_MSS: | ||
| 2333 | tp->rx_opt.mss_clamp = opt.opt_val; | ||
| 2334 | break; | ||
| 2335 | case TCPOPT_WINDOW: | ||
| 2336 | { | ||
| 2337 | u16 snd_wscale = opt.opt_val & 0xFFFF; | ||
| 2338 | u16 rcv_wscale = opt.opt_val >> 16; | ||
| 2339 | |||
| 2340 | if (snd_wscale > 14 || rcv_wscale > 14) | ||
| 2341 | return -EFBIG; | ||
| 2342 | |||
| 2343 | tp->rx_opt.snd_wscale = snd_wscale; | ||
| 2344 | tp->rx_opt.rcv_wscale = rcv_wscale; | ||
| 2345 | tp->rx_opt.wscale_ok = 1; | ||
| 2346 | } | ||
| 2347 | break; | ||
| 2348 | case TCPOPT_SACK_PERM: | ||
| 2349 | if (opt.opt_val != 0) | ||
| 2350 | return -EINVAL; | ||
| 2351 | |||
| 2352 | tp->rx_opt.sack_ok |= TCP_SACK_SEEN; | ||
| 2353 | if (sysctl_tcp_fack) | ||
| 2354 | tcp_enable_fack(tp); | ||
| 2355 | break; | ||
| 2356 | case TCPOPT_TIMESTAMP: | ||
| 2357 | if (opt.opt_val != 0) | ||
| 2358 | return -EINVAL; | ||
| 2359 | |||
| 2360 | tp->rx_opt.tstamp_ok = 1; | ||
| 2361 | break; | ||
| 2362 | } | ||
| 2363 | } | ||
| 2364 | |||
| 2365 | return 0; | ||
| 2366 | } | ||
| 2367 | |||
| 2368 | /* | 2129 | /* |
| 2369 | * Socket option code for TCP. | 2130 | * Socket option code for TCP. |
| 2370 | */ | 2131 | */ |
| @@ -2535,55 +2296,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
| 2535 | err = -EINVAL; | 2296 | err = -EINVAL; |
| 2536 | else | 2297 | else |
| 2537 | tp->thin_dupack = val; | 2298 | tp->thin_dupack = val; |
| 2538 | if (tp->thin_dupack) | ||
| 2539 | tcp_disable_early_retrans(tp); | ||
| 2540 | break; | ||
| 2541 | |||
| 2542 | case TCP_REPAIR: | ||
| 2543 | if (!tcp_can_repair_sock(sk)) | ||
| 2544 | err = -EPERM; | ||
| 2545 | else if (val == 1) { | ||
| 2546 | tp->repair = 1; | ||
| 2547 | sk->sk_reuse = SK_FORCE_REUSE; | ||
| 2548 | tp->repair_queue = TCP_NO_QUEUE; | ||
| 2549 | } else if (val == 0) { | ||
| 2550 | tp->repair = 0; | ||
| 2551 | sk->sk_reuse = SK_NO_REUSE; | ||
| 2552 | tcp_send_window_probe(sk); | ||
| 2553 | } else | ||
| 2554 | err = -EINVAL; | ||
| 2555 | |||
| 2556 | break; | ||
| 2557 | |||
| 2558 | case TCP_REPAIR_QUEUE: | ||
| 2559 | if (!tp->repair) | ||
| 2560 | err = -EPERM; | ||
| 2561 | else if (val < TCP_QUEUES_NR) | ||
| 2562 | tp->repair_queue = val; | ||
| 2563 | else | ||
| 2564 | err = -EINVAL; | ||
| 2565 | break; | ||
| 2566 | |||
| 2567 | case TCP_QUEUE_SEQ: | ||
| 2568 | if (sk->sk_state != TCP_CLOSE) | ||
| 2569 | err = -EPERM; | ||
| 2570 | else if (tp->repair_queue == TCP_SEND_QUEUE) | ||
| 2571 | tp->write_seq = val; | ||
| 2572 | else if (tp->repair_queue == TCP_RECV_QUEUE) | ||
| 2573 | tp->rcv_nxt = val; | ||
| 2574 | else | ||
| 2575 | err = -EINVAL; | ||
| 2576 | break; | ||
| 2577 | |||
| 2578 | case TCP_REPAIR_OPTIONS: | ||
| 2579 | if (!tp->repair) | ||
| 2580 | err = -EINVAL; | ||
| 2581 | else if (sk->sk_state == TCP_ESTABLISHED) | ||
| 2582 | err = tcp_repair_options_est(tp, | ||
| 2583 | (struct tcp_repair_opt __user *)optval, | ||
| 2584 | optlen); | ||
| 2585 | else | ||
| 2586 | err = -EPERM; | ||
| 2587 | break; | 2299 | break; |
| 2588 | 2300 | ||
| 2589 | case TCP_CORK: | 2301 | case TCP_CORK: |
| @@ -2698,18 +2410,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
| 2698 | /* Cap the max timeout in ms TCP will retry/retrans | 2410 | /* Cap the max timeout in ms TCP will retry/retrans |
| 2699 | * before giving up and aborting (ETIMEDOUT) a connection. | 2411 | * before giving up and aborting (ETIMEDOUT) a connection. |
| 2700 | */ | 2412 | */ |
| 2701 | if (val < 0) | 2413 | icsk->icsk_user_timeout = msecs_to_jiffies(val); |
| 2702 | err = -EINVAL; | ||
| 2703 | else | ||
| 2704 | icsk->icsk_user_timeout = msecs_to_jiffies(val); | ||
| 2705 | break; | ||
| 2706 | |||
| 2707 | case TCP_FASTOPEN: | ||
| 2708 | if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | | ||
| 2709 | TCPF_LISTEN))) | ||
| 2710 | err = fastopen_init_queue(sk, val); | ||
| 2711 | else | ||
| 2712 | err = -EINVAL; | ||
| 2713 | break; | 2414 | break; |
| 2714 | default: | 2415 | default: |
| 2715 | err = -ENOPROTOOPT; | 2416 | err = -ENOPROTOOPT; |
| @@ -2723,7 +2424,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
| 2723 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | 2424 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, |
| 2724 | unsigned int optlen) | 2425 | unsigned int optlen) |
| 2725 | { | 2426 | { |
| 2726 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2427 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 2727 | 2428 | ||
| 2728 | if (level != SOL_TCP) | 2429 | if (level != SOL_TCP) |
| 2729 | return icsk->icsk_af_ops->setsockopt(sk, level, optname, | 2430 | return icsk->icsk_af_ops->setsockopt(sk, level, optname, |
| @@ -2745,9 +2446,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); | |||
| 2745 | #endif | 2446 | #endif |
| 2746 | 2447 | ||
| 2747 | /* Return information about state of tcp endpoint in API format. */ | 2448 | /* Return information about state of tcp endpoint in API format. */ |
| 2748 | void tcp_get_info(const struct sock *sk, struct tcp_info *info) | 2449 | void tcp_get_info(struct sock *sk, struct tcp_info *info) |
| 2749 | { | 2450 | { |
| 2750 | const struct tcp_sock *tp = tcp_sk(sk); | 2451 | struct tcp_sock *tp = tcp_sk(sk); |
| 2751 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2452 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 2752 | u32 now = tcp_time_stamp; | 2453 | u32 now = tcp_time_stamp; |
| 2753 | 2454 | ||
| @@ -2769,12 +2470,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) | |||
| 2769 | info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; | 2470 | info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; |
| 2770 | } | 2471 | } |
| 2771 | 2472 | ||
| 2772 | if (tp->ecn_flags & TCP_ECN_OK) | 2473 | if (tp->ecn_flags&TCP_ECN_OK) |
| 2773 | info->tcpi_options |= TCPI_OPT_ECN; | 2474 | info->tcpi_options |= TCPI_OPT_ECN; |
| 2774 | if (tp->ecn_flags & TCP_ECN_SEEN) | ||
| 2775 | info->tcpi_options |= TCPI_OPT_ECN_SEEN; | ||
| 2776 | if (tp->syn_data_acked) | ||
| 2777 | info->tcpi_options |= TCPI_OPT_SYN_DATA; | ||
| 2778 | 2475 | ||
| 2779 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); | 2476 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); |
| 2780 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); | 2477 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); |
| @@ -2832,8 +2529,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
| 2832 | val = tp->mss_cache; | 2529 | val = tp->mss_cache; |
| 2833 | if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) | 2530 | if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) |
| 2834 | val = tp->rx_opt.user_mss; | 2531 | val = tp->rx_opt.user_mss; |
| 2835 | if (tp->repair) | ||
| 2836 | val = tp->rx_opt.mss_clamp; | ||
| 2837 | break; | 2532 | break; |
| 2838 | case TCP_NODELAY: | 2533 | case TCP_NODELAY: |
| 2839 | val = !!(tp->nonagle&TCP_NAGLE_OFF); | 2534 | val = !!(tp->nonagle&TCP_NAGLE_OFF); |
| @@ -2936,26 +2631,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
| 2936 | val = tp->thin_dupack; | 2631 | val = tp->thin_dupack; |
| 2937 | break; | 2632 | break; |
| 2938 | 2633 | ||
| 2939 | case TCP_REPAIR: | ||
| 2940 | val = tp->repair; | ||
| 2941 | break; | ||
| 2942 | |||
| 2943 | case TCP_REPAIR_QUEUE: | ||
| 2944 | if (tp->repair) | ||
| 2945 | val = tp->repair_queue; | ||
| 2946 | else | ||
| 2947 | return -EINVAL; | ||
| 2948 | break; | ||
| 2949 | |||
| 2950 | case TCP_QUEUE_SEQ: | ||
| 2951 | if (tp->repair_queue == TCP_SEND_QUEUE) | ||
| 2952 | val = tp->write_seq; | ||
| 2953 | else if (tp->repair_queue == TCP_RECV_QUEUE) | ||
| 2954 | val = tp->rcv_nxt; | ||
| 2955 | else | ||
| 2956 | return -EINVAL; | ||
| 2957 | break; | ||
| 2958 | |||
| 2959 | case TCP_USER_TIMEOUT: | 2634 | case TCP_USER_TIMEOUT: |
| 2960 | val = jiffies_to_msecs(icsk->icsk_user_timeout); | 2635 | val = jiffies_to_msecs(icsk->icsk_user_timeout); |
| 2961 | break; | 2636 | break; |
| @@ -2994,12 +2669,11 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, | |||
| 2994 | EXPORT_SYMBOL(compat_tcp_getsockopt); | 2669 | EXPORT_SYMBOL(compat_tcp_getsockopt); |
| 2995 | #endif | 2670 | #endif |
| 2996 | 2671 | ||
| 2997 | struct sk_buff *tcp_tso_segment(struct sk_buff *skb, | 2672 | struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) |
| 2998 | netdev_features_t features) | ||
| 2999 | { | 2673 | { |
| 3000 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 2674 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
| 3001 | struct tcphdr *th; | 2675 | struct tcphdr *th; |
| 3002 | unsigned int thlen; | 2676 | unsigned thlen; |
| 3003 | unsigned int seq; | 2677 | unsigned int seq; |
| 3004 | __be32 delta; | 2678 | __be32 delta; |
| 3005 | unsigned int oldlen; | 2679 | unsigned int oldlen; |
| @@ -3198,25 +2872,26 @@ EXPORT_SYMBOL(tcp_gro_complete); | |||
| 3198 | 2872 | ||
| 3199 | #ifdef CONFIG_TCP_MD5SIG | 2873 | #ifdef CONFIG_TCP_MD5SIG |
| 3200 | static unsigned long tcp_md5sig_users; | 2874 | static unsigned long tcp_md5sig_users; |
| 3201 | static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool; | 2875 | static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool; |
| 3202 | static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); | 2876 | static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); |
| 3203 | 2877 | ||
| 3204 | static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) | 2878 | static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool) |
| 3205 | { | 2879 | { |
| 3206 | int cpu; | 2880 | int cpu; |
| 3207 | |||
| 3208 | for_each_possible_cpu(cpu) { | 2881 | for_each_possible_cpu(cpu) { |
| 3209 | struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); | 2882 | struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); |
| 3210 | 2883 | if (p) { | |
| 3211 | if (p->md5_desc.tfm) | 2884 | if (p->md5_desc.tfm) |
| 3212 | crypto_free_hash(p->md5_desc.tfm); | 2885 | crypto_free_hash(p->md5_desc.tfm); |
| 2886 | kfree(p); | ||
| 2887 | } | ||
| 3213 | } | 2888 | } |
| 3214 | free_percpu(pool); | 2889 | free_percpu(pool); |
| 3215 | } | 2890 | } |
| 3216 | 2891 | ||
| 3217 | void tcp_free_md5sig_pool(void) | 2892 | void tcp_free_md5sig_pool(void) |
| 3218 | { | 2893 | { |
| 3219 | struct tcp_md5sig_pool __percpu *pool = NULL; | 2894 | struct tcp_md5sig_pool * __percpu *pool = NULL; |
| 3220 | 2895 | ||
| 3221 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2896 | spin_lock_bh(&tcp_md5sig_pool_lock); |
| 3222 | if (--tcp_md5sig_users == 0) { | 2897 | if (--tcp_md5sig_users == 0) { |
| @@ -3229,24 +2904,30 @@ void tcp_free_md5sig_pool(void) | |||
| 3229 | } | 2904 | } |
| 3230 | EXPORT_SYMBOL(tcp_free_md5sig_pool); | 2905 | EXPORT_SYMBOL(tcp_free_md5sig_pool); |
| 3231 | 2906 | ||
| 3232 | static struct tcp_md5sig_pool __percpu * | 2907 | static struct tcp_md5sig_pool * __percpu * |
| 3233 | __tcp_alloc_md5sig_pool(struct sock *sk) | 2908 | __tcp_alloc_md5sig_pool(struct sock *sk) |
| 3234 | { | 2909 | { |
| 3235 | int cpu; | 2910 | int cpu; |
| 3236 | struct tcp_md5sig_pool __percpu *pool; | 2911 | struct tcp_md5sig_pool * __percpu *pool; |
| 3237 | 2912 | ||
| 3238 | pool = alloc_percpu(struct tcp_md5sig_pool); | 2913 | pool = alloc_percpu(struct tcp_md5sig_pool *); |
| 3239 | if (!pool) | 2914 | if (!pool) |
| 3240 | return NULL; | 2915 | return NULL; |
| 3241 | 2916 | ||
| 3242 | for_each_possible_cpu(cpu) { | 2917 | for_each_possible_cpu(cpu) { |
| 2918 | struct tcp_md5sig_pool *p; | ||
| 3243 | struct crypto_hash *hash; | 2919 | struct crypto_hash *hash; |
| 3244 | 2920 | ||
| 2921 | p = kzalloc(sizeof(*p), sk->sk_allocation); | ||
| 2922 | if (!p) | ||
| 2923 | goto out_free; | ||
| 2924 | *per_cpu_ptr(pool, cpu) = p; | ||
| 2925 | |||
| 3245 | hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); | 2926 | hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); |
| 3246 | if (!hash || IS_ERR(hash)) | 2927 | if (!hash || IS_ERR(hash)) |
| 3247 | goto out_free; | 2928 | goto out_free; |
| 3248 | 2929 | ||
| 3249 | per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; | 2930 | p->md5_desc.tfm = hash; |
| 3250 | } | 2931 | } |
| 3251 | return pool; | 2932 | return pool; |
| 3252 | out_free: | 2933 | out_free: |
| @@ -3254,16 +2935,16 @@ out_free: | |||
| 3254 | return NULL; | 2935 | return NULL; |
| 3255 | } | 2936 | } |
| 3256 | 2937 | ||
| 3257 | struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) | 2938 | struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk) |
| 3258 | { | 2939 | { |
| 3259 | struct tcp_md5sig_pool __percpu *pool; | 2940 | struct tcp_md5sig_pool * __percpu *pool; |
| 3260 | bool alloc = false; | 2941 | int alloc = 0; |
| 3261 | 2942 | ||
| 3262 | retry: | 2943 | retry: |
| 3263 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2944 | spin_lock_bh(&tcp_md5sig_pool_lock); |
| 3264 | pool = tcp_md5sig_pool; | 2945 | pool = tcp_md5sig_pool; |
| 3265 | if (tcp_md5sig_users++ == 0) { | 2946 | if (tcp_md5sig_users++ == 0) { |
| 3266 | alloc = true; | 2947 | alloc = 1; |
| 3267 | spin_unlock_bh(&tcp_md5sig_pool_lock); | 2948 | spin_unlock_bh(&tcp_md5sig_pool_lock); |
| 3268 | } else if (!pool) { | 2949 | } else if (!pool) { |
| 3269 | tcp_md5sig_users--; | 2950 | tcp_md5sig_users--; |
| @@ -3275,7 +2956,7 @@ retry: | |||
| 3275 | 2956 | ||
| 3276 | if (alloc) { | 2957 | if (alloc) { |
| 3277 | /* we cannot hold spinlock here because this may sleep. */ | 2958 | /* we cannot hold spinlock here because this may sleep. */ |
| 3278 | struct tcp_md5sig_pool __percpu *p; | 2959 | struct tcp_md5sig_pool * __percpu *p; |
| 3279 | 2960 | ||
| 3280 | p = __tcp_alloc_md5sig_pool(sk); | 2961 | p = __tcp_alloc_md5sig_pool(sk); |
| 3281 | spin_lock_bh(&tcp_md5sig_pool_lock); | 2962 | spin_lock_bh(&tcp_md5sig_pool_lock); |
| @@ -3308,7 +2989,7 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool); | |||
| 3308 | */ | 2989 | */ |
| 3309 | struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) | 2990 | struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) |
| 3310 | { | 2991 | { |
| 3311 | struct tcp_md5sig_pool __percpu *p; | 2992 | struct tcp_md5sig_pool * __percpu *p; |
| 3312 | 2993 | ||
| 3313 | local_bh_disable(); | 2994 | local_bh_disable(); |
| 3314 | 2995 | ||
| @@ -3319,7 +3000,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) | |||
| 3319 | spin_unlock(&tcp_md5sig_pool_lock); | 3000 | spin_unlock(&tcp_md5sig_pool_lock); |
| 3320 | 3001 | ||
| 3321 | if (p) | 3002 | if (p) |
| 3322 | return this_cpu_ptr(p); | 3003 | return *this_cpu_ptr(p); |
| 3323 | 3004 | ||
| 3324 | local_bh_enable(); | 3005 | local_bh_enable(); |
| 3325 | return NULL; | 3006 | return NULL; |
| @@ -3334,32 +3015,30 @@ void tcp_put_md5sig_pool(void) | |||
| 3334 | EXPORT_SYMBOL(tcp_put_md5sig_pool); | 3015 | EXPORT_SYMBOL(tcp_put_md5sig_pool); |
| 3335 | 3016 | ||
| 3336 | int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, | 3017 | int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, |
| 3337 | const struct tcphdr *th) | 3018 | struct tcphdr *th) |
| 3338 | { | 3019 | { |
| 3339 | struct scatterlist sg; | 3020 | struct scatterlist sg; |
| 3340 | struct tcphdr hdr; | ||
| 3341 | int err; | 3021 | int err; |
| 3342 | 3022 | ||
| 3343 | /* We are not allowed to change tcphdr, make a local copy */ | 3023 | __sum16 old_checksum = th->check; |
| 3344 | memcpy(&hdr, th, sizeof(hdr)); | 3024 | th->check = 0; |
| 3345 | hdr.check = 0; | ||
| 3346 | |||
| 3347 | /* options aren't included in the hash */ | 3025 | /* options aren't included in the hash */ |
| 3348 | sg_init_one(&sg, &hdr, sizeof(hdr)); | 3026 | sg_init_one(&sg, th, sizeof(struct tcphdr)); |
| 3349 | err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); | 3027 | err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); |
| 3028 | th->check = old_checksum; | ||
| 3350 | return err; | 3029 | return err; |
| 3351 | } | 3030 | } |
| 3352 | EXPORT_SYMBOL(tcp_md5_hash_header); | 3031 | EXPORT_SYMBOL(tcp_md5_hash_header); |
| 3353 | 3032 | ||
| 3354 | int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, | 3033 | int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, |
| 3355 | const struct sk_buff *skb, unsigned int header_len) | 3034 | struct sk_buff *skb, unsigned header_len) |
| 3356 | { | 3035 | { |
| 3357 | struct scatterlist sg; | 3036 | struct scatterlist sg; |
| 3358 | const struct tcphdr *tp = tcp_hdr(skb); | 3037 | const struct tcphdr *tp = tcp_hdr(skb); |
| 3359 | struct hash_desc *desc = &hp->md5_desc; | 3038 | struct hash_desc *desc = &hp->md5_desc; |
| 3360 | unsigned int i; | 3039 | unsigned i; |
| 3361 | const unsigned int head_data_len = skb_headlen(skb) > header_len ? | 3040 | const unsigned head_data_len = skb_headlen(skb) > header_len ? |
| 3362 | skb_headlen(skb) - header_len : 0; | 3041 | skb_headlen(skb) - header_len : 0; |
| 3363 | const struct skb_shared_info *shi = skb_shinfo(skb); | 3042 | const struct skb_shared_info *shi = skb_shinfo(skb); |
| 3364 | struct sk_buff *frag_iter; | 3043 | struct sk_buff *frag_iter; |
| 3365 | 3044 | ||
| @@ -3371,9 +3050,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, | |||
| 3371 | 3050 | ||
| 3372 | for (i = 0; i < shi->nr_frags; ++i) { | 3051 | for (i = 0; i < shi->nr_frags; ++i) { |
| 3373 | const struct skb_frag_struct *f = &shi->frags[i]; | 3052 | const struct skb_frag_struct *f = &shi->frags[i]; |
| 3374 | struct page *page = skb_frag_page(f); | 3053 | sg_set_page(&sg, f->page, f->size, f->page_offset); |
| 3375 | sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); | 3054 | if (crypto_hash_update(desc, &sg, f->size)) |
| 3376 | if (crypto_hash_update(desc, &sg, skb_frag_size(f))) | ||
| 3377 | return 1; | 3055 | return 1; |
| 3378 | } | 3056 | } |
| 3379 | 3057 | ||
| @@ -3385,7 +3063,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, | |||
| 3385 | } | 3063 | } |
| 3386 | EXPORT_SYMBOL(tcp_md5_hash_skb_data); | 3064 | EXPORT_SYMBOL(tcp_md5_hash_skb_data); |
| 3387 | 3065 | ||
| 3388 | int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) | 3066 | int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) |
| 3389 | { | 3067 | { |
| 3390 | struct scatterlist sg; | 3068 | struct scatterlist sg; |
| 3391 | 3069 | ||
| @@ -3396,7 +3074,8 @@ EXPORT_SYMBOL(tcp_md5_hash_key); | |||
| 3396 | 3074 | ||
| 3397 | #endif | 3075 | #endif |
| 3398 | 3076 | ||
| 3399 | /* Each Responder maintains up to two secret values concurrently for | 3077 | /** |
| 3078 | * Each Responder maintains up to two secret values concurrently for | ||
| 3400 | * efficient secret rollover. Each secret value has 4 states: | 3079 | * efficient secret rollover. Each secret value has 4 states: |
| 3401 | * | 3080 | * |
| 3402 | * Generating. (tcp_secret_generating != tcp_secret_primary) | 3081 | * Generating. (tcp_secret_generating != tcp_secret_primary) |
| @@ -3526,15 +3205,11 @@ EXPORT_SYMBOL(tcp_cookie_generator); | |||
| 3526 | 3205 | ||
| 3527 | void tcp_done(struct sock *sk) | 3206 | void tcp_done(struct sock *sk) |
| 3528 | { | 3207 | { |
| 3529 | struct request_sock *req = tcp_sk(sk)->fastopen_rsk; | ||
| 3530 | |||
| 3531 | if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) | 3208 | if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) |
| 3532 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); | 3209 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
| 3533 | 3210 | ||
| 3534 | tcp_set_state(sk, TCP_CLOSE); | 3211 | tcp_set_state(sk, TCP_CLOSE); |
| 3535 | tcp_clear_xmit_timers(sk); | 3212 | tcp_clear_xmit_timers(sk); |
| 3536 | if (req != NULL) | ||
| 3537 | reqsk_fastopen_remove(sk, req, false); | ||
| 3538 | 3213 | ||
| 3539 | sk->sk_shutdown = SHUTDOWN_MASK; | 3214 | sk->sk_shutdown = SHUTDOWN_MASK; |
| 3540 | 3215 | ||
| @@ -3550,34 +3225,18 @@ extern struct tcp_congestion_ops tcp_reno; | |||
| 3550 | static __initdata unsigned long thash_entries; | 3225 | static __initdata unsigned long thash_entries; |
| 3551 | static int __init set_thash_entries(char *str) | 3226 | static int __init set_thash_entries(char *str) |
| 3552 | { | 3227 | { |
| 3553 | ssize_t ret; | ||
| 3554 | |||
| 3555 | if (!str) | 3228 | if (!str) |
| 3556 | return 0; | 3229 | return 0; |
| 3557 | 3230 | thash_entries = simple_strtoul(str, &str, 0); | |
| 3558 | ret = kstrtoul(str, 0, &thash_entries); | ||
| 3559 | if (ret) | ||
| 3560 | return 0; | ||
| 3561 | |||
| 3562 | return 1; | 3231 | return 1; |
| 3563 | } | 3232 | } |
| 3564 | __setup("thash_entries=", set_thash_entries); | 3233 | __setup("thash_entries=", set_thash_entries); |
| 3565 | 3234 | ||
| 3566 | void tcp_init_mem(struct net *net) | ||
| 3567 | { | ||
| 3568 | unsigned long limit = nr_free_buffer_pages() / 8; | ||
| 3569 | limit = max(limit, 128UL); | ||
| 3570 | net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; | ||
| 3571 | net->ipv4.sysctl_tcp_mem[1] = limit; | ||
| 3572 | net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; | ||
| 3573 | } | ||
| 3574 | |||
| 3575 | void __init tcp_init(void) | 3235 | void __init tcp_init(void) |
| 3576 | { | 3236 | { |
| 3577 | struct sk_buff *skb = NULL; | 3237 | struct sk_buff *skb = NULL; |
| 3578 | unsigned long limit; | 3238 | unsigned long limit; |
| 3579 | int max_rshare, max_wshare, cnt; | 3239 | int i, max_share, cnt; |
| 3580 | unsigned int i; | ||
| 3581 | unsigned long jiffy = jiffies; | 3240 | unsigned long jiffy = jiffies; |
| 3582 | 3241 | ||
| 3583 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); | 3242 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); |
| @@ -3598,11 +3257,11 @@ void __init tcp_init(void) | |||
| 3598 | alloc_large_system_hash("TCP established", | 3257 | alloc_large_system_hash("TCP established", |
| 3599 | sizeof(struct inet_ehash_bucket), | 3258 | sizeof(struct inet_ehash_bucket), |
| 3600 | thash_entries, | 3259 | thash_entries, |
| 3601 | 17, /* one slot per 128 KB of memory */ | 3260 | (totalram_pages >= 128 * 1024) ? |
| 3261 | 13 : 15, | ||
| 3602 | 0, | 3262 | 0, |
| 3603 | NULL, | 3263 | NULL, |
| 3604 | &tcp_hashinfo.ehash_mask, | 3264 | &tcp_hashinfo.ehash_mask, |
| 3605 | 0, | ||
| 3606 | thash_entries ? 0 : 512 * 1024); | 3265 | thash_entries ? 0 : 512 * 1024); |
| 3607 | for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { | 3266 | for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { |
| 3608 | INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); | 3267 | INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); |
| @@ -3614,13 +3273,13 @@ void __init tcp_init(void) | |||
| 3614 | alloc_large_system_hash("TCP bind", | 3273 | alloc_large_system_hash("TCP bind", |
| 3615 | sizeof(struct inet_bind_hashbucket), | 3274 | sizeof(struct inet_bind_hashbucket), |
| 3616 | tcp_hashinfo.ehash_mask + 1, | 3275 | tcp_hashinfo.ehash_mask + 1, |
| 3617 | 17, /* one slot per 128 KB of memory */ | 3276 | (totalram_pages >= 128 * 1024) ? |
| 3277 | 13 : 15, | ||
| 3618 | 0, | 3278 | 0, |
| 3619 | &tcp_hashinfo.bhash_size, | 3279 | &tcp_hashinfo.bhash_size, |
| 3620 | NULL, | 3280 | NULL, |
| 3621 | 0, | ||
| 3622 | 64 * 1024); | 3281 | 64 * 1024); |
| 3623 | tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; | 3282 | tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; |
| 3624 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { | 3283 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { |
| 3625 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); | 3284 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); |
| 3626 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); | 3285 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); |
| @@ -3633,24 +3292,27 @@ void __init tcp_init(void) | |||
| 3633 | sysctl_tcp_max_orphans = cnt / 2; | 3292 | sysctl_tcp_max_orphans = cnt / 2; |
| 3634 | sysctl_max_syn_backlog = max(128, cnt / 256); | 3293 | sysctl_max_syn_backlog = max(128, cnt / 256); |
| 3635 | 3294 | ||
| 3636 | tcp_init_mem(&init_net); | 3295 | limit = nr_free_buffer_pages() / 8; |
| 3296 | limit = max(limit, 128UL); | ||
| 3297 | sysctl_tcp_mem[0] = limit / 4 * 3; | ||
| 3298 | sysctl_tcp_mem[1] = limit; | ||
| 3299 | sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; | ||
| 3300 | |||
| 3637 | /* Set per-socket limits to no more than 1/128 the pressure threshold */ | 3301 | /* Set per-socket limits to no more than 1/128 the pressure threshold */ |
| 3638 | limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); | 3302 | limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); |
| 3639 | max_wshare = min(4UL*1024*1024, limit); | 3303 | max_share = min(4UL*1024*1024, limit); |
| 3640 | max_rshare = min(6UL*1024*1024, limit); | ||
| 3641 | 3304 | ||
| 3642 | sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; | 3305 | sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; |
| 3643 | sysctl_tcp_wmem[1] = 16*1024; | 3306 | sysctl_tcp_wmem[1] = 16*1024; |
| 3644 | sysctl_tcp_wmem[2] = max(64*1024, max_wshare); | 3307 | sysctl_tcp_wmem[2] = max(64*1024, max_share); |
| 3645 | 3308 | ||
| 3646 | sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; | 3309 | sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; |
| 3647 | sysctl_tcp_rmem[1] = 87380; | 3310 | sysctl_tcp_rmem[1] = 87380; |
| 3648 | sysctl_tcp_rmem[2] = max(87380, max_rshare); | 3311 | sysctl_tcp_rmem[2] = max(87380, max_share); |
| 3649 | 3312 | ||
| 3650 | pr_info("Hash tables configured (established %u bind %u)\n", | 3313 | printk(KERN_INFO "TCP: Hash tables configured " |
| 3651 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); | 3314 | "(established %u bind %u)\n", |
| 3652 | 3315 | tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); | |
| 3653 | tcp_metrics_init(); | ||
| 3654 | 3316 | ||
| 3655 | tcp_register_congestion_control(&tcp_reno); | 3317 | tcp_register_congestion_control(&tcp_reno); |
| 3656 | 3318 | ||
| @@ -3662,5 +3324,108 @@ void __init tcp_init(void) | |||
| 3662 | tcp_secret_primary = &tcp_secret_one; | 3324 | tcp_secret_primary = &tcp_secret_one; |
| 3663 | tcp_secret_retiring = &tcp_secret_two; | 3325 | tcp_secret_retiring = &tcp_secret_two; |
| 3664 | tcp_secret_secondary = &tcp_secret_two; | 3326 | tcp_secret_secondary = &tcp_secret_two; |
| 3665 | tcp_tasklet_init(); | 3327 | } |
| 3328 | |||
| 3329 | static int tcp_is_local(struct net *net, __be32 addr) { | ||
| 3330 | struct rtable *rt; | ||
| 3331 | struct flowi4 fl4 = { .daddr = addr }; | ||
| 3332 | rt = ip_route_output_key(net, &fl4); | ||
| 3333 | if (IS_ERR_OR_NULL(rt)) | ||
| 3334 | return 0; | ||
| 3335 | return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK); | ||
| 3336 | } | ||
| 3337 | |||
| 3338 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 3339 | static int tcp_is_local6(struct net *net, struct in6_addr *addr) { | ||
| 3340 | struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0); | ||
| 3341 | return rt6 && rt6->rt6i_dev && (rt6->rt6i_dev->flags & IFF_LOOPBACK); | ||
| 3342 | } | ||
| 3343 | #endif | ||
| 3344 | |||
| 3345 | /* | ||
| 3346 | * tcp_nuke_addr - destroy all sockets on the given local address | ||
| 3347 | * if local address is the unspecified address (0.0.0.0 or ::), destroy all | ||
| 3348 | * sockets with local addresses that are not configured. | ||
| 3349 | */ | ||
| 3350 | int tcp_nuke_addr(struct net *net, struct sockaddr *addr) | ||
| 3351 | { | ||
| 3352 | int family = addr->sa_family; | ||
| 3353 | unsigned int bucket; | ||
| 3354 | |||
| 3355 | struct in_addr *in; | ||
| 3356 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 3357 | struct in6_addr *in6; | ||
| 3358 | #endif | ||
| 3359 | if (family == AF_INET) { | ||
| 3360 | in = &((struct sockaddr_in *)addr)->sin_addr; | ||
| 3361 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 3362 | } else if (family == AF_INET6) { | ||
| 3363 | in6 = &((struct sockaddr_in6 *)addr)->sin6_addr; | ||
| 3364 | #endif | ||
| 3365 | } else { | ||
| 3366 | return -EAFNOSUPPORT; | ||
| 3367 | } | ||
| 3368 | |||
| 3369 | for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) { | ||
| 3370 | struct hlist_nulls_node *node; | ||
| 3371 | struct sock *sk; | ||
| 3372 | spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket); | ||
| 3373 | |||
| 3374 | restart: | ||
| 3375 | spin_lock_bh(lock); | ||
| 3376 | sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) { | ||
| 3377 | struct inet_sock *inet = inet_sk(sk); | ||
| 3378 | |||
| 3379 | if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT) | ||
| 3380 | continue; | ||
| 3381 | if (sock_flag(sk, SOCK_DEAD)) | ||
| 3382 | continue; | ||
| 3383 | |||
| 3384 | if (family == AF_INET) { | ||
| 3385 | __be32 s4 = inet->inet_rcv_saddr; | ||
| 3386 | if (s4 == LOOPBACK4_IPV6) | ||
| 3387 | continue; | ||
| 3388 | |||
| 3389 | if (in->s_addr != s4 && | ||
| 3390 | !(in->s_addr == INADDR_ANY && | ||
| 3391 | !tcp_is_local(net, s4))) | ||
| 3392 | continue; | ||
| 3393 | } | ||
| 3394 | |||
| 3395 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
| 3396 | if (family == AF_INET6) { | ||
| 3397 | struct in6_addr *s6; | ||
| 3398 | if (!inet->pinet6) | ||
| 3399 | continue; | ||
| 3400 | |||
| 3401 | s6 = &inet->pinet6->rcv_saddr; | ||
| 3402 | if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED) | ||
| 3403 | continue; | ||
| 3404 | |||
| 3405 | if (!ipv6_addr_equal(in6, s6) && | ||
| 3406 | !(ipv6_addr_equal(in6, &in6addr_any) && | ||
| 3407 | !tcp_is_local6(net, s6))) | ||
| 3408 | continue; | ||
| 3409 | } | ||
| 3410 | #endif | ||
| 3411 | |||
| 3412 | sock_hold(sk); | ||
| 3413 | spin_unlock_bh(lock); | ||
| 3414 | |||
| 3415 | local_bh_disable(); | ||
| 3416 | bh_lock_sock(sk); | ||
| 3417 | sk->sk_err = ETIMEDOUT; | ||
| 3418 | sk->sk_error_report(sk); | ||
| 3419 | |||
| 3420 | tcp_done(sk); | ||
| 3421 | bh_unlock_sock(sk); | ||
| 3422 | local_bh_enable(); | ||
| 3423 | sock_put(sk); | ||
| 3424 | |||
| 3425 | goto restart; | ||
| 3426 | } | ||
| 3427 | spin_unlock_bh(lock); | ||
| 3428 | } | ||
| 3429 | |||
| 3430 | return 0; | ||
| 3666 | } | 3431 | } |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index f45e1c24244..6187eb4d1dc 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
| @@ -63,6 +63,7 @@ static inline void bictcp_reset(struct bictcp *ca) | |||
| 63 | { | 63 | { |
| 64 | ca->cnt = 0; | 64 | ca->cnt = 0; |
| 65 | ca->last_max_cwnd = 0; | 65 | ca->last_max_cwnd = 0; |
| 66 | ca->loss_cwnd = 0; | ||
| 66 | ca->last_cwnd = 0; | 67 | ca->last_cwnd = 0; |
| 67 | ca->last_time = 0; | 68 | ca->last_time = 0; |
| 68 | ca->epoch_start = 0; | 69 | ca->epoch_start = 0; |
| @@ -71,11 +72,7 @@ static inline void bictcp_reset(struct bictcp *ca) | |||
| 71 | 72 | ||
| 72 | static void bictcp_init(struct sock *sk) | 73 | static void bictcp_init(struct sock *sk) |
| 73 | { | 74 | { |
| 74 | struct bictcp *ca = inet_csk_ca(sk); | 75 | bictcp_reset(inet_csk_ca(sk)); |
| 75 | |||
| 76 | bictcp_reset(ca); | ||
| 77 | ca->loss_cwnd = 0; | ||
| 78 | |||
| 79 | if (initial_ssthresh) | 76 | if (initial_ssthresh) |
| 80 | tcp_sk(sk)->snd_ssthresh = initial_ssthresh; | 77 | tcp_sk(sk)->snd_ssthresh = initial_ssthresh; |
| 81 | } | 78 | } |
| @@ -130,7 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
| 130 | } | 127 | } |
| 131 | 128 | ||
| 132 | /* if in slow start or link utilization is very low */ | 129 | /* if in slow start or link utilization is very low */ |
| 133 | if (ca->last_max_cwnd == 0) { | 130 | if (ca->loss_cwnd == 0) { |
| 134 | if (ca->cnt > 20) /* increase cwnd 5% per RTT */ | 131 | if (ca->cnt > 20) /* increase cwnd 5% per RTT */ |
| 135 | ca->cnt = 20; | 132 | ca->cnt = 20; |
| 136 | } | 133 | } |
| @@ -188,7 +185,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) | |||
| 188 | { | 185 | { |
| 189 | const struct tcp_sock *tp = tcp_sk(sk); | 186 | const struct tcp_sock *tp = tcp_sk(sk); |
| 190 | const struct bictcp *ca = inet_csk_ca(sk); | 187 | const struct bictcp *ca = inet_csk_ca(sk); |
| 191 | return max(tp->snd_cwnd, ca->loss_cwnd); | 188 | return max(tp->snd_cwnd, ca->last_max_cwnd); |
| 192 | } | 189 | } |
| 193 | 190 | ||
| 194 | static void bictcp_state(struct sock *sk, u8 new_state) | 191 | static void bictcp_state(struct sock *sk, u8 new_state) |
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 291f2ed7cc3..850c737e08e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
| @@ -1,13 +1,11 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Plugable TCP congestion control support and newReno | 2 | * Plugable TCP congestion control support and newReno |
| 3 | * congestion control. | 3 | * congestion control. |
| 4 | * Based on ideas from I/O scheduler support and Web100. | 4 | * Based on ideas from I/O scheduler suport and Web100. |
| 5 | * | 5 | * |
| 6 | * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> | 6 | * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> |
| 7 | */ | 7 | */ |
| 8 | 8 | ||
| 9 | #define pr_fmt(fmt) "TCP: " fmt | ||
| 10 | |||
| 11 | #include <linux/module.h> | 9 | #include <linux/module.h> |
| 12 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
| 13 | #include <linux/types.h> | 11 | #include <linux/types.h> |
| @@ -43,17 +41,18 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) | |||
| 43 | 41 | ||
| 44 | /* all algorithms must implement ssthresh and cong_avoid ops */ | 42 | /* all algorithms must implement ssthresh and cong_avoid ops */ |
| 45 | if (!ca->ssthresh || !ca->cong_avoid) { | 43 | if (!ca->ssthresh || !ca->cong_avoid) { |
| 46 | pr_err("%s does not implement required ops\n", ca->name); | 44 | printk(KERN_ERR "TCP %s does not implement required ops\n", |
| 45 | ca->name); | ||
| 47 | return -EINVAL; | 46 | return -EINVAL; |
| 48 | } | 47 | } |
| 49 | 48 | ||
| 50 | spin_lock(&tcp_cong_list_lock); | 49 | spin_lock(&tcp_cong_list_lock); |
| 51 | if (tcp_ca_find(ca->name)) { | 50 | if (tcp_ca_find(ca->name)) { |
| 52 | pr_notice("%s already registered\n", ca->name); | 51 | printk(KERN_NOTICE "TCP %s already registered\n", ca->name); |
| 53 | ret = -EEXIST; | 52 | ret = -EEXIST; |
| 54 | } else { | 53 | } else { |
| 55 | list_add_tail_rcu(&ca->list, &tcp_cong_list); | 54 | list_add_tail_rcu(&ca->list, &tcp_cong_list); |
| 56 | pr_info("%s registered\n", ca->name); | 55 | printk(KERN_INFO "TCP %s registered\n", ca->name); |
| 57 | } | 56 | } |
| 58 | spin_unlock(&tcp_cong_list_lock); | 57 | spin_unlock(&tcp_cong_list_lock); |
| 59 | 58 | ||
| @@ -259,8 +258,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) | |||
| 259 | if (!ca) | 258 | if (!ca) |
| 260 | err = -ENOENT; | 259 | err = -ENOENT; |
| 261 | 260 | ||
| 262 | else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || | 261 | else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) |
| 263 | ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) | ||
| 264 | err = -EPERM; | 262 | err = -EPERM; |
| 265 | 263 | ||
| 266 | else if (!try_module_get(ca->owner)) | 264 | else if (!try_module_get(ca->owner)) |
| @@ -281,21 +279,20 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) | |||
| 281 | /* RFC2861 Check whether we are limited by application or congestion window | 279 | /* RFC2861 Check whether we are limited by application or congestion window |
| 282 | * This is the inverse of cwnd check in tcp_tso_should_defer | 280 | * This is the inverse of cwnd check in tcp_tso_should_defer |
| 283 | */ | 281 | */ |
| 284 | bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) | 282 | int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) |
| 285 | { | 283 | { |
| 286 | const struct tcp_sock *tp = tcp_sk(sk); | 284 | const struct tcp_sock *tp = tcp_sk(sk); |
| 287 | u32 left; | 285 | u32 left; |
| 288 | 286 | ||
| 289 | if (in_flight >= tp->snd_cwnd) | 287 | if (in_flight >= tp->snd_cwnd) |
| 290 | return true; | 288 | return 1; |
| 291 | 289 | ||
| 292 | left = tp->snd_cwnd - in_flight; | 290 | left = tp->snd_cwnd - in_flight; |
| 293 | if (sk_can_gso(sk) && | 291 | if (sk_can_gso(sk) && |
| 294 | left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && | 292 | left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && |
| 295 | left * tp->mss_cache < sk->sk_gso_max_size && | 293 | left * tp->mss_cache < sk->sk_gso_max_size) |
| 296 | left < sk->sk_gso_max_segs) | 294 | return 1; |
| 297 | return true; | 295 | return left <= tcp_max_burst(tp); |
| 298 | return left <= tcp_max_tso_deferred_mss(tp); | ||
| 299 | } | 296 | } |
| 300 | EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); | 297 | EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); |
| 301 | 298 | ||
| @@ -309,7 +306,6 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); | |||
| 309 | void tcp_slow_start(struct tcp_sock *tp) | 306 | void tcp_slow_start(struct tcp_sock *tp) |
| 310 | { | 307 | { |
| 311 | int cnt; /* increase in packets */ | 308 | int cnt; /* increase in packets */ |
| 312 | unsigned int delta = 0; | ||
| 313 | 309 | ||
| 314 | /* RFC3465: ABC Slow start | 310 | /* RFC3465: ABC Slow start |
| 315 | * Increase only after a full MSS of bytes is acked | 311 | * Increase only after a full MSS of bytes is acked |
| @@ -336,9 +332,9 @@ void tcp_slow_start(struct tcp_sock *tp) | |||
| 336 | tp->snd_cwnd_cnt += cnt; | 332 | tp->snd_cwnd_cnt += cnt; |
| 337 | while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | 333 | while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { |
| 338 | tp->snd_cwnd_cnt -= tp->snd_cwnd; | 334 | tp->snd_cwnd_cnt -= tp->snd_cwnd; |
| 339 | delta++; | 335 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
| 336 | tp->snd_cwnd++; | ||
| 340 | } | 337 | } |
| 341 | tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp); | ||
| 342 | } | 338 | } |
| 343 | EXPORT_SYMBOL_GPL(tcp_slow_start); | 339 | EXPORT_SYMBOL_GPL(tcp_slow_start); |
| 344 | 340 | ||
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9077f441cb..f376b05cca8 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c | |||
| @@ -107,6 +107,7 @@ static inline void bictcp_reset(struct bictcp *ca) | |||
| 107 | { | 107 | { |
| 108 | ca->cnt = 0; | 108 | ca->cnt = 0; |
| 109 | ca->last_max_cwnd = 0; | 109 | ca->last_max_cwnd = 0; |
| 110 | ca->loss_cwnd = 0; | ||
| 110 | ca->last_cwnd = 0; | 111 | ca->last_cwnd = 0; |
| 111 | ca->last_time = 0; | 112 | ca->last_time = 0; |
| 112 | ca->bic_origin_point = 0; | 113 | ca->bic_origin_point = 0; |
| @@ -141,10 +142,7 @@ static inline void bictcp_hystart_reset(struct sock *sk) | |||
| 141 | 142 | ||
| 142 | static void bictcp_init(struct sock *sk) | 143 | static void bictcp_init(struct sock *sk) |
| 143 | { | 144 | { |
| 144 | struct bictcp *ca = inet_csk_ca(sk); | 145 | bictcp_reset(inet_csk_ca(sk)); |
| 145 | |||
| 146 | bictcp_reset(ca); | ||
| 147 | ca->loss_cwnd = 0; | ||
| 148 | 146 | ||
| 149 | if (hystart) | 147 | if (hystart) |
| 150 | bictcp_hystart_reset(sk); | 148 | bictcp_hystart_reset(sk); |
| @@ -277,7 +275,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
| 277 | * The initial growth of cubic function may be too conservative | 275 | * The initial growth of cubic function may be too conservative |
| 278 | * when the available bandwidth is still unknown. | 276 | * when the available bandwidth is still unknown. |
| 279 | */ | 277 | */ |
| 280 | if (ca->last_max_cwnd == 0 && ca->cnt > 20) | 278 | if (ca->loss_cwnd == 0 && ca->cnt > 20) |
| 281 | ca->cnt = 20; /* increase cwnd 5% per RTT */ | 279 | ca->cnt = 20; /* increase cwnd 5% per RTT */ |
| 282 | 280 | ||
| 283 | /* TCP Friendly */ | 281 | /* TCP Friendly */ |
| @@ -344,7 +342,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) | |||
| 344 | { | 342 | { |
| 345 | struct bictcp *ca = inet_csk_ca(sk); | 343 | struct bictcp *ca = inet_csk_ca(sk); |
| 346 | 344 | ||
| 347 | return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); | 345 | return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); |
| 348 | } | 346 | } |
| 349 | 347 | ||
| 350 | static void bictcp_state(struct sock *sk, u8 new_state) | 348 | static void bictcp_state(struct sock *sk, u8 new_state) |
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index ed3f2ad42e0..939edb3b8e4 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
| @@ -34,23 +34,11 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, | |||
| 34 | tcp_get_info(sk, info); | 34 | tcp_get_info(sk, info); |
| 35 | } | 35 | } |
| 36 | 36 | ||
| 37 | static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, | ||
| 38 | struct inet_diag_req_v2 *r, struct nlattr *bc) | ||
| 39 | { | ||
| 40 | inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); | ||
| 41 | } | ||
| 42 | |||
| 43 | static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, | ||
| 44 | struct inet_diag_req_v2 *req) | ||
| 45 | { | ||
| 46 | return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); | ||
| 47 | } | ||
| 48 | |||
| 49 | static const struct inet_diag_handler tcp_diag_handler = { | 37 | static const struct inet_diag_handler tcp_diag_handler = { |
| 50 | .dump = tcp_diag_dump, | 38 | .idiag_hashinfo = &tcp_hashinfo, |
| 51 | .dump_one = tcp_diag_dump_one, | ||
| 52 | .idiag_get_info = tcp_diag_get_info, | 39 | .idiag_get_info = tcp_diag_get_info, |
| 53 | .idiag_type = IPPROTO_TCP, | 40 | .idiag_type = TCPDIAG_GETSOCK, |
| 41 | .idiag_info_size = sizeof(struct tcp_info), | ||
| 54 | }; | 42 | }; |
| 55 | 43 | ||
| 56 | static int __init tcp_diag_init(void) | 44 | static int __init tcp_diag_init(void) |
| @@ -66,4 +54,4 @@ static void __exit tcp_diag_exit(void) | |||
| 66 | module_init(tcp_diag_init); | 54 | module_init(tcp_diag_init); |
| 67 | module_exit(tcp_diag_exit); | 55 | module_exit(tcp_diag_exit); |
| 68 | MODULE_LICENSE("GPL"); | 56 | MODULE_LICENSE("GPL"); |
| 69 | MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */); | 57 | MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, TCPDIAG_GETSOCK); |
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c deleted file mode 100644 index 8f7ef0ad80e..00000000000 --- a/net/ipv4/tcp_fastopen.c +++ /dev/null | |||
| @@ -1,92 +0,0 @@ | |||
| 1 | #include <linux/err.h> | ||
| 2 | #include <linux/init.h> | ||
| 3 | #include <linux/kernel.h> | ||
| 4 | #include <linux/list.h> | ||
| 5 | #include <linux/tcp.h> | ||
| 6 | #include <linux/rcupdate.h> | ||
| 7 | #include <linux/rculist.h> | ||
| 8 | #include <net/inetpeer.h> | ||
| 9 | #include <net/tcp.h> | ||
| 10 | |||
| 11 | int sysctl_tcp_fastopen __read_mostly; | ||
| 12 | |||
| 13 | struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; | ||
| 14 | |||
| 15 | static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); | ||
| 16 | |||
| 17 | static void tcp_fastopen_ctx_free(struct rcu_head *head) | ||
| 18 | { | ||
| 19 | struct tcp_fastopen_context *ctx = | ||
| 20 | container_of(head, struct tcp_fastopen_context, rcu); | ||
| 21 | crypto_free_cipher(ctx->tfm); | ||
| 22 | kfree(ctx); | ||
| 23 | } | ||
| 24 | |||
| 25 | int tcp_fastopen_reset_cipher(void *key, unsigned int len) | ||
| 26 | { | ||
| 27 | int err; | ||
| 28 | struct tcp_fastopen_context *ctx, *octx; | ||
| 29 | |||
| 30 | ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); | ||
| 31 | if (!ctx) | ||
| 32 | return -ENOMEM; | ||
| 33 | ctx->tfm = crypto_alloc_cipher("aes", 0, 0); | ||
| 34 | |||
| 35 | if (IS_ERR(ctx->tfm)) { | ||
| 36 | err = PTR_ERR(ctx->tfm); | ||
| 37 | error: kfree(ctx); | ||
| 38 | pr_err("TCP: TFO aes cipher alloc error: %d\n", err); | ||
| 39 | return err; | ||
| 40 | } | ||
| 41 | err = crypto_cipher_setkey(ctx->tfm, key, len); | ||
| 42 | if (err) { | ||
| 43 | pr_err("TCP: TFO cipher key error: %d\n", err); | ||
| 44 | crypto_free_cipher(ctx->tfm); | ||
| 45 | goto error; | ||
| 46 | } | ||
| 47 | memcpy(ctx->key, key, len); | ||
| 48 | |||
| 49 | spin_lock(&tcp_fastopen_ctx_lock); | ||
| 50 | |||
| 51 | octx = rcu_dereference_protected(tcp_fastopen_ctx, | ||
| 52 | lockdep_is_held(&tcp_fastopen_ctx_lock)); | ||
| 53 | rcu_assign_pointer(tcp_fastopen_ctx, ctx); | ||
| 54 | spin_unlock(&tcp_fastopen_ctx_lock); | ||
| 55 | |||
| 56 | if (octx) | ||
| 57 | call_rcu(&octx->rcu, tcp_fastopen_ctx_free); | ||
| 58 | return err; | ||
| 59 | } | ||
| 60 | |||
| 61 | /* Computes the fastopen cookie for the peer. | ||
| 62 | * The peer address is a 128 bits long (pad with zeros for IPv4). | ||
| 63 | * | ||
| 64 | * The caller must check foc->len to determine if a valid cookie | ||
| 65 | * has been generated successfully. | ||
| 66 | */ | ||
| 67 | void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) | ||
| 68 | { | ||
| 69 | __be32 peer_addr[4] = { addr, 0, 0, 0 }; | ||
| 70 | struct tcp_fastopen_context *ctx; | ||
| 71 | |||
| 72 | rcu_read_lock(); | ||
| 73 | ctx = rcu_dereference(tcp_fastopen_ctx); | ||
| 74 | if (ctx) { | ||
| 75 | crypto_cipher_encrypt_one(ctx->tfm, | ||
| 76 | foc->val, | ||
| 77 | (__u8 *)peer_addr); | ||
| 78 | foc->len = TCP_FASTOPEN_COOKIE_SIZE; | ||
| 79 | } | ||
| 80 | rcu_read_unlock(); | ||
| 81 | } | ||
| 82 | |||
| 83 | static int __init tcp_fastopen_init(void) | ||
| 84 | { | ||
| 85 | __u8 key[TCP_FASTOPEN_KEY_LENGTH]; | ||
| 86 | |||
| 87 | get_random_bytes(key, sizeof(key)); | ||
| 88 | tcp_fastopen_reset_cipher(key, sizeof(key)); | ||
| 89 | return 0; | ||
| 90 | } | ||
| 91 | |||
| 92 | late_initcall(tcp_fastopen_init); | ||
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 57bdd17dff4..fe3ecf484b4 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c | |||
| @@ -15,7 +15,7 @@ | |||
| 15 | 15 | ||
| 16 | /* Tcp Hybla structure. */ | 16 | /* Tcp Hybla structure. */ |
| 17 | struct hybla { | 17 | struct hybla { |
| 18 | bool hybla_en; | 18 | u8 hybla_en; |
| 19 | u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ | 19 | u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ |
| 20 | u32 rho; /* Rho parameter, integer part */ | 20 | u32 rho; /* Rho parameter, integer part */ |
| 21 | u32 rho2; /* Rho * Rho, integer part */ | 21 | u32 rho2; /* Rho * Rho, integer part */ |
| @@ -24,7 +24,8 @@ struct hybla { | |||
| 24 | u32 minrtt; /* Minimum smoothed round trip time value seen */ | 24 | u32 minrtt; /* Minimum smoothed round trip time value seen */ |
| 25 | }; | 25 | }; |
| 26 | 26 | ||
| 27 | /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ | 27 | /* Hybla reference round trip time (default= 1/40 sec = 25 ms), |
| 28 | expressed in jiffies */ | ||
| 28 | static int rtt0 = 25; | 29 | static int rtt0 = 25; |
| 29 | module_param(rtt0, int, 0644); | 30 | module_param(rtt0, int, 0644); |
| 30 | MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); | 31 | MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); |
| @@ -38,7 +39,7 @@ static inline void hybla_recalc_param (struct sock *sk) | |||
| 38 | ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); | 39 | ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); |
| 39 | ca->rho = ca->rho_3ls >> 3; | 40 | ca->rho = ca->rho_3ls >> 3; |
| 40 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; | 41 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; |
| 41 | ca->rho2 = ca->rho2_7ls >> 7; | 42 | ca->rho2 = ca->rho2_7ls >>7; |
| 42 | } | 43 | } |
| 43 | 44 | ||
| 44 | static void hybla_init(struct sock *sk) | 45 | static void hybla_init(struct sock *sk) |
| @@ -51,7 +52,7 @@ static void hybla_init(struct sock *sk) | |||
| 51 | ca->rho_3ls = 0; | 52 | ca->rho_3ls = 0; |
| 52 | ca->rho2_7ls = 0; | 53 | ca->rho2_7ls = 0; |
| 53 | ca->snd_cwnd_cents = 0; | 54 | ca->snd_cwnd_cents = 0; |
| 54 | ca->hybla_en = true; | 55 | ca->hybla_en = 1; |
| 55 | tp->snd_cwnd = 2; | 56 | tp->snd_cwnd = 2; |
| 56 | tp->snd_cwnd_clamp = 65535; | 57 | tp->snd_cwnd_clamp = 65535; |
| 57 | 58 | ||
| @@ -66,7 +67,6 @@ static void hybla_init(struct sock *sk) | |||
| 66 | static void hybla_state(struct sock *sk, u8 ca_state) | 67 | static void hybla_state(struct sock *sk, u8 ca_state) |
| 67 | { | 68 | { |
| 68 | struct hybla *ca = inet_csk_ca(sk); | 69 | struct hybla *ca = inet_csk_ca(sk); |
| 69 | |||
| 70 | ca->hybla_en = (ca_state == TCP_CA_Open); | 70 | ca->hybla_en = (ca_state == TCP_CA_Open); |
| 71 | } | 71 | } |
| 72 | 72 | ||
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 834857f3c87..813b43a76fe 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c | |||
| @@ -313,13 +313,11 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, | |||
| 313 | .tcpv_rttcnt = ca->cnt_rtt, | 313 | .tcpv_rttcnt = ca->cnt_rtt, |
| 314 | .tcpv_minrtt = ca->base_rtt, | 314 | .tcpv_minrtt = ca->base_rtt, |
| 315 | }; | 315 | }; |
| 316 | u64 t = ca->sum_rtt; | ||
| 316 | 317 | ||
| 317 | if (info.tcpv_rttcnt > 0) { | 318 | do_div(t, ca->cnt_rtt); |
| 318 | u64 t = ca->sum_rtt; | 319 | info.tcpv_rtt = t; |
| 319 | 320 | ||
| 320 | do_div(t, info.tcpv_rttcnt); | ||
| 321 | info.tcpv_rtt = t; | ||
| 322 | } | ||
| 323 | nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); | 321 | nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); |
| 324 | } | 322 | } |
| 325 | } | 323 | } |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 18f97ca76b0..d73aab3fbfc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
| @@ -61,8 +61,6 @@ | |||
| 61 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs | 61 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs |
| 62 | */ | 62 | */ |
| 63 | 63 | ||
| 64 | #define pr_fmt(fmt) "TCP: " fmt | ||
| 65 | |||
| 66 | #include <linux/mm.h> | 64 | #include <linux/mm.h> |
| 67 | #include <linux/slab.h> | 65 | #include <linux/slab.h> |
| 68 | #include <linux/module.h> | 66 | #include <linux/module.h> |
| @@ -85,23 +83,20 @@ int sysctl_tcp_ecn __read_mostly = 2; | |||
| 85 | EXPORT_SYMBOL(sysctl_tcp_ecn); | 83 | EXPORT_SYMBOL(sysctl_tcp_ecn); |
| 86 | int sysctl_tcp_dsack __read_mostly = 1; | 84 | int sysctl_tcp_dsack __read_mostly = 1; |
| 87 | int sysctl_tcp_app_win __read_mostly = 31; | 85 | int sysctl_tcp_app_win __read_mostly = 31; |
| 88 | int sysctl_tcp_adv_win_scale __read_mostly = 1; | 86 | int sysctl_tcp_adv_win_scale __read_mostly = 2; |
| 89 | EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); | 87 | EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); |
| 90 | 88 | ||
| 91 | /* rfc5961 challenge ack rate limiting */ | ||
| 92 | int sysctl_tcp_challenge_ack_limit = 100; | ||
| 93 | |||
| 94 | int sysctl_tcp_stdurg __read_mostly; | 89 | int sysctl_tcp_stdurg __read_mostly; |
| 95 | int sysctl_tcp_rfc1337 __read_mostly; | 90 | int sysctl_tcp_rfc1337 __read_mostly; |
| 96 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; | 91 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
| 97 | int sysctl_tcp_frto __read_mostly = 2; | 92 | int sysctl_tcp_frto __read_mostly = 2; |
| 98 | int sysctl_tcp_frto_response __read_mostly; | 93 | int sysctl_tcp_frto_response __read_mostly; |
| 94 | int sysctl_tcp_nometrics_save __read_mostly; | ||
| 99 | 95 | ||
| 100 | int sysctl_tcp_thin_dupack __read_mostly; | 96 | int sysctl_tcp_thin_dupack __read_mostly; |
| 101 | 97 | ||
| 102 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | 98 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; |
| 103 | int sysctl_tcp_abc __read_mostly; | 99 | int sysctl_tcp_abc __read_mostly; |
| 104 | int sysctl_tcp_early_retrans __read_mostly = 2; | ||
| 105 | 100 | ||
| 106 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 101 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
| 107 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 102 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
| @@ -110,6 +105,7 @@ int sysctl_tcp_early_retrans __read_mostly = 2; | |||
| 110 | #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ | 105 | #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ |
| 111 | #define FLAG_DATA_SACKED 0x20 /* New SACK. */ | 106 | #define FLAG_DATA_SACKED 0x20 /* New SACK. */ |
| 112 | #define FLAG_ECE 0x40 /* ECE in this ACK */ | 107 | #define FLAG_ECE 0x40 /* ECE in this ACK */ |
| 108 | #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ | ||
| 113 | #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ | 109 | #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ |
| 114 | #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ | 110 | #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ |
| 115 | #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ | 111 | #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ |
| @@ -178,7 +174,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) | |||
| 178 | static void tcp_incr_quickack(struct sock *sk) | 174 | static void tcp_incr_quickack(struct sock *sk) |
| 179 | { | 175 | { |
| 180 | struct inet_connection_sock *icsk = inet_csk(sk); | 176 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 181 | unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); | 177 | unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); |
| 182 | 178 | ||
| 183 | if (quickacks == 0) | 179 | if (quickacks == 0) |
| 184 | quickacks = 2; | 180 | quickacks = 2; |
| @@ -198,10 +194,9 @@ static void tcp_enter_quickack_mode(struct sock *sk) | |||
| 198 | * and the session is not interactive. | 194 | * and the session is not interactive. |
| 199 | */ | 195 | */ |
| 200 | 196 | ||
| 201 | static inline bool tcp_in_quickack_mode(const struct sock *sk) | 197 | static inline int tcp_in_quickack_mode(const struct sock *sk) |
| 202 | { | 198 | { |
| 203 | const struct inet_connection_sock *icsk = inet_csk(sk); | 199 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 204 | |||
| 205 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; | 200 | return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; |
| 206 | } | 201 | } |
| 207 | 202 | ||
| @@ -211,7 +206,7 @@ static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) | |||
| 211 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; | 206 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; |
| 212 | } | 207 | } |
| 213 | 208 | ||
| 214 | static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) | 209 | static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) |
| 215 | { | 210 | { |
| 216 | if (tcp_hdr(skb)->cwr) | 211 | if (tcp_hdr(skb)->cwr) |
| 217 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 212 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
| @@ -222,49 +217,36 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) | |||
| 222 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 217 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
| 223 | } | 218 | } |
| 224 | 219 | ||
| 225 | static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) | 220 | static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) |
| 226 | { | 221 | { |
| 227 | if (!(tp->ecn_flags & TCP_ECN_OK)) | 222 | if (tp->ecn_flags & TCP_ECN_OK) { |
| 228 | return; | 223 | if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) |
| 229 | 224 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | |
| 230 | switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { | ||
| 231 | case INET_ECN_NOT_ECT: | ||
| 232 | /* Funny extension: if ECT is not set on a segment, | 225 | /* Funny extension: if ECT is not set on a segment, |
| 233 | * and we already seen ECT on a previous segment, | 226 | * it is surely retransmit. It is not in ECN RFC, |
| 234 | * it is probably a retransmit. | 227 | * but Linux follows this rule. */ |
| 235 | */ | 228 | else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) |
| 236 | if (tp->ecn_flags & TCP_ECN_SEEN) | ||
| 237 | tcp_enter_quickack_mode((struct sock *)tp); | ||
| 238 | break; | ||
| 239 | case INET_ECN_CE: | ||
| 240 | if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { | ||
| 241 | /* Better not delay acks, sender can have a very low cwnd */ | ||
| 242 | tcp_enter_quickack_mode((struct sock *)tp); | 229 | tcp_enter_quickack_mode((struct sock *)tp); |
| 243 | tp->ecn_flags |= TCP_ECN_DEMAND_CWR; | ||
| 244 | } | ||
| 245 | /* fallinto */ | ||
| 246 | default: | ||
| 247 | tp->ecn_flags |= TCP_ECN_SEEN; | ||
| 248 | } | 230 | } |
| 249 | } | 231 | } |
| 250 | 232 | ||
| 251 | static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) | 233 | static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) |
| 252 | { | 234 | { |
| 253 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) | 235 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) |
| 254 | tp->ecn_flags &= ~TCP_ECN_OK; | 236 | tp->ecn_flags &= ~TCP_ECN_OK; |
| 255 | } | 237 | } |
| 256 | 238 | ||
| 257 | static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) | 239 | static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) |
| 258 | { | 240 | { |
| 259 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) | 241 | if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) |
| 260 | tp->ecn_flags &= ~TCP_ECN_OK; | 242 | tp->ecn_flags &= ~TCP_ECN_OK; |
| 261 | } | 243 | } |
| 262 | 244 | ||
| 263 | static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) | 245 | static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) |
| 264 | { | 246 | { |
| 265 | if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) | 247 | if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) |
| 266 | return true; | 248 | return 1; |
| 267 | return false; | 249 | return 0; |
| 268 | } | 250 | } |
| 269 | 251 | ||
| 270 | /* Buffer size and advertised window tuning. | 252 | /* Buffer size and advertised window tuning. |
| @@ -274,11 +256,14 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr | |||
| 274 | 256 | ||
| 275 | static void tcp_fixup_sndbuf(struct sock *sk) | 257 | static void tcp_fixup_sndbuf(struct sock *sk) |
| 276 | { | 258 | { |
| 277 | int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); | 259 | int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + |
| 260 | sizeof(struct sk_buff); | ||
| 278 | 261 | ||
| 279 | sndmem *= TCP_INIT_CWND; | 262 | if (sk->sk_sndbuf < 3 * sndmem) { |
| 280 | if (sk->sk_sndbuf < sndmem) | 263 | sk->sk_sndbuf = 3 * sndmem; |
| 281 | sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); | 264 | if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) |
| 265 | sk->sk_sndbuf = sysctl_tcp_wmem[2]; | ||
| 266 | } | ||
| 282 | } | 267 | } |
| 283 | 268 | ||
| 284 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) | 269 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) |
| @@ -324,14 +309,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) | |||
| 324 | return 0; | 309 | return 0; |
| 325 | } | 310 | } |
| 326 | 311 | ||
| 327 | static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) | 312 | static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) |
| 328 | { | 313 | { |
| 329 | struct tcp_sock *tp = tcp_sk(sk); | 314 | struct tcp_sock *tp = tcp_sk(sk); |
| 330 | 315 | ||
| 331 | /* Check #1 */ | 316 | /* Check #1 */ |
| 332 | if (tp->rcv_ssthresh < tp->window_clamp && | 317 | if (tp->rcv_ssthresh < tp->window_clamp && |
| 333 | (int)tp->rcv_ssthresh < tcp_space(sk) && | 318 | (int)tp->rcv_ssthresh < tcp_space(sk) && |
| 334 | !sk_under_memory_pressure(sk)) { | 319 | !tcp_memory_pressure) { |
| 335 | int incr; | 320 | int incr; |
| 336 | 321 | ||
| 337 | /* Check #2. Increase window, if skb with such overhead | 322 | /* Check #2. Increase window, if skb with such overhead |
| @@ -343,7 +328,6 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) | |||
| 343 | incr = __tcp_grow_window(sk, skb); | 328 | incr = __tcp_grow_window(sk, skb); |
| 344 | 329 | ||
| 345 | if (incr) { | 330 | if (incr) { |
| 346 | incr = max_t(int, incr, 2 * skb->len); | ||
| 347 | tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, | 331 | tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, |
| 348 | tp->window_clamp); | 332 | tp->window_clamp); |
| 349 | inet_csk(sk)->icsk_ack.quick |= 1; | 333 | inet_csk(sk)->icsk_ack.quick |= 1; |
| @@ -355,30 +339,23 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) | |||
| 355 | 339 | ||
| 356 | static void tcp_fixup_rcvbuf(struct sock *sk) | 340 | static void tcp_fixup_rcvbuf(struct sock *sk) |
| 357 | { | 341 | { |
| 358 | u32 mss = tcp_sk(sk)->advmss; | 342 | struct tcp_sock *tp = tcp_sk(sk); |
| 359 | u32 icwnd = TCP_DEFAULT_INIT_RCVWND; | 343 | int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); |
| 360 | int rcvmem; | ||
| 361 | 344 | ||
| 362 | /* Limit to 10 segments if mss <= 1460, | 345 | /* Try to select rcvbuf so that 4 mss-sized segments |
| 363 | * or 14600/mss segments, with a minimum of two segments. | 346 | * will fit to window and corresponding skbs will fit to our rcvbuf. |
| 347 | * (was 3; 4 is minimum to allow fast retransmit to work.) | ||
| 364 | */ | 348 | */ |
| 365 | if (mss > 1460) | 349 | while (tcp_win_from_space(rcvmem) < tp->advmss) |
| 366 | icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); | ||
| 367 | |||
| 368 | rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER); | ||
| 369 | while (tcp_win_from_space(rcvmem) < mss) | ||
| 370 | rcvmem += 128; | 350 | rcvmem += 128; |
| 371 | 351 | if (sk->sk_rcvbuf < 4 * rcvmem) | |
| 372 | rcvmem *= icwnd; | 352 | sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); |
| 373 | |||
| 374 | if (sk->sk_rcvbuf < rcvmem) | ||
| 375 | sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); | ||
| 376 | } | 353 | } |
| 377 | 354 | ||
| 378 | /* 4. Try to fixup all. It is made immediately after connection enters | 355 | /* 4. Try to fixup all. It is made immediately after connection enters |
| 379 | * established state. | 356 | * established state. |
| 380 | */ | 357 | */ |
| 381 | void tcp_init_buffer_space(struct sock *sk) | 358 | static void tcp_init_buffer_space(struct sock *sk) |
| 382 | { | 359 | { |
| 383 | struct tcp_sock *tp = tcp_sk(sk); | 360 | struct tcp_sock *tp = tcp_sk(sk); |
| 384 | int maxwin; | 361 | int maxwin; |
| @@ -421,8 +398,8 @@ static void tcp_clamp_window(struct sock *sk) | |||
| 421 | 398 | ||
| 422 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && | 399 | if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && |
| 423 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && | 400 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
| 424 | !sk_under_memory_pressure(sk) && | 401 | !tcp_memory_pressure && |
| 425 | sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { | 402 | atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { |
| 426 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), | 403 | sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
| 427 | sysctl_tcp_rmem[2]); | 404 | sysctl_tcp_rmem[2]); |
| 428 | } | 405 | } |
| @@ -439,7 +416,7 @@ static void tcp_clamp_window(struct sock *sk) | |||
| 439 | */ | 416 | */ |
| 440 | void tcp_initialize_rcv_mss(struct sock *sk) | 417 | void tcp_initialize_rcv_mss(struct sock *sk) |
| 441 | { | 418 | { |
| 442 | const struct tcp_sock *tp = tcp_sk(sk); | 419 | struct tcp_sock *tp = tcp_sk(sk); |
| 443 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); | 420 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); |
| 444 | 421 | ||
| 445 | hint = min(hint, tp->rcv_wnd / 2); | 422 | hint = min(hint, tp->rcv_wnd / 2); |
| @@ -483,11 +460,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) | |||
| 483 | if (!win_dep) { | 460 | if (!win_dep) { |
| 484 | m -= (new_sample >> 3); | 461 | m -= (new_sample >> 3); |
| 485 | new_sample += m; | 462 | new_sample += m; |
| 486 | } else { | 463 | } else if (m < new_sample) |
| 487 | m <<= 3; | 464 | new_sample = m << 3; |
| 488 | if (m < new_sample) | ||
| 489 | new_sample = m; | ||
| 490 | } | ||
| 491 | } else { | 465 | } else { |
| 492 | /* No previous measure. */ | 466 | /* No previous measure. */ |
| 493 | new_sample = m << 3; | 467 | new_sample = m << 3; |
| @@ -503,7 +477,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) | |||
| 503 | goto new_measure; | 477 | goto new_measure; |
| 504 | if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) | 478 | if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) |
| 505 | return; | 479 | return; |
| 506 | tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); | 480 | tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); |
| 507 | 481 | ||
| 508 | new_measure: | 482 | new_measure: |
| 509 | tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; | 483 | tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; |
| @@ -557,7 +531,8 @@ void tcp_rcv_space_adjust(struct sock *sk) | |||
| 557 | space /= tp->advmss; | 531 | space /= tp->advmss; |
| 558 | if (!space) | 532 | if (!space) |
| 559 | space = 1; | 533 | space = 1; |
| 560 | rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); | 534 | rcvmem = (tp->advmss + MAX_TCP_HEADER + |
| 535 | 16 + sizeof(struct sk_buff)); | ||
| 561 | while (tcp_win_from_space(rcvmem) < tp->advmss) | 536 | while (tcp_win_from_space(rcvmem) < tp->advmss) |
| 562 | rcvmem += 128; | 537 | rcvmem += 128; |
| 563 | space *= rcvmem; | 538 | space *= rcvmem; |
| @@ -707,7 +682,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
| 707 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 682 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
| 708 | * routine referred to above. | 683 | * routine referred to above. |
| 709 | */ | 684 | */ |
| 710 | void tcp_set_rto(struct sock *sk) | 685 | static inline void tcp_set_rto(struct sock *sk) |
| 711 | { | 686 | { |
| 712 | const struct tcp_sock *tp = tcp_sk(sk); | 687 | const struct tcp_sock *tp = tcp_sk(sk); |
| 713 | /* Old crap is replaced with new one. 8) | 688 | /* Old crap is replaced with new one. 8) |
| @@ -734,7 +709,110 @@ void tcp_set_rto(struct sock *sk) | |||
| 734 | tcp_bound_rto(sk); | 709 | tcp_bound_rto(sk); |
| 735 | } | 710 | } |
| 736 | 711 | ||
| 737 | __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) | 712 | /* Save metrics learned by this TCP session. |
| 713 | This function is called only, when TCP finishes successfully | ||
| 714 | i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. | ||
| 715 | */ | ||
| 716 | void tcp_update_metrics(struct sock *sk) | ||
| 717 | { | ||
| 718 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 719 | struct dst_entry *dst = __sk_dst_get(sk); | ||
| 720 | |||
| 721 | if (sysctl_tcp_nometrics_save) | ||
| 722 | return; | ||
| 723 | |||
| 724 | dst_confirm(dst); | ||
| 725 | |||
| 726 | if (dst && (dst->flags & DST_HOST)) { | ||
| 727 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 728 | int m; | ||
| 729 | unsigned long rtt; | ||
| 730 | |||
| 731 | if (icsk->icsk_backoff || !tp->srtt) { | ||
| 732 | /* This session failed to estimate rtt. Why? | ||
| 733 | * Probably, no packets returned in time. | ||
| 734 | * Reset our results. | ||
| 735 | */ | ||
| 736 | if (!(dst_metric_locked(dst, RTAX_RTT))) | ||
| 737 | dst_metric_set(dst, RTAX_RTT, 0); | ||
| 738 | return; | ||
| 739 | } | ||
| 740 | |||
| 741 | rtt = dst_metric_rtt(dst, RTAX_RTT); | ||
| 742 | m = rtt - tp->srtt; | ||
| 743 | |||
| 744 | /* If newly calculated rtt larger than stored one, | ||
| 745 | * store new one. Otherwise, use EWMA. Remember, | ||
| 746 | * rtt overestimation is always better than underestimation. | ||
| 747 | */ | ||
| 748 | if (!(dst_metric_locked(dst, RTAX_RTT))) { | ||
| 749 | if (m <= 0) | ||
| 750 | set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); | ||
| 751 | else | ||
| 752 | set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); | ||
| 753 | } | ||
| 754 | |||
| 755 | if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { | ||
| 756 | unsigned long var; | ||
| 757 | if (m < 0) | ||
| 758 | m = -m; | ||
| 759 | |||
| 760 | /* Scale deviation to rttvar fixed point */ | ||
| 761 | m >>= 1; | ||
| 762 | if (m < tp->mdev) | ||
| 763 | m = tp->mdev; | ||
| 764 | |||
| 765 | var = dst_metric_rtt(dst, RTAX_RTTVAR); | ||
| 766 | if (m >= var) | ||
| 767 | var = m; | ||
| 768 | else | ||
| 769 | var -= (var - m) >> 2; | ||
| 770 | |||
| 771 | set_dst_metric_rtt(dst, RTAX_RTTVAR, var); | ||
| 772 | } | ||
| 773 | |||
| 774 | if (tcp_in_initial_slowstart(tp)) { | ||
| 775 | /* Slow start still did not finish. */ | ||
| 776 | if (dst_metric(dst, RTAX_SSTHRESH) && | ||
| 777 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | ||
| 778 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) | ||
| 779 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); | ||
| 780 | if (!dst_metric_locked(dst, RTAX_CWND) && | ||
| 781 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) | ||
| 782 | dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); | ||
| 783 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | ||
| 784 | icsk->icsk_ca_state == TCP_CA_Open) { | ||
| 785 | /* Cong. avoidance phase, cwnd is reliable. */ | ||
| 786 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) | ||
| 787 | dst_metric_set(dst, RTAX_SSTHRESH, | ||
| 788 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); | ||
| 789 | if (!dst_metric_locked(dst, RTAX_CWND)) | ||
| 790 | dst_metric_set(dst, RTAX_CWND, | ||
| 791 | (dst_metric(dst, RTAX_CWND) + | ||
| 792 | tp->snd_cwnd) >> 1); | ||
| 793 | } else { | ||
| 794 | /* Else slow start did not finish, cwnd is non-sense, | ||
| 795 | ssthresh may be also invalid. | ||
| 796 | */ | ||
| 797 | if (!dst_metric_locked(dst, RTAX_CWND)) | ||
| 798 | dst_metric_set(dst, RTAX_CWND, | ||
| 799 | (dst_metric(dst, RTAX_CWND) + | ||
| 800 | tp->snd_ssthresh) >> 1); | ||
| 801 | if (dst_metric(dst, RTAX_SSTHRESH) && | ||
| 802 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | ||
| 803 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) | ||
| 804 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); | ||
| 805 | } | ||
| 806 | |||
| 807 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { | ||
| 808 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && | ||
| 809 | tp->reordering != sysctl_tcp_reordering) | ||
| 810 | dst_metric_set(dst, RTAX_REORDERING, tp->reordering); | ||
| 811 | } | ||
| 812 | } | ||
| 813 | } | ||
| 814 | |||
| 815 | __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) | ||
| 738 | { | 816 | { |
| 739 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); | 817 | __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); |
| 740 | 818 | ||
| @@ -743,22 +821,124 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) | |||
| 743 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); | 821 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
| 744 | } | 822 | } |
| 745 | 823 | ||
| 824 | /* Set slow start threshold and cwnd not falling to slow start */ | ||
| 825 | void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) | ||
| 826 | { | ||
| 827 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 828 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 829 | |||
| 830 | tp->prior_ssthresh = 0; | ||
| 831 | tp->bytes_acked = 0; | ||
| 832 | if (icsk->icsk_ca_state < TCP_CA_CWR) { | ||
| 833 | tp->undo_marker = 0; | ||
| 834 | if (set_ssthresh) | ||
| 835 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | ||
| 836 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
| 837 | tcp_packets_in_flight(tp) + 1U); | ||
| 838 | tp->snd_cwnd_cnt = 0; | ||
| 839 | tp->high_seq = tp->snd_nxt; | ||
| 840 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 841 | TCP_ECN_queue_cwr(tp); | ||
| 842 | |||
| 843 | tcp_set_ca_state(sk, TCP_CA_CWR); | ||
| 844 | } | ||
| 845 | } | ||
| 846 | |||
| 746 | /* | 847 | /* |
| 747 | * Packet counting of FACK is based on in-order assumptions, therefore TCP | 848 | * Packet counting of FACK is based on in-order assumptions, therefore TCP |
| 748 | * disables it when reordering is detected | 849 | * disables it when reordering is detected |
| 749 | */ | 850 | */ |
| 750 | void tcp_disable_fack(struct tcp_sock *tp) | 851 | static void tcp_disable_fack(struct tcp_sock *tp) |
| 751 | { | 852 | { |
| 752 | /* RFC3517 uses different metric in lost marker => reset on change */ | 853 | /* RFC3517 uses different metric in lost marker => reset on change */ |
| 753 | if (tcp_is_fack(tp)) | 854 | if (tcp_is_fack(tp)) |
| 754 | tp->lost_skb_hint = NULL; | 855 | tp->lost_skb_hint = NULL; |
| 755 | tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; | 856 | tp->rx_opt.sack_ok &= ~2; |
| 756 | } | 857 | } |
| 757 | 858 | ||
| 758 | /* Take a notice that peer is sending D-SACKs */ | 859 | /* Take a notice that peer is sending D-SACKs */ |
| 759 | static void tcp_dsack_seen(struct tcp_sock *tp) | 860 | static void tcp_dsack_seen(struct tcp_sock *tp) |
| 760 | { | 861 | { |
| 761 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; | 862 | tp->rx_opt.sack_ok |= 4; |
| 863 | } | ||
| 864 | |||
| 865 | /* Initialize metrics on socket. */ | ||
| 866 | |||
| 867 | static void tcp_init_metrics(struct sock *sk) | ||
| 868 | { | ||
| 869 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 870 | struct dst_entry *dst = __sk_dst_get(sk); | ||
| 871 | |||
| 872 | if (dst == NULL) | ||
| 873 | goto reset; | ||
| 874 | |||
| 875 | dst_confirm(dst); | ||
| 876 | |||
| 877 | if (dst_metric_locked(dst, RTAX_CWND)) | ||
| 878 | tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); | ||
| 879 | if (dst_metric(dst, RTAX_SSTHRESH)) { | ||
| 880 | tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); | ||
| 881 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) | ||
| 882 | tp->snd_ssthresh = tp->snd_cwnd_clamp; | ||
| 883 | } else { | ||
| 884 | /* ssthresh may have been reduced unnecessarily during. | ||
| 885 | * 3WHS. Restore it back to its initial default. | ||
| 886 | */ | ||
| 887 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
| 888 | } | ||
| 889 | if (dst_metric(dst, RTAX_REORDERING) && | ||
| 890 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { | ||
| 891 | tcp_disable_fack(tp); | ||
| 892 | tp->reordering = dst_metric(dst, RTAX_REORDERING); | ||
| 893 | } | ||
| 894 | |||
| 895 | if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) | ||
| 896 | goto reset; | ||
| 897 | |||
| 898 | /* Initial rtt is determined from SYN,SYN-ACK. | ||
| 899 | * The segment is small and rtt may appear much | ||
| 900 | * less than real one. Use per-dst memory | ||
| 901 | * to make it more realistic. | ||
| 902 | * | ||
| 903 | * A bit of theory. RTT is time passed after "normal" sized packet | ||
| 904 | * is sent until it is ACKed. In normal circumstances sending small | ||
| 905 | * packets force peer to delay ACKs and calculation is correct too. | ||
| 906 | * The algorithm is adaptive and, provided we follow specs, it | ||
| 907 | * NEVER underestimate RTT. BUT! If peer tries to make some clever | ||
| 908 | * tricks sort of "quick acks" for time long enough to decrease RTT | ||
| 909 | * to low value, and then abruptly stops to do it and starts to delay | ||
| 910 | * ACKs, wait for troubles. | ||
| 911 | */ | ||
| 912 | if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { | ||
| 913 | tp->srtt = dst_metric_rtt(dst, RTAX_RTT); | ||
| 914 | tp->rtt_seq = tp->snd_nxt; | ||
| 915 | } | ||
| 916 | if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { | ||
| 917 | tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); | ||
| 918 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
| 919 | } | ||
| 920 | tcp_set_rto(sk); | ||
| 921 | reset: | ||
| 922 | if (tp->srtt == 0) { | ||
| 923 | /* RFC2988bis: We've failed to get a valid RTT sample from | ||
| 924 | * 3WHS. This is most likely due to retransmission, | ||
| 925 | * including spurious one. Reset the RTO back to 3secs | ||
| 926 | * from the more aggressive 1sec to avoid more spurious | ||
| 927 | * retransmission. | ||
| 928 | */ | ||
| 929 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | ||
| 930 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | ||
| 931 | } | ||
| 932 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | ||
| 933 | * retransmitted. In light of RFC2988bis' more aggressive 1sec | ||
| 934 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK | ||
| 935 | * retransmission has occurred. | ||
| 936 | */ | ||
| 937 | if (tp->total_retrans > 1) | ||
| 938 | tp->snd_cwnd = 1; | ||
| 939 | else | ||
| 940 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | ||
| 941 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 762 | } | 942 | } |
| 763 | 943 | ||
| 764 | static void tcp_update_reordering(struct sock *sk, const int metric, | 944 | static void tcp_update_reordering(struct sock *sk, const int metric, |
| @@ -782,18 +962,15 @@ static void tcp_update_reordering(struct sock *sk, const int metric, | |||
| 782 | 962 | ||
| 783 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | 963 | NET_INC_STATS_BH(sock_net(sk), mib_idx); |
| 784 | #if FASTRETRANS_DEBUG > 1 | 964 | #if FASTRETRANS_DEBUG > 1 |
| 785 | pr_debug("Disorder%d %d %u f%u s%u rr%d\n", | 965 | printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", |
| 786 | tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, | 966 | tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, |
| 787 | tp->reordering, | 967 | tp->reordering, |
| 788 | tp->fackets_out, | 968 | tp->fackets_out, |
| 789 | tp->sacked_out, | 969 | tp->sacked_out, |
| 790 | tp->undo_marker ? tp->undo_retrans : 0); | 970 | tp->undo_marker ? tp->undo_retrans : 0); |
| 791 | #endif | 971 | #endif |
| 792 | tcp_disable_fack(tp); | 972 | tcp_disable_fack(tp); |
| 793 | } | 973 | } |
| 794 | |||
| 795 | if (metric > 0) | ||
| 796 | tcp_disable_early_retrans(tp); | ||
| 797 | } | 974 | } |
| 798 | 975 | ||
| 799 | /* This must be called before lost_out is incremented */ | 976 | /* This must be called before lost_out is incremented */ |
| @@ -851,11 +1028,13 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, | |||
| 851 | * These 6 states form finite state machine, controlled by the following events: | 1028 | * These 6 states form finite state machine, controlled by the following events: |
| 852 | * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) | 1029 | * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) |
| 853 | * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) | 1030 | * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) |
| 854 | * 3. Loss detection event of two flavors: | 1031 | * 3. Loss detection event of one of three flavors: |
| 855 | * A. Scoreboard estimator decided the packet is lost. | 1032 | * A. Scoreboard estimator decided the packet is lost. |
| 856 | * A'. Reno "three dupacks" marks head of queue lost. | 1033 | * A'. Reno "three dupacks" marks head of queue lost. |
| 857 | * A''. Its FACK modification, head until snd.fack is lost. | 1034 | * A''. Its FACK modfication, head until snd.fack is lost. |
| 858 | * B. SACK arrives sacking SND.NXT at the moment, when the | 1035 | * B. SACK arrives sacking data transmitted after never retransmitted |
| 1036 | * hole was sent out. | ||
| 1037 | * C. SACK arrives sacking SND.NXT at the moment, when the | ||
| 859 | * segment was retransmitted. | 1038 | * segment was retransmitted. |
| 860 | * 4. D-SACK added new rule: D-SACK changes any tag to S. | 1039 | * 4. D-SACK added new rule: D-SACK changes any tag to S. |
| 861 | * | 1040 | * |
| @@ -924,36 +1103,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, | |||
| 924 | * the exact amount is rather hard to quantify. However, tp->max_window can | 1103 | * the exact amount is rather hard to quantify. However, tp->max_window can |
| 925 | * be used as an exaggerated estimate. | 1104 | * be used as an exaggerated estimate. |
| 926 | */ | 1105 | */ |
| 927 | static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, | 1106 | static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, |
| 928 | u32 start_seq, u32 end_seq) | 1107 | u32 start_seq, u32 end_seq) |
| 929 | { | 1108 | { |
| 930 | /* Too far in future, or reversed (interpretation is ambiguous) */ | 1109 | /* Too far in future, or reversed (interpretation is ambiguous) */ |
| 931 | if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) | 1110 | if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) |
| 932 | return false; | 1111 | return 0; |
| 933 | 1112 | ||
| 934 | /* Nasty start_seq wrap-around check (see comments above) */ | 1113 | /* Nasty start_seq wrap-around check (see comments above) */ |
| 935 | if (!before(start_seq, tp->snd_nxt)) | 1114 | if (!before(start_seq, tp->snd_nxt)) |
| 936 | return false; | 1115 | return 0; |
| 937 | 1116 | ||
| 938 | /* In outstanding window? ...This is valid exit for D-SACKs too. | 1117 | /* In outstanding window? ...This is valid exit for D-SACKs too. |
| 939 | * start_seq == snd_una is non-sensical (see comments above) | 1118 | * start_seq == snd_una is non-sensical (see comments above) |
| 940 | */ | 1119 | */ |
| 941 | if (after(start_seq, tp->snd_una)) | 1120 | if (after(start_seq, tp->snd_una)) |
| 942 | return true; | 1121 | return 1; |
| 943 | 1122 | ||
| 944 | if (!is_dsack || !tp->undo_marker) | 1123 | if (!is_dsack || !tp->undo_marker) |
| 945 | return false; | 1124 | return 0; |
| 946 | 1125 | ||
| 947 | /* ...Then it's D-SACK, and must reside below snd_una completely */ | 1126 | /* ...Then it's D-SACK, and must reside below snd_una completely */ |
| 948 | if (after(end_seq, tp->snd_una)) | 1127 | if (after(end_seq, tp->snd_una)) |
| 949 | return false; | 1128 | return 0; |
| 950 | 1129 | ||
| 951 | if (!before(start_seq, tp->undo_marker)) | 1130 | if (!before(start_seq, tp->undo_marker)) |
| 952 | return true; | 1131 | return 1; |
| 953 | 1132 | ||
| 954 | /* Too old */ | 1133 | /* Too old */ |
| 955 | if (!after(end_seq, tp->undo_marker)) | 1134 | if (!after(end_seq, tp->undo_marker)) |
| 956 | return false; | 1135 | return 0; |
| 957 | 1136 | ||
| 958 | /* Undo_marker boundary crossing (overestimates a lot). Known already: | 1137 | /* Undo_marker boundary crossing (overestimates a lot). Known already: |
| 959 | * start_seq < undo_marker and end_seq >= undo_marker. | 1138 | * start_seq < undo_marker and end_seq >= undo_marker. |
| @@ -962,7 +1141,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, | |||
| 962 | } | 1141 | } |
| 963 | 1142 | ||
| 964 | /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". | 1143 | /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". |
| 965 | * Event "B". Later note: FACK people cheated me again 8), we have to account | 1144 | * Event "C". Later note: FACK people cheated me again 8), we have to account |
| 966 | * for reordering! Ugly, but should help. | 1145 | * for reordering! Ugly, but should help. |
| 967 | * | 1146 | * |
| 968 | * Search retransmitted skbs from write_queue that were sent when snd_nxt was | 1147 | * Search retransmitted skbs from write_queue that were sent when snd_nxt was |
| @@ -1025,17 +1204,17 @@ static void tcp_mark_lost_retrans(struct sock *sk) | |||
| 1025 | tp->lost_retrans_low = new_low_seq; | 1204 | tp->lost_retrans_low = new_low_seq; |
| 1026 | } | 1205 | } |
| 1027 | 1206 | ||
| 1028 | static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, | 1207 | static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, |
| 1029 | struct tcp_sack_block_wire *sp, int num_sacks, | 1208 | struct tcp_sack_block_wire *sp, int num_sacks, |
| 1030 | u32 prior_snd_una) | 1209 | u32 prior_snd_una) |
| 1031 | { | 1210 | { |
| 1032 | struct tcp_sock *tp = tcp_sk(sk); | 1211 | struct tcp_sock *tp = tcp_sk(sk); |
| 1033 | u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); | 1212 | u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); |
| 1034 | u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); | 1213 | u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); |
| 1035 | bool dup_sack = false; | 1214 | int dup_sack = 0; |
| 1036 | 1215 | ||
| 1037 | if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { | 1216 | if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { |
| 1038 | dup_sack = true; | 1217 | dup_sack = 1; |
| 1039 | tcp_dsack_seen(tp); | 1218 | tcp_dsack_seen(tp); |
| 1040 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); | 1219 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); |
| 1041 | } else if (num_sacks > 1) { | 1220 | } else if (num_sacks > 1) { |
| @@ -1044,7 +1223,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, | |||
| 1044 | 1223 | ||
| 1045 | if (!after(end_seq_0, end_seq_1) && | 1224 | if (!after(end_seq_0, end_seq_1) && |
| 1046 | !before(start_seq_0, start_seq_1)) { | 1225 | !before(start_seq_0, start_seq_1)) { |
| 1047 | dup_sack = true; | 1226 | dup_sack = 1; |
| 1048 | tcp_dsack_seen(tp); | 1227 | tcp_dsack_seen(tp); |
| 1049 | NET_INC_STATS_BH(sock_net(sk), | 1228 | NET_INC_STATS_BH(sock_net(sk), |
| 1050 | LINUX_MIB_TCPDSACKOFORECV); | 1229 | LINUX_MIB_TCPDSACKOFORECV); |
| @@ -1075,10 +1254,9 @@ struct tcp_sacktag_state { | |||
| 1075 | * FIXME: this could be merged to shift decision code | 1254 | * FIXME: this could be merged to shift decision code |
| 1076 | */ | 1255 | */ |
| 1077 | static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | 1256 | static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, |
| 1078 | u32 start_seq, u32 end_seq) | 1257 | u32 start_seq, u32 end_seq) |
| 1079 | { | 1258 | { |
| 1080 | int err; | 1259 | int in_sack, err; |
| 1081 | bool in_sack; | ||
| 1082 | unsigned int pkt_len; | 1260 | unsigned int pkt_len; |
| 1083 | unsigned int mss; | 1261 | unsigned int mss; |
| 1084 | 1262 | ||
| @@ -1120,26 +1298,25 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, | |||
| 1120 | return in_sack; | 1298 | return in_sack; |
| 1121 | } | 1299 | } |
| 1122 | 1300 | ||
| 1123 | /* Mark the given newly-SACKed range as such, adjusting counters and hints. */ | 1301 | static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, |
| 1124 | static u8 tcp_sacktag_one(struct sock *sk, | 1302 | struct tcp_sacktag_state *state, |
| 1125 | struct tcp_sacktag_state *state, u8 sacked, | 1303 | int dup_sack, int pcount) |
| 1126 | u32 start_seq, u32 end_seq, | ||
| 1127 | bool dup_sack, int pcount) | ||
| 1128 | { | 1304 | { |
| 1129 | struct tcp_sock *tp = tcp_sk(sk); | 1305 | struct tcp_sock *tp = tcp_sk(sk); |
| 1306 | u8 sacked = TCP_SKB_CB(skb)->sacked; | ||
| 1130 | int fack_count = state->fack_count; | 1307 | int fack_count = state->fack_count; |
| 1131 | 1308 | ||
| 1132 | /* Account D-SACK for retransmitted packet. */ | 1309 | /* Account D-SACK for retransmitted packet. */ |
| 1133 | if (dup_sack && (sacked & TCPCB_RETRANS)) { | 1310 | if (dup_sack && (sacked & TCPCB_RETRANS)) { |
| 1134 | if (tp->undo_marker && tp->undo_retrans && | 1311 | if (tp->undo_marker && tp->undo_retrans && |
| 1135 | after(end_seq, tp->undo_marker)) | 1312 | after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) |
| 1136 | tp->undo_retrans--; | 1313 | tp->undo_retrans--; |
| 1137 | if (sacked & TCPCB_SACKED_ACKED) | 1314 | if (sacked & TCPCB_SACKED_ACKED) |
| 1138 | state->reord = min(fack_count, state->reord); | 1315 | state->reord = min(fack_count, state->reord); |
| 1139 | } | 1316 | } |
| 1140 | 1317 | ||
| 1141 | /* Nothing to do; acked frame is about to be dropped (was ACKed). */ | 1318 | /* Nothing to do; acked frame is about to be dropped (was ACKed). */ |
| 1142 | if (!after(end_seq, tp->snd_una)) | 1319 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) |
| 1143 | return sacked; | 1320 | return sacked; |
| 1144 | 1321 | ||
| 1145 | if (!(sacked & TCPCB_SACKED_ACKED)) { | 1322 | if (!(sacked & TCPCB_SACKED_ACKED)) { |
| @@ -1158,13 +1335,13 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
| 1158 | /* New sack for not retransmitted frame, | 1335 | /* New sack for not retransmitted frame, |
| 1159 | * which was in hole. It is reordering. | 1336 | * which was in hole. It is reordering. |
| 1160 | */ | 1337 | */ |
| 1161 | if (before(start_seq, | 1338 | if (before(TCP_SKB_CB(skb)->seq, |
| 1162 | tcp_highest_sack_seq(tp))) | 1339 | tcp_highest_sack_seq(tp))) |
| 1163 | state->reord = min(fack_count, | 1340 | state->reord = min(fack_count, |
| 1164 | state->reord); | 1341 | state->reord); |
| 1165 | 1342 | ||
| 1166 | /* SACK enhanced F-RTO (RFC4138; Appendix B) */ | 1343 | /* SACK enhanced F-RTO (RFC4138; Appendix B) */ |
| 1167 | if (!after(end_seq, tp->frto_highmark)) | 1344 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) |
| 1168 | state->flag |= FLAG_ONLY_ORIG_SACKED; | 1345 | state->flag |= FLAG_ONLY_ORIG_SACKED; |
| 1169 | } | 1346 | } |
| 1170 | 1347 | ||
| @@ -1182,7 +1359,8 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
| 1182 | 1359 | ||
| 1183 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ | 1360 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
| 1184 | if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && | 1361 | if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && |
| 1185 | before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) | 1362 | before(TCP_SKB_CB(skb)->seq, |
| 1363 | TCP_SKB_CB(tp->lost_skb_hint)->seq)) | ||
| 1186 | tp->lost_cnt_hint += pcount; | 1364 | tp->lost_cnt_hint += pcount; |
| 1187 | 1365 | ||
| 1188 | if (fack_count > tp->fackets_out) | 1366 | if (fack_count > tp->fackets_out) |
| @@ -1201,30 +1379,16 @@ static u8 tcp_sacktag_one(struct sock *sk, | |||
| 1201 | return sacked; | 1379 | return sacked; |
| 1202 | } | 1380 | } |
| 1203 | 1381 | ||
| 1204 | /* Shift newly-SACKed bytes from this skb to the immediately previous | 1382 | static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, |
| 1205 | * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. | 1383 | struct tcp_sacktag_state *state, |
| 1206 | */ | 1384 | unsigned int pcount, int shifted, int mss, |
| 1207 | static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | 1385 | int dup_sack) |
| 1208 | struct tcp_sacktag_state *state, | ||
| 1209 | unsigned int pcount, int shifted, int mss, | ||
| 1210 | bool dup_sack) | ||
| 1211 | { | 1386 | { |
| 1212 | struct tcp_sock *tp = tcp_sk(sk); | 1387 | struct tcp_sock *tp = tcp_sk(sk); |
| 1213 | struct sk_buff *prev = tcp_write_queue_prev(sk, skb); | 1388 | struct sk_buff *prev = tcp_write_queue_prev(sk, skb); |
| 1214 | u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ | ||
| 1215 | u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ | ||
| 1216 | 1389 | ||
| 1217 | BUG_ON(!pcount); | 1390 | BUG_ON(!pcount); |
| 1218 | 1391 | ||
| 1219 | /* Adjust counters and hints for the newly sacked sequence | ||
| 1220 | * range but discard the return value since prev is already | ||
| 1221 | * marked. We must tag the range first because the seq | ||
| 1222 | * advancement below implicitly advances | ||
| 1223 | * tcp_highest_sack_seq() when skb is highest_sack. | ||
| 1224 | */ | ||
| 1225 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, | ||
| 1226 | start_seq, end_seq, dup_sack, pcount); | ||
| 1227 | |||
| 1228 | if (skb == tp->lost_skb_hint) | 1392 | if (skb == tp->lost_skb_hint) |
| 1229 | tp->lost_cnt_hint += pcount; | 1393 | tp->lost_cnt_hint += pcount; |
| 1230 | 1394 | ||
| @@ -1251,13 +1415,16 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
| 1251 | skb_shinfo(skb)->gso_type = 0; | 1415 | skb_shinfo(skb)->gso_type = 0; |
| 1252 | } | 1416 | } |
| 1253 | 1417 | ||
| 1418 | /* We discard results */ | ||
| 1419 | tcp_sacktag_one(skb, sk, state, dup_sack, pcount); | ||
| 1420 | |||
| 1254 | /* Difference in this won't matter, both ACKed by the same cumul. ACK */ | 1421 | /* Difference in this won't matter, both ACKed by the same cumul. ACK */ |
| 1255 | TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); | 1422 | TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); |
| 1256 | 1423 | ||
| 1257 | if (skb->len > 0) { | 1424 | if (skb->len > 0) { |
| 1258 | BUG_ON(!tcp_skb_pcount(skb)); | 1425 | BUG_ON(!tcp_skb_pcount(skb)); |
| 1259 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); | 1426 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); |
| 1260 | return false; | 1427 | return 0; |
| 1261 | } | 1428 | } |
| 1262 | 1429 | ||
| 1263 | /* Whole SKB was eaten :-) */ | 1430 | /* Whole SKB was eaten :-) */ |
| @@ -1271,7 +1438,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
| 1271 | tp->lost_cnt_hint -= tcp_skb_pcount(prev); | 1438 | tp->lost_cnt_hint -= tcp_skb_pcount(prev); |
| 1272 | } | 1439 | } |
| 1273 | 1440 | ||
| 1274 | TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; | 1441 | TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; |
| 1275 | if (skb == tcp_highest_sack(sk)) | 1442 | if (skb == tcp_highest_sack(sk)) |
| 1276 | tcp_advance_highest_sack(sk, skb); | 1443 | tcp_advance_highest_sack(sk, skb); |
| 1277 | 1444 | ||
| @@ -1280,19 +1447,19 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, | |||
| 1280 | 1447 | ||
| 1281 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); | 1448 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); |
| 1282 | 1449 | ||
| 1283 | return true; | 1450 | return 1; |
| 1284 | } | 1451 | } |
| 1285 | 1452 | ||
| 1286 | /* I wish gso_size would have a bit more sane initialization than | 1453 | /* I wish gso_size would have a bit more sane initialization than |
| 1287 | * something-or-zero which complicates things | 1454 | * something-or-zero which complicates things |
| 1288 | */ | 1455 | */ |
| 1289 | static int tcp_skb_seglen(const struct sk_buff *skb) | 1456 | static int tcp_skb_seglen(struct sk_buff *skb) |
| 1290 | { | 1457 | { |
| 1291 | return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); | 1458 | return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); |
| 1292 | } | 1459 | } |
| 1293 | 1460 | ||
| 1294 | /* Shifting pages past head area doesn't work */ | 1461 | /* Shifting pages past head area doesn't work */ |
| 1295 | static int skb_can_shift(const struct sk_buff *skb) | 1462 | static int skb_can_shift(struct sk_buff *skb) |
| 1296 | { | 1463 | { |
| 1297 | return !skb_headlen(skb) && skb_is_nonlinear(skb); | 1464 | return !skb_headlen(skb) && skb_is_nonlinear(skb); |
| 1298 | } | 1465 | } |
| @@ -1303,7 +1470,7 @@ static int skb_can_shift(const struct sk_buff *skb) | |||
| 1303 | static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, | 1470 | static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, |
| 1304 | struct tcp_sacktag_state *state, | 1471 | struct tcp_sacktag_state *state, |
| 1305 | u32 start_seq, u32 end_seq, | 1472 | u32 start_seq, u32 end_seq, |
| 1306 | bool dup_sack) | 1473 | int dup_sack) |
| 1307 | { | 1474 | { |
| 1308 | struct tcp_sock *tp = tcp_sk(sk); | 1475 | struct tcp_sock *tp = tcp_sk(sk); |
| 1309 | struct sk_buff *prev; | 1476 | struct sk_buff *prev; |
| @@ -1398,10 +1565,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, | |||
| 1398 | } | 1565 | } |
| 1399 | } | 1566 | } |
| 1400 | 1567 | ||
| 1401 | /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ | ||
| 1402 | if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) | ||
| 1403 | goto fallback; | ||
| 1404 | |||
| 1405 | if (!skb_shift(prev, skb, len)) | 1568 | if (!skb_shift(prev, skb, len)) |
| 1406 | goto fallback; | 1569 | goto fallback; |
| 1407 | if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) | 1570 | if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) |
| @@ -1442,14 +1605,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
| 1442 | struct tcp_sack_block *next_dup, | 1605 | struct tcp_sack_block *next_dup, |
| 1443 | struct tcp_sacktag_state *state, | 1606 | struct tcp_sacktag_state *state, |
| 1444 | u32 start_seq, u32 end_seq, | 1607 | u32 start_seq, u32 end_seq, |
| 1445 | bool dup_sack_in) | 1608 | int dup_sack_in) |
| 1446 | { | 1609 | { |
| 1447 | struct tcp_sock *tp = tcp_sk(sk); | 1610 | struct tcp_sock *tp = tcp_sk(sk); |
| 1448 | struct sk_buff *tmp; | 1611 | struct sk_buff *tmp; |
| 1449 | 1612 | ||
| 1450 | tcp_for_write_queue_from(skb, sk) { | 1613 | tcp_for_write_queue_from(skb, sk) { |
| 1451 | int in_sack = 0; | 1614 | int in_sack = 0; |
| 1452 | bool dup_sack = dup_sack_in; | 1615 | int dup_sack = dup_sack_in; |
| 1453 | 1616 | ||
| 1454 | if (skb == tcp_send_head(sk)) | 1617 | if (skb == tcp_send_head(sk)) |
| 1455 | break; | 1618 | break; |
| @@ -1464,7 +1627,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
| 1464 | next_dup->start_seq, | 1627 | next_dup->start_seq, |
| 1465 | next_dup->end_seq); | 1628 | next_dup->end_seq); |
| 1466 | if (in_sack > 0) | 1629 | if (in_sack > 0) |
| 1467 | dup_sack = true; | 1630 | dup_sack = 1; |
| 1468 | } | 1631 | } |
| 1469 | 1632 | ||
| 1470 | /* skb reference here is a bit tricky to get right, since | 1633 | /* skb reference here is a bit tricky to get right, since |
| @@ -1492,14 +1655,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, | |||
| 1492 | break; | 1655 | break; |
| 1493 | 1656 | ||
| 1494 | if (in_sack) { | 1657 | if (in_sack) { |
| 1495 | TCP_SKB_CB(skb)->sacked = | 1658 | TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, |
| 1496 | tcp_sacktag_one(sk, | 1659 | state, |
| 1497 | state, | 1660 | dup_sack, |
| 1498 | TCP_SKB_CB(skb)->sacked, | 1661 | tcp_skb_pcount(skb)); |
| 1499 | TCP_SKB_CB(skb)->seq, | ||
| 1500 | TCP_SKB_CB(skb)->end_seq, | ||
| 1501 | dup_sack, | ||
| 1502 | tcp_skb_pcount(skb)); | ||
| 1503 | 1662 | ||
| 1504 | if (!before(TCP_SKB_CB(skb)->seq, | 1663 | if (!before(TCP_SKB_CB(skb)->seq, |
| 1505 | tcp_highest_sack_seq(tp))) | 1664 | tcp_highest_sack_seq(tp))) |
| @@ -1549,19 +1708,19 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, | |||
| 1549 | return skb; | 1708 | return skb; |
| 1550 | } | 1709 | } |
| 1551 | 1710 | ||
| 1552 | static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) | 1711 | static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) |
| 1553 | { | 1712 | { |
| 1554 | return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); | 1713 | return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); |
| 1555 | } | 1714 | } |
| 1556 | 1715 | ||
| 1557 | static int | 1716 | static int |
| 1558 | tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | 1717 | tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, |
| 1559 | u32 prior_snd_una) | 1718 | u32 prior_snd_una) |
| 1560 | { | 1719 | { |
| 1561 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1720 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 1562 | struct tcp_sock *tp = tcp_sk(sk); | 1721 | struct tcp_sock *tp = tcp_sk(sk); |
| 1563 | const unsigned char *ptr = (skb_transport_header(ack_skb) + | 1722 | unsigned char *ptr = (skb_transport_header(ack_skb) + |
| 1564 | TCP_SKB_CB(ack_skb)->sacked); | 1723 | TCP_SKB_CB(ack_skb)->sacked); |
| 1565 | struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); | 1724 | struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); |
| 1566 | struct tcp_sack_block sp[TCP_NUM_SACKS]; | 1725 | struct tcp_sack_block sp[TCP_NUM_SACKS]; |
| 1567 | struct tcp_sack_block *cache; | 1726 | struct tcp_sack_block *cache; |
| @@ -1569,7 +1728,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
| 1569 | struct sk_buff *skb; | 1728 | struct sk_buff *skb; |
| 1570 | int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); | 1729 | int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); |
| 1571 | int used_sacks; | 1730 | int used_sacks; |
| 1572 | bool found_dup_sack = false; | 1731 | int found_dup_sack = 0; |
| 1573 | int i, j; | 1732 | int i, j; |
| 1574 | int first_sack_index; | 1733 | int first_sack_index; |
| 1575 | 1734 | ||
| @@ -1600,7 +1759,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
| 1600 | used_sacks = 0; | 1759 | used_sacks = 0; |
| 1601 | first_sack_index = 0; | 1760 | first_sack_index = 0; |
| 1602 | for (i = 0; i < num_sacks; i++) { | 1761 | for (i = 0; i < num_sacks; i++) { |
| 1603 | bool dup_sack = !i && found_dup_sack; | 1762 | int dup_sack = !i && found_dup_sack; |
| 1604 | 1763 | ||
| 1605 | sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); | 1764 | sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); |
| 1606 | sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); | 1765 | sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); |
| @@ -1667,12 +1826,16 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | |||
| 1667 | while (i < used_sacks) { | 1826 | while (i < used_sacks) { |
| 1668 | u32 start_seq = sp[i].start_seq; | 1827 | u32 start_seq = sp[i].start_seq; |
| 1669 | u32 end_seq = sp[i].end_seq; | 1828 | u32 end_seq = sp[i].end_seq; |
| 1670 | bool dup_sack = (found_dup_sack && (i == first_sack_index)); | 1829 | int dup_sack = (found_dup_sack && (i == first_sack_index)); |
| 1671 | struct tcp_sack_block *next_dup = NULL; | 1830 | struct tcp_sack_block *next_dup = NULL; |
| 1672 | 1831 | ||
| 1673 | if (found_dup_sack && ((i + 1) == first_sack_index)) | 1832 | if (found_dup_sack && ((i + 1) == first_sack_index)) |
| 1674 | next_dup = &sp[i + 1]; | 1833 | next_dup = &sp[i + 1]; |
| 1675 | 1834 | ||
| 1835 | /* Event "B" in the comment above. */ | ||
| 1836 | if (after(end_seq, tp->high_seq)) | ||
| 1837 | state.flag |= FLAG_DATA_LOST; | ||
| 1838 | |||
| 1676 | /* Skip too early cached blocks */ | 1839 | /* Skip too early cached blocks */ |
| 1677 | while (tcp_sack_cache_ok(tp, cache) && | 1840 | while (tcp_sack_cache_ok(tp, cache) && |
| 1678 | !before(start_seq, cache->end_seq)) | 1841 | !before(start_seq, cache->end_seq)) |
| @@ -1769,9 +1932,9 @@ out: | |||
| 1769 | } | 1932 | } |
| 1770 | 1933 | ||
| 1771 | /* Limits sacked_out so that sum with lost_out isn't ever larger than | 1934 | /* Limits sacked_out so that sum with lost_out isn't ever larger than |
| 1772 | * packets_out. Returns false if sacked_out adjustement wasn't necessary. | 1935 | * packets_out. Returns zero if sacked_out adjustement wasn't necessary. |
| 1773 | */ | 1936 | */ |
| 1774 | static bool tcp_limit_reno_sacked(struct tcp_sock *tp) | 1937 | static int tcp_limit_reno_sacked(struct tcp_sock *tp) |
| 1775 | { | 1938 | { |
| 1776 | u32 holes; | 1939 | u32 holes; |
| 1777 | 1940 | ||
| @@ -1780,9 +1943,9 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) | |||
| 1780 | 1943 | ||
| 1781 | if ((tp->sacked_out + holes) > tp->packets_out) { | 1944 | if ((tp->sacked_out + holes) > tp->packets_out) { |
| 1782 | tp->sacked_out = tp->packets_out - holes; | 1945 | tp->sacked_out = tp->packets_out - holes; |
| 1783 | return true; | 1946 | return 1; |
| 1784 | } | 1947 | } |
| 1785 | return false; | 1948 | return 0; |
| 1786 | } | 1949 | } |
| 1787 | 1950 | ||
| 1788 | /* If we receive more dupacks than we expected counting segments | 1951 | /* If we receive more dupacks than we expected counting segments |
| @@ -1836,40 +1999,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp) | |||
| 1836 | /* F-RTO can only be used if TCP has never retransmitted anything other than | 1999 | /* F-RTO can only be used if TCP has never retransmitted anything other than |
| 1837 | * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) | 2000 | * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) |
| 1838 | */ | 2001 | */ |
| 1839 | bool tcp_use_frto(struct sock *sk) | 2002 | int tcp_use_frto(struct sock *sk) |
| 1840 | { | 2003 | { |
| 1841 | const struct tcp_sock *tp = tcp_sk(sk); | 2004 | const struct tcp_sock *tp = tcp_sk(sk); |
| 1842 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2005 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 1843 | struct sk_buff *skb; | 2006 | struct sk_buff *skb; |
| 1844 | 2007 | ||
| 1845 | if (!sysctl_tcp_frto) | 2008 | if (!sysctl_tcp_frto) |
| 1846 | return false; | 2009 | return 0; |
| 1847 | 2010 | ||
| 1848 | /* MTU probe and F-RTO won't really play nicely along currently */ | 2011 | /* MTU probe and F-RTO won't really play nicely along currently */ |
| 1849 | if (icsk->icsk_mtup.probe_size) | 2012 | if (icsk->icsk_mtup.probe_size) |
| 1850 | return false; | 2013 | return 0; |
| 1851 | 2014 | ||
| 1852 | if (tcp_is_sackfrto(tp)) | 2015 | if (tcp_is_sackfrto(tp)) |
| 1853 | return true; | 2016 | return 1; |
| 1854 | 2017 | ||
| 1855 | /* Avoid expensive walking of rexmit queue if possible */ | 2018 | /* Avoid expensive walking of rexmit queue if possible */ |
| 1856 | if (tp->retrans_out > 1) | 2019 | if (tp->retrans_out > 1) |
| 1857 | return false; | 2020 | return 0; |
| 1858 | 2021 | ||
| 1859 | skb = tcp_write_queue_head(sk); | 2022 | skb = tcp_write_queue_head(sk); |
| 1860 | if (tcp_skb_is_last(sk, skb)) | 2023 | if (tcp_skb_is_last(sk, skb)) |
| 1861 | return true; | 2024 | return 1; |
| 1862 | skb = tcp_write_queue_next(sk, skb); /* Skips head */ | 2025 | skb = tcp_write_queue_next(sk, skb); /* Skips head */ |
| 1863 | tcp_for_write_queue_from(skb, sk) { | 2026 | tcp_for_write_queue_from(skb, sk) { |
| 1864 | if (skb == tcp_send_head(sk)) | 2027 | if (skb == tcp_send_head(sk)) |
| 1865 | break; | 2028 | break; |
| 1866 | if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) | 2029 | if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) |
| 1867 | return false; | 2030 | return 0; |
| 1868 | /* Short-circuit when first non-SACKed skb has been checked */ | 2031 | /* Short-circuit when first non-SACKed skb has been checked */ |
| 1869 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | 2032 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
| 1870 | break; | 2033 | break; |
| 1871 | } | 2034 | } |
| 1872 | return true; | 2035 | return 1; |
| 1873 | } | 2036 | } |
| 1874 | 2037 | ||
| 1875 | /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO | 2038 | /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO |
| @@ -2105,7 +2268,7 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
| 2105 | * | 2268 | * |
| 2106 | * Do processing similar to RTO timeout. | 2269 | * Do processing similar to RTO timeout. |
| 2107 | */ | 2270 | */ |
| 2108 | static bool tcp_check_sack_reneging(struct sock *sk, int flag) | 2271 | static int tcp_check_sack_reneging(struct sock *sk, int flag) |
| 2109 | { | 2272 | { |
| 2110 | if (flag & FLAG_SACK_RENEGING) { | 2273 | if (flag & FLAG_SACK_RENEGING) { |
| 2111 | struct inet_connection_sock *icsk = inet_csk(sk); | 2274 | struct inet_connection_sock *icsk = inet_csk(sk); |
| @@ -2116,12 +2279,12 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) | |||
| 2116 | tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); | 2279 | tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); |
| 2117 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 2280 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 2118 | icsk->icsk_rto, TCP_RTO_MAX); | 2281 | icsk->icsk_rto, TCP_RTO_MAX); |
| 2119 | return true; | 2282 | return 1; |
| 2120 | } | 2283 | } |
| 2121 | return false; | 2284 | return 0; |
| 2122 | } | 2285 | } |
| 2123 | 2286 | ||
| 2124 | static inline int tcp_fackets_out(const struct tcp_sock *tp) | 2287 | static inline int tcp_fackets_out(struct tcp_sock *tp) |
| 2125 | { | 2288 | { |
| 2126 | return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; | 2289 | return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; |
| 2127 | } | 2290 | } |
| @@ -2141,41 +2304,19 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) | |||
| 2141 | * they differ. Since neither occurs due to loss, TCP should really | 2304 | * they differ. Since neither occurs due to loss, TCP should really |
| 2142 | * ignore them. | 2305 | * ignore them. |
| 2143 | */ | 2306 | */ |
| 2144 | static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) | 2307 | static inline int tcp_dupack_heuristics(struct tcp_sock *tp) |
| 2145 | { | 2308 | { |
| 2146 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; | 2309 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; |
| 2147 | } | 2310 | } |
| 2148 | 2311 | ||
| 2149 | static bool tcp_pause_early_retransmit(struct sock *sk, int flag) | 2312 | static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) |
| 2150 | { | ||
| 2151 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2152 | unsigned long delay; | ||
| 2153 | |||
| 2154 | /* Delay early retransmit and entering fast recovery for | ||
| 2155 | * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples | ||
| 2156 | * available, or RTO is scheduled to fire first. | ||
| 2157 | */ | ||
| 2158 | if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) | ||
| 2159 | return false; | ||
| 2160 | |||
| 2161 | delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); | ||
| 2162 | if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) | ||
| 2163 | return false; | ||
| 2164 | |||
| 2165 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); | ||
| 2166 | tp->early_retrans_delayed = 1; | ||
| 2167 | return true; | ||
| 2168 | } | ||
| 2169 | |||
| 2170 | static inline int tcp_skb_timedout(const struct sock *sk, | ||
| 2171 | const struct sk_buff *skb) | ||
| 2172 | { | 2313 | { |
| 2173 | return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; | 2314 | return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; |
| 2174 | } | 2315 | } |
| 2175 | 2316 | ||
| 2176 | static inline int tcp_head_timedout(const struct sock *sk) | 2317 | static inline int tcp_head_timedout(struct sock *sk) |
| 2177 | { | 2318 | { |
| 2178 | const struct tcp_sock *tp = tcp_sk(sk); | 2319 | struct tcp_sock *tp = tcp_sk(sk); |
| 2179 | 2320 | ||
| 2180 | return tp->packets_out && | 2321 | return tp->packets_out && |
| 2181 | tcp_skb_timedout(sk, tcp_write_queue_head(sk)); | 2322 | tcp_skb_timedout(sk, tcp_write_queue_head(sk)); |
| @@ -2274,28 +2415,28 @@ static inline int tcp_head_timedout(const struct sock *sk) | |||
| 2274 | * Main question: may we further continue forward transmission | 2415 | * Main question: may we further continue forward transmission |
| 2275 | * with the same cwnd? | 2416 | * with the same cwnd? |
| 2276 | */ | 2417 | */ |
| 2277 | static bool tcp_time_to_recover(struct sock *sk, int flag) | 2418 | static int tcp_time_to_recover(struct sock *sk) |
| 2278 | { | 2419 | { |
| 2279 | struct tcp_sock *tp = tcp_sk(sk); | 2420 | struct tcp_sock *tp = tcp_sk(sk); |
| 2280 | __u32 packets_out; | 2421 | __u32 packets_out; |
| 2281 | 2422 | ||
| 2282 | /* Do not perform any recovery during F-RTO algorithm */ | 2423 | /* Do not perform any recovery during F-RTO algorithm */ |
| 2283 | if (tp->frto_counter) | 2424 | if (tp->frto_counter) |
| 2284 | return false; | 2425 | return 0; |
| 2285 | 2426 | ||
| 2286 | /* Trick#1: The loss is proven. */ | 2427 | /* Trick#1: The loss is proven. */ |
| 2287 | if (tp->lost_out) | 2428 | if (tp->lost_out) |
| 2288 | return true; | 2429 | return 1; |
| 2289 | 2430 | ||
| 2290 | /* Not-A-Trick#2 : Classic rule... */ | 2431 | /* Not-A-Trick#2 : Classic rule... */ |
| 2291 | if (tcp_dupack_heuristics(tp) > tp->reordering) | 2432 | if (tcp_dupack_heuristics(tp) > tp->reordering) |
| 2292 | return true; | 2433 | return 1; |
| 2293 | 2434 | ||
| 2294 | /* Trick#3 : when we use RFC2988 timer restart, fast | 2435 | /* Trick#3 : when we use RFC2988 timer restart, fast |
| 2295 | * retransmit can be triggered by timeout of queue head. | 2436 | * retransmit can be triggered by timeout of queue head. |
| 2296 | */ | 2437 | */ |
| 2297 | if (tcp_is_fack(tp) && tcp_head_timedout(sk)) | 2438 | if (tcp_is_fack(tp) && tcp_head_timedout(sk)) |
| 2298 | return true; | 2439 | return 1; |
| 2299 | 2440 | ||
| 2300 | /* Trick#4: It is still not OK... But will it be useful to delay | 2441 | /* Trick#4: It is still not OK... But will it be useful to delay |
| 2301 | * recovery more? | 2442 | * recovery more? |
| @@ -2307,7 +2448,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) | |||
| 2307 | /* We have nothing to send. This connection is limited | 2448 | /* We have nothing to send. This connection is limited |
| 2308 | * either by receiver window or by application. | 2449 | * either by receiver window or by application. |
| 2309 | */ | 2450 | */ |
| 2310 | return true; | 2451 | return 1; |
| 2311 | } | 2452 | } |
| 2312 | 2453 | ||
| 2313 | /* If a thin stream is detected, retransmit after first | 2454 | /* If a thin stream is detected, retransmit after first |
| @@ -2318,19 +2459,9 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) | |||
| 2318 | if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && | 2459 | if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && |
| 2319 | tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && | 2460 | tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && |
| 2320 | tcp_is_sack(tp) && !tcp_send_head(sk)) | 2461 | tcp_is_sack(tp) && !tcp_send_head(sk)) |
| 2321 | return true; | 2462 | return 1; |
| 2322 | 2463 | ||
| 2323 | /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious | 2464 | return 0; |
| 2324 | * retransmissions due to small network reorderings, we implement | ||
| 2325 | * Mitigation A.3 in the RFC and delay the retransmission for a short | ||
| 2326 | * interval if appropriate. | ||
| 2327 | */ | ||
| 2328 | if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && | ||
| 2329 | (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && | ||
| 2330 | !tcp_may_send_now(sk)) | ||
| 2331 | return !tcp_pause_early_retransmit(sk, flag); | ||
| 2332 | |||
| 2333 | return false; | ||
| 2334 | } | 2465 | } |
| 2335 | 2466 | ||
| 2336 | /* New heuristics: it is possible only after we switched to restart timer | 2467 | /* New heuristics: it is possible only after we switched to restart timer |
| @@ -2371,11 +2502,8 @@ static void tcp_timeout_skbs(struct sock *sk) | |||
| 2371 | tcp_verify_left_out(tp); | 2502 | tcp_verify_left_out(tp); |
| 2372 | } | 2503 | } |
| 2373 | 2504 | ||
| 2374 | /* Detect loss in event "A" above by marking head of queue up as lost. | 2505 | /* Mark head of queue up as lost. With RFC3517 SACK, the packets is |
| 2375 | * For FACK or non-SACK(Reno) senders, the first "packets" number of segments | 2506 | * is against sacked "cnt", otherwise it's against facked "cnt" |
| 2376 | * are considered lost. For RFC3517 SACK, a segment is considered lost if it | ||
| 2377 | * has at least tp->reordering SACKed seqments above it; "packets" refers to | ||
| 2378 | * the maximum SACKed segments to pass before reaching this limit. | ||
| 2379 | */ | 2507 | */ |
| 2380 | static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | 2508 | static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) |
| 2381 | { | 2509 | { |
| @@ -2384,8 +2512,6 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
| 2384 | int cnt, oldcnt; | 2512 | int cnt, oldcnt; |
| 2385 | int err; | 2513 | int err; |
| 2386 | unsigned int mss; | 2514 | unsigned int mss; |
| 2387 | /* Use SACK to deduce losses of new sequences sent during recovery */ | ||
| 2388 | const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; | ||
| 2389 | 2515 | ||
| 2390 | WARN_ON(packets > tp->packets_out); | 2516 | WARN_ON(packets > tp->packets_out); |
| 2391 | if (tp->lost_skb_hint) { | 2517 | if (tp->lost_skb_hint) { |
| @@ -2407,7 +2533,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
| 2407 | tp->lost_skb_hint = skb; | 2533 | tp->lost_skb_hint = skb; |
| 2408 | tp->lost_cnt_hint = cnt; | 2534 | tp->lost_cnt_hint = cnt; |
| 2409 | 2535 | ||
| 2410 | if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) | 2536 | if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) |
| 2411 | break; | 2537 | break; |
| 2412 | 2538 | ||
| 2413 | oldcnt = cnt; | 2539 | oldcnt = cnt; |
| @@ -2417,7 +2543,6 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) | |||
| 2417 | 2543 | ||
| 2418 | if (cnt > packets) { | 2544 | if (cnt > packets) { |
| 2419 | if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || | 2545 | if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || |
| 2420 | (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || | ||
| 2421 | (oldcnt >= packets)) | 2546 | (oldcnt >= packets)) |
| 2422 | break; | 2547 | break; |
| 2423 | 2548 | ||
| @@ -2470,10 +2595,39 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
| 2470 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2595 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 2471 | } | 2596 | } |
| 2472 | 2597 | ||
| 2598 | /* Lower bound on congestion window is slow start threshold | ||
| 2599 | * unless congestion avoidance choice decides to overide it. | ||
| 2600 | */ | ||
| 2601 | static inline u32 tcp_cwnd_min(const struct sock *sk) | ||
| 2602 | { | ||
| 2603 | const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; | ||
| 2604 | |||
| 2605 | return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; | ||
| 2606 | } | ||
| 2607 | |||
| 2608 | /* Decrease cwnd each second ack. */ | ||
| 2609 | static void tcp_cwnd_down(struct sock *sk, int flag) | ||
| 2610 | { | ||
| 2611 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2612 | int decr = tp->snd_cwnd_cnt + 1; | ||
| 2613 | |||
| 2614 | if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || | ||
| 2615 | (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { | ||
| 2616 | tp->snd_cwnd_cnt = decr & 1; | ||
| 2617 | decr >>= 1; | ||
| 2618 | |||
| 2619 | if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) | ||
| 2620 | tp->snd_cwnd -= decr; | ||
| 2621 | |||
| 2622 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); | ||
| 2623 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 2624 | } | ||
| 2625 | } | ||
| 2626 | |||
| 2473 | /* Nothing was retransmitted or returned timestamp is less | 2627 | /* Nothing was retransmitted or returned timestamp is less |
| 2474 | * than timestamp of the first retransmission. | 2628 | * than timestamp of the first retransmission. |
| 2475 | */ | 2629 | */ |
| 2476 | static inline bool tcp_packet_delayed(const struct tcp_sock *tp) | 2630 | static inline int tcp_packet_delayed(struct tcp_sock *tp) |
| 2477 | { | 2631 | { |
| 2478 | return !tp->retrans_stamp || | 2632 | return !tp->retrans_stamp || |
| 2479 | (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 2633 | (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| @@ -2489,22 +2643,22 @@ static void DBGUNDO(struct sock *sk, const char *msg) | |||
| 2489 | struct inet_sock *inet = inet_sk(sk); | 2643 | struct inet_sock *inet = inet_sk(sk); |
| 2490 | 2644 | ||
| 2491 | if (sk->sk_family == AF_INET) { | 2645 | if (sk->sk_family == AF_INET) { |
| 2492 | pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", | 2646 | printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", |
| 2493 | msg, | 2647 | msg, |
| 2494 | &inet->inet_daddr, ntohs(inet->inet_dport), | 2648 | &inet->inet_daddr, ntohs(inet->inet_dport), |
| 2495 | tp->snd_cwnd, tcp_left_out(tp), | 2649 | tp->snd_cwnd, tcp_left_out(tp), |
| 2496 | tp->snd_ssthresh, tp->prior_ssthresh, | 2650 | tp->snd_ssthresh, tp->prior_ssthresh, |
| 2497 | tp->packets_out); | 2651 | tp->packets_out); |
| 2498 | } | 2652 | } |
| 2499 | #if IS_ENABLED(CONFIG_IPV6) | 2653 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 2500 | else if (sk->sk_family == AF_INET6) { | 2654 | else if (sk->sk_family == AF_INET6) { |
| 2501 | struct ipv6_pinfo *np = inet6_sk(sk); | 2655 | struct ipv6_pinfo *np = inet6_sk(sk); |
| 2502 | pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", | 2656 | printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", |
| 2503 | msg, | 2657 | msg, |
| 2504 | &np->daddr, ntohs(inet->inet_dport), | 2658 | &np->daddr, ntohs(inet->inet_dport), |
| 2505 | tp->snd_cwnd, tcp_left_out(tp), | 2659 | tp->snd_cwnd, tcp_left_out(tp), |
| 2506 | tp->snd_ssthresh, tp->prior_ssthresh, | 2660 | tp->snd_ssthresh, tp->prior_ssthresh, |
| 2507 | tp->packets_out); | 2661 | tp->packets_out); |
| 2508 | } | 2662 | } |
| 2509 | #endif | 2663 | #endif |
| 2510 | } | 2664 | } |
| @@ -2534,13 +2688,13 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) | |||
| 2534 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2688 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 2535 | } | 2689 | } |
| 2536 | 2690 | ||
| 2537 | static inline bool tcp_may_undo(const struct tcp_sock *tp) | 2691 | static inline int tcp_may_undo(struct tcp_sock *tp) |
| 2538 | { | 2692 | { |
| 2539 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); | 2693 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); |
| 2540 | } | 2694 | } |
| 2541 | 2695 | ||
| 2542 | /* People celebrate: "We love our President!" */ | 2696 | /* People celebrate: "We love our President!" */ |
| 2543 | static bool tcp_try_undo_recovery(struct sock *sk) | 2697 | static int tcp_try_undo_recovery(struct sock *sk) |
| 2544 | { | 2698 | { |
| 2545 | struct tcp_sock *tp = tcp_sk(sk); | 2699 | struct tcp_sock *tp = tcp_sk(sk); |
| 2546 | 2700 | ||
| @@ -2565,10 +2719,10 @@ static bool tcp_try_undo_recovery(struct sock *sk) | |||
| 2565 | * is ACKed. For Reno it is MUST to prevent false | 2719 | * is ACKed. For Reno it is MUST to prevent false |
| 2566 | * fast retransmits (RFC2582). SACK TCP is safe. */ | 2720 | * fast retransmits (RFC2582). SACK TCP is safe. */ |
| 2567 | tcp_moderate_cwnd(tp); | 2721 | tcp_moderate_cwnd(tp); |
| 2568 | return true; | 2722 | return 1; |
| 2569 | } | 2723 | } |
| 2570 | tcp_set_ca_state(sk, TCP_CA_Open); | 2724 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 2571 | return false; | 2725 | return 0; |
| 2572 | } | 2726 | } |
| 2573 | 2727 | ||
| 2574 | /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ | 2728 | /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ |
| @@ -2598,19 +2752,19 @@ static void tcp_try_undo_dsack(struct sock *sk) | |||
| 2598 | * that successive retransmissions of a segment must not advance | 2752 | * that successive retransmissions of a segment must not advance |
| 2599 | * retrans_stamp under any conditions. | 2753 | * retrans_stamp under any conditions. |
| 2600 | */ | 2754 | */ |
| 2601 | static bool tcp_any_retrans_done(const struct sock *sk) | 2755 | static int tcp_any_retrans_done(struct sock *sk) |
| 2602 | { | 2756 | { |
| 2603 | const struct tcp_sock *tp = tcp_sk(sk); | 2757 | struct tcp_sock *tp = tcp_sk(sk); |
| 2604 | struct sk_buff *skb; | 2758 | struct sk_buff *skb; |
| 2605 | 2759 | ||
| 2606 | if (tp->retrans_out) | 2760 | if (tp->retrans_out) |
| 2607 | return true; | 2761 | return 1; |
| 2608 | 2762 | ||
| 2609 | skb = tcp_write_queue_head(sk); | 2763 | skb = tcp_write_queue_head(sk); |
| 2610 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) | 2764 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) |
| 2611 | return true; | 2765 | return 1; |
| 2612 | 2766 | ||
| 2613 | return false; | 2767 | return 0; |
| 2614 | } | 2768 | } |
| 2615 | 2769 | ||
| 2616 | /* Undo during fast recovery after partial ACK. */ | 2770 | /* Undo during fast recovery after partial ACK. */ |
| @@ -2644,7 +2798,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) | |||
| 2644 | } | 2798 | } |
| 2645 | 2799 | ||
| 2646 | /* Undo during loss recovery after partial ACK. */ | 2800 | /* Undo during loss recovery after partial ACK. */ |
| 2647 | static bool tcp_try_undo_loss(struct sock *sk) | 2801 | static int tcp_try_undo_loss(struct sock *sk) |
| 2648 | { | 2802 | { |
| 2649 | struct tcp_sock *tp = tcp_sk(sk); | 2803 | struct tcp_sock *tp = tcp_sk(sk); |
| 2650 | 2804 | ||
| @@ -2666,91 +2820,28 @@ static bool tcp_try_undo_loss(struct sock *sk) | |||
| 2666 | tp->undo_marker = 0; | 2820 | tp->undo_marker = 0; |
| 2667 | if (tcp_is_sack(tp)) | 2821 | if (tcp_is_sack(tp)) |
| 2668 | tcp_set_ca_state(sk, TCP_CA_Open); | 2822 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 2669 | return true; | 2823 | return 1; |
| 2670 | } | ||
| 2671 | return false; | ||
| 2672 | } | ||
| 2673 | |||
| 2674 | /* The cwnd reduction in CWR and Recovery use the PRR algorithm | ||
| 2675 | * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ | ||
| 2676 | * It computes the number of packets to send (sndcnt) based on packets newly | ||
| 2677 | * delivered: | ||
| 2678 | * 1) If the packets in flight is larger than ssthresh, PRR spreads the | ||
| 2679 | * cwnd reductions across a full RTT. | ||
| 2680 | * 2) If packets in flight is lower than ssthresh (such as due to excess | ||
| 2681 | * losses and/or application stalls), do not perform any further cwnd | ||
| 2682 | * reductions, but instead slow start up to ssthresh. | ||
| 2683 | */ | ||
| 2684 | static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) | ||
| 2685 | { | ||
| 2686 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2687 | |||
| 2688 | tp->high_seq = tp->snd_nxt; | ||
| 2689 | tp->bytes_acked = 0; | ||
| 2690 | tp->snd_cwnd_cnt = 0; | ||
| 2691 | tp->prior_cwnd = tp->snd_cwnd; | ||
| 2692 | tp->prr_delivered = 0; | ||
| 2693 | tp->prr_out = 0; | ||
| 2694 | if (set_ssthresh) | ||
| 2695 | tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); | ||
| 2696 | TCP_ECN_queue_cwr(tp); | ||
| 2697 | } | ||
| 2698 | |||
| 2699 | static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, | ||
| 2700 | int fast_rexmit) | ||
| 2701 | { | ||
| 2702 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2703 | int sndcnt = 0; | ||
| 2704 | int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); | ||
| 2705 | |||
| 2706 | tp->prr_delivered += newly_acked_sacked; | ||
| 2707 | if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { | ||
| 2708 | u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + | ||
| 2709 | tp->prior_cwnd - 1; | ||
| 2710 | sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; | ||
| 2711 | } else { | ||
| 2712 | sndcnt = min_t(int, delta, | ||
| 2713 | max_t(int, tp->prr_delivered - tp->prr_out, | ||
| 2714 | newly_acked_sacked) + 1); | ||
| 2715 | } | 2824 | } |
| 2716 | 2825 | return 0; | |
| 2717 | sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); | ||
| 2718 | tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; | ||
| 2719 | } | 2826 | } |
| 2720 | 2827 | ||
| 2721 | static inline void tcp_end_cwnd_reduction(struct sock *sk) | 2828 | static inline void tcp_complete_cwr(struct sock *sk) |
| 2722 | { | 2829 | { |
| 2723 | struct tcp_sock *tp = tcp_sk(sk); | 2830 | struct tcp_sock *tp = tcp_sk(sk); |
| 2724 | 2831 | /* Do not moderate cwnd if it's already undone in cwr or recovery */ | |
| 2725 | /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ | 2832 | if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { |
| 2726 | if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || | ||
| 2727 | (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { | ||
| 2728 | tp->snd_cwnd = tp->snd_ssthresh; | 2833 | tp->snd_cwnd = tp->snd_ssthresh; |
| 2729 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2834 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 2730 | } | 2835 | } |
| 2731 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); | 2836 | tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); |
| 2732 | } | 2837 | } |
| 2733 | 2838 | ||
| 2734 | /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ | ||
| 2735 | void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) | ||
| 2736 | { | ||
| 2737 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2738 | |||
| 2739 | tp->prior_ssthresh = 0; | ||
| 2740 | tp->bytes_acked = 0; | ||
| 2741 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { | ||
| 2742 | tp->undo_marker = 0; | ||
| 2743 | tcp_init_cwnd_reduction(sk, set_ssthresh); | ||
| 2744 | tcp_set_ca_state(sk, TCP_CA_CWR); | ||
| 2745 | } | ||
| 2746 | } | ||
| 2747 | |||
| 2748 | static void tcp_try_keep_open(struct sock *sk) | 2839 | static void tcp_try_keep_open(struct sock *sk) |
| 2749 | { | 2840 | { |
| 2750 | struct tcp_sock *tp = tcp_sk(sk); | 2841 | struct tcp_sock *tp = tcp_sk(sk); |
| 2751 | int state = TCP_CA_Open; | 2842 | int state = TCP_CA_Open; |
| 2752 | 2843 | ||
| 2753 | if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) | 2844 | if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) |
| 2754 | state = TCP_CA_Disorder; | 2845 | state = TCP_CA_Disorder; |
| 2755 | 2846 | ||
| 2756 | if (inet_csk(sk)->icsk_ca_state != state) { | 2847 | if (inet_csk(sk)->icsk_ca_state != state) { |
| @@ -2759,7 +2850,7 @@ static void tcp_try_keep_open(struct sock *sk) | |||
| 2759 | } | 2850 | } |
| 2760 | } | 2851 | } |
| 2761 | 2852 | ||
| 2762 | static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) | 2853 | static void tcp_try_to_open(struct sock *sk, int flag) |
| 2763 | { | 2854 | { |
| 2764 | struct tcp_sock *tp = tcp_sk(sk); | 2855 | struct tcp_sock *tp = tcp_sk(sk); |
| 2765 | 2856 | ||
| @@ -2773,10 +2864,9 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) | |||
| 2773 | 2864 | ||
| 2774 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { | 2865 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { |
| 2775 | tcp_try_keep_open(sk); | 2866 | tcp_try_keep_open(sk); |
| 2776 | if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) | 2867 | tcp_moderate_cwnd(tp); |
| 2777 | tcp_moderate_cwnd(tp); | ||
| 2778 | } else { | 2868 | } else { |
| 2779 | tcp_cwnd_reduction(sk, newly_acked_sacked, 0); | 2869 | tcp_cwnd_down(sk, flag); |
| 2780 | } | 2870 | } |
| 2781 | } | 2871 | } |
| 2782 | 2872 | ||
| @@ -2858,30 +2948,6 @@ void tcp_simple_retransmit(struct sock *sk) | |||
| 2858 | } | 2948 | } |
| 2859 | EXPORT_SYMBOL(tcp_simple_retransmit); | 2949 | EXPORT_SYMBOL(tcp_simple_retransmit); |
| 2860 | 2950 | ||
| 2861 | static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | ||
| 2862 | { | ||
| 2863 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2864 | int mib_idx; | ||
| 2865 | |||
| 2866 | if (tcp_is_reno(tp)) | ||
| 2867 | mib_idx = LINUX_MIB_TCPRENORECOVERY; | ||
| 2868 | else | ||
| 2869 | mib_idx = LINUX_MIB_TCPSACKRECOVERY; | ||
| 2870 | |||
| 2871 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | ||
| 2872 | |||
| 2873 | tp->prior_ssthresh = 0; | ||
| 2874 | tp->undo_marker = tp->snd_una; | ||
| 2875 | tp->undo_retrans = tp->retrans_out; | ||
| 2876 | |||
| 2877 | if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { | ||
| 2878 | if (!ece_ack) | ||
| 2879 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | ||
| 2880 | tcp_init_cwnd_reduction(sk, true); | ||
| 2881 | } | ||
| 2882 | tcp_set_ca_state(sk, TCP_CA_Recovery); | ||
| 2883 | } | ||
| 2884 | |||
| 2885 | /* Process an event, which can update packets-in-flight not trivially. | 2951 | /* Process an event, which can update packets-in-flight not trivially. |
| 2886 | * Main goal of this function is to calculate new estimate for left_out, | 2952 | * Main goal of this function is to calculate new estimate for left_out, |
| 2887 | * taking into account both packets sitting in receiver's buffer and | 2953 | * taking into account both packets sitting in receiver's buffer and |
| @@ -2893,16 +2959,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | |||
| 2893 | * It does _not_ decide what to send, it is made in function | 2959 | * It does _not_ decide what to send, it is made in function |
| 2894 | * tcp_xmit_retransmit_queue(). | 2960 | * tcp_xmit_retransmit_queue(). |
| 2895 | */ | 2961 | */ |
| 2896 | static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | 2962 | static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) |
| 2897 | int prior_sacked, bool is_dupack, | ||
| 2898 | int flag) | ||
| 2899 | { | 2963 | { |
| 2900 | struct inet_connection_sock *icsk = inet_csk(sk); | 2964 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 2901 | struct tcp_sock *tp = tcp_sk(sk); | 2965 | struct tcp_sock *tp = tcp_sk(sk); |
| 2966 | int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | ||
| 2902 | int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && | 2967 | int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && |
| 2903 | (tcp_fackets_out(tp) > tp->reordering)); | 2968 | (tcp_fackets_out(tp) > tp->reordering)); |
| 2904 | int newly_acked_sacked = 0; | 2969 | int fast_rexmit = 0, mib_idx; |
| 2905 | int fast_rexmit = 0; | ||
| 2906 | 2970 | ||
| 2907 | if (WARN_ON(!tp->packets_out && tp->sacked_out)) | 2971 | if (WARN_ON(!tp->packets_out && tp->sacked_out)) |
| 2908 | tp->sacked_out = 0; | 2972 | tp->sacked_out = 0; |
| @@ -2918,10 +2982,19 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
| 2918 | if (tcp_check_sack_reneging(sk, flag)) | 2982 | if (tcp_check_sack_reneging(sk, flag)) |
| 2919 | return; | 2983 | return; |
| 2920 | 2984 | ||
| 2921 | /* C. Check consistency of the current state. */ | 2985 | /* C. Process data loss notification, provided it is valid. */ |
| 2986 | if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && | ||
| 2987 | before(tp->snd_una, tp->high_seq) && | ||
| 2988 | icsk->icsk_ca_state != TCP_CA_Open && | ||
| 2989 | tp->fackets_out > tp->reordering) { | ||
| 2990 | tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); | ||
| 2991 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); | ||
| 2992 | } | ||
| 2993 | |||
| 2994 | /* D. Check consistency of the current state. */ | ||
| 2922 | tcp_verify_left_out(tp); | 2995 | tcp_verify_left_out(tp); |
| 2923 | 2996 | ||
| 2924 | /* D. Check state exit conditions. State can be terminated | 2997 | /* E. Check state exit conditions. State can be terminated |
| 2925 | * when high_seq is ACKed. */ | 2998 | * when high_seq is ACKed. */ |
| 2926 | if (icsk->icsk_ca_state == TCP_CA_Open) { | 2999 | if (icsk->icsk_ca_state == TCP_CA_Open) { |
| 2927 | WARN_ON(tp->retrans_out != 0); | 3000 | WARN_ON(tp->retrans_out != 0); |
| @@ -2938,7 +3011,18 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
| 2938 | /* CWR is to be held something *above* high_seq | 3011 | /* CWR is to be held something *above* high_seq |
| 2939 | * is ACKed for CWR bit to reach receiver. */ | 3012 | * is ACKed for CWR bit to reach receiver. */ |
| 2940 | if (tp->snd_una != tp->high_seq) { | 3013 | if (tp->snd_una != tp->high_seq) { |
| 2941 | tcp_end_cwnd_reduction(sk); | 3014 | tcp_complete_cwr(sk); |
| 3015 | tcp_set_ca_state(sk, TCP_CA_Open); | ||
| 3016 | } | ||
| 3017 | break; | ||
| 3018 | |||
| 3019 | case TCP_CA_Disorder: | ||
| 3020 | tcp_try_undo_dsack(sk); | ||
| 3021 | if (!tp->undo_marker || | ||
| 3022 | /* For SACK case do not Open to allow to undo | ||
| 3023 | * catching for all duplicate ACKs. */ | ||
| 3024 | tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { | ||
| 3025 | tp->undo_marker = 0; | ||
| 2942 | tcp_set_ca_state(sk, TCP_CA_Open); | 3026 | tcp_set_ca_state(sk, TCP_CA_Open); |
| 2943 | } | 3027 | } |
| 2944 | break; | 3028 | break; |
| @@ -2948,12 +3032,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
| 2948 | tcp_reset_reno_sack(tp); | 3032 | tcp_reset_reno_sack(tp); |
| 2949 | if (tcp_try_undo_recovery(sk)) | 3033 | if (tcp_try_undo_recovery(sk)) |
| 2950 | return; | 3034 | return; |
| 2951 | tcp_end_cwnd_reduction(sk); | 3035 | tcp_complete_cwr(sk); |
| 2952 | break; | 3036 | break; |
| 2953 | } | 3037 | } |
| 2954 | } | 3038 | } |
| 2955 | 3039 | ||
| 2956 | /* E. Process state. */ | 3040 | /* F. Process state. */ |
| 2957 | switch (icsk->icsk_ca_state) { | 3041 | switch (icsk->icsk_ca_state) { |
| 2958 | case TCP_CA_Recovery: | 3042 | case TCP_CA_Recovery: |
| 2959 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { | 3043 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { |
| @@ -2961,7 +3045,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
| 2961 | tcp_add_reno_sack(sk); | 3045 | tcp_add_reno_sack(sk); |
| 2962 | } else | 3046 | } else |
| 2963 | do_lost = tcp_try_undo_partial(sk, pkts_acked); | 3047 | do_lost = tcp_try_undo_partial(sk, pkts_acked); |
| 2964 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; | ||
| 2965 | break; | 3048 | break; |
| 2966 | case TCP_CA_Loss: | 3049 | case TCP_CA_Loss: |
| 2967 | if (flag & FLAG_DATA_ACKED) | 3050 | if (flag & FLAG_DATA_ACKED) |
| @@ -2983,13 +3066,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
| 2983 | if (is_dupack) | 3066 | if (is_dupack) |
| 2984 | tcp_add_reno_sack(sk); | 3067 | tcp_add_reno_sack(sk); |
| 2985 | } | 3068 | } |
| 2986 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; | ||
| 2987 | 3069 | ||
| 2988 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) | 3070 | if (icsk->icsk_ca_state == TCP_CA_Disorder) |
| 2989 | tcp_try_undo_dsack(sk); | 3071 | tcp_try_undo_dsack(sk); |
| 2990 | 3072 | ||
| 2991 | if (!tcp_time_to_recover(sk, flag)) { | 3073 | if (!tcp_time_to_recover(sk)) { |
| 2992 | tcp_try_to_open(sk, flag, newly_acked_sacked); | 3074 | tcp_try_to_open(sk, flag); |
| 2993 | return; | 3075 | return; |
| 2994 | } | 3076 | } |
| 2995 | 3077 | ||
| @@ -3005,13 +3087,35 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
| 3005 | } | 3087 | } |
| 3006 | 3088 | ||
| 3007 | /* Otherwise enter Recovery state */ | 3089 | /* Otherwise enter Recovery state */ |
| 3008 | tcp_enter_recovery(sk, (flag & FLAG_ECE)); | 3090 | |
| 3091 | if (tcp_is_reno(tp)) | ||
| 3092 | mib_idx = LINUX_MIB_TCPRENORECOVERY; | ||
| 3093 | else | ||
| 3094 | mib_idx = LINUX_MIB_TCPSACKRECOVERY; | ||
| 3095 | |||
| 3096 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | ||
| 3097 | |||
| 3098 | tp->high_seq = tp->snd_nxt; | ||
| 3099 | tp->prior_ssthresh = 0; | ||
| 3100 | tp->undo_marker = tp->snd_una; | ||
| 3101 | tp->undo_retrans = tp->retrans_out; | ||
| 3102 | |||
| 3103 | if (icsk->icsk_ca_state < TCP_CA_CWR) { | ||
| 3104 | if (!(flag & FLAG_ECE)) | ||
| 3105 | tp->prior_ssthresh = tcp_current_ssthresh(sk); | ||
| 3106 | tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | ||
| 3107 | TCP_ECN_queue_cwr(tp); | ||
| 3108 | } | ||
| 3109 | |||
| 3110 | tp->bytes_acked = 0; | ||
| 3111 | tp->snd_cwnd_cnt = 0; | ||
| 3112 | tcp_set_ca_state(sk, TCP_CA_Recovery); | ||
| 3009 | fast_rexmit = 1; | 3113 | fast_rexmit = 1; |
| 3010 | } | 3114 | } |
| 3011 | 3115 | ||
| 3012 | if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) | 3116 | if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) |
| 3013 | tcp_update_scoreboard(sk, fast_rexmit); | 3117 | tcp_update_scoreboard(sk, fast_rexmit); |
| 3014 | tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); | 3118 | tcp_cwnd_down(sk, flag); |
| 3015 | tcp_xmit_retransmit_queue(sk); | 3119 | tcp_xmit_retransmit_queue(sk); |
| 3016 | } | 3120 | } |
| 3017 | 3121 | ||
| @@ -3086,53 +3190,16 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | |||
| 3086 | /* Restart timer after forward progress on connection. | 3190 | /* Restart timer after forward progress on connection. |
| 3087 | * RFC2988 recommends to restart timer to now+rto. | 3191 | * RFC2988 recommends to restart timer to now+rto. |
| 3088 | */ | 3192 | */ |
| 3089 | void tcp_rearm_rto(struct sock *sk) | 3193 | static void tcp_rearm_rto(struct sock *sk) |
| 3090 | { | 3194 | { |
| 3091 | struct tcp_sock *tp = tcp_sk(sk); | 3195 | struct tcp_sock *tp = tcp_sk(sk); |
| 3092 | 3196 | ||
| 3093 | /* If the retrans timer is currently being used by Fast Open | ||
| 3094 | * for SYN-ACK retrans purpose, stay put. | ||
| 3095 | */ | ||
| 3096 | if (tp->fastopen_rsk) | ||
| 3097 | return; | ||
| 3098 | |||
| 3099 | if (!tp->packets_out) { | 3197 | if (!tp->packets_out) { |
| 3100 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); | 3198 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
| 3101 | } else { | 3199 | } else { |
| 3102 | u32 rto = inet_csk(sk)->icsk_rto; | 3200 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 3103 | /* Offset the time elapsed after installing regular RTO */ | 3201 | inet_csk(sk)->icsk_rto, TCP_RTO_MAX); |
| 3104 | if (tp->early_retrans_delayed) { | ||
| 3105 | struct sk_buff *skb = tcp_write_queue_head(sk); | ||
| 3106 | const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; | ||
| 3107 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); | ||
| 3108 | /* delta may not be positive if the socket is locked | ||
| 3109 | * when the delayed ER timer fires and is rescheduled. | ||
| 3110 | */ | ||
| 3111 | if (delta > 0) | ||
| 3112 | rto = delta; | ||
| 3113 | } | ||
| 3114 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, | ||
| 3115 | TCP_RTO_MAX); | ||
| 3116 | } | 3202 | } |
| 3117 | tp->early_retrans_delayed = 0; | ||
| 3118 | } | ||
| 3119 | |||
| 3120 | /* This function is called when the delayed ER timer fires. TCP enters | ||
| 3121 | * fast recovery and performs fast-retransmit. | ||
| 3122 | */ | ||
| 3123 | void tcp_resume_early_retransmit(struct sock *sk) | ||
| 3124 | { | ||
| 3125 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 3126 | |||
| 3127 | tcp_rearm_rto(sk); | ||
| 3128 | |||
| 3129 | /* Stop if ER is disabled after the delayed ER timer is scheduled */ | ||
| 3130 | if (!tp->do_early_retrans) | ||
| 3131 | return; | ||
| 3132 | |||
| 3133 | tcp_enter_recovery(sk, false); | ||
| 3134 | tcp_update_scoreboard(sk, 1); | ||
| 3135 | tcp_xmit_retransmit_queue(sk); | ||
| 3136 | } | 3203 | } |
| 3137 | 3204 | ||
| 3138 | /* If we get here, the whole TSO packet has not been acked. */ | 3205 | /* If we get here, the whole TSO packet has not been acked. */ |
| @@ -3167,7 +3234,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
| 3167 | const struct inet_connection_sock *icsk = inet_csk(sk); | 3234 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 3168 | struct sk_buff *skb; | 3235 | struct sk_buff *skb; |
| 3169 | u32 now = tcp_time_stamp; | 3236 | u32 now = tcp_time_stamp; |
| 3170 | int fully_acked = true; | 3237 | int fully_acked = 1; |
| 3171 | int flag = 0; | 3238 | int flag = 0; |
| 3172 | u32 pkts_acked = 0; | 3239 | u32 pkts_acked = 0; |
| 3173 | u32 reord = tp->packets_out; | 3240 | u32 reord = tp->packets_out; |
| @@ -3191,7 +3258,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
| 3191 | if (!acked_pcount) | 3258 | if (!acked_pcount) |
| 3192 | break; | 3259 | break; |
| 3193 | 3260 | ||
| 3194 | fully_acked = false; | 3261 | fully_acked = 0; |
| 3195 | } else { | 3262 | } else { |
| 3196 | acked_pcount = tcp_skb_pcount(skb); | 3263 | acked_pcount = tcp_skb_pcount(skb); |
| 3197 | } | 3264 | } |
| @@ -3229,7 +3296,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
| 3229 | * connection startup slow start one packet too | 3296 | * connection startup slow start one packet too |
| 3230 | * quickly. This is severely frowned upon behavior. | 3297 | * quickly. This is severely frowned upon behavior. |
| 3231 | */ | 3298 | */ |
| 3232 | if (!(scb->tcp_flags & TCPHDR_SYN)) { | 3299 | if (!(scb->flags & TCPHDR_SYN)) { |
| 3233 | flag |= FLAG_DATA_ACKED; | 3300 | flag |= FLAG_DATA_ACKED; |
| 3234 | } else { | 3301 | } else { |
| 3235 | flag |= FLAG_SYN_ACKED; | 3302 | flag |= FLAG_SYN_ACKED; |
| @@ -3308,18 +3375,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
| 3308 | if (!tp->packets_out && tcp_is_sack(tp)) { | 3375 | if (!tp->packets_out && tcp_is_sack(tp)) { |
| 3309 | icsk = inet_csk(sk); | 3376 | icsk = inet_csk(sk); |
| 3310 | if (tp->lost_out) { | 3377 | if (tp->lost_out) { |
| 3311 | pr_debug("Leak l=%u %d\n", | 3378 | printk(KERN_DEBUG "Leak l=%u %d\n", |
| 3312 | tp->lost_out, icsk->icsk_ca_state); | 3379 | tp->lost_out, icsk->icsk_ca_state); |
| 3313 | tp->lost_out = 0; | 3380 | tp->lost_out = 0; |
| 3314 | } | 3381 | } |
| 3315 | if (tp->sacked_out) { | 3382 | if (tp->sacked_out) { |
| 3316 | pr_debug("Leak s=%u %d\n", | 3383 | printk(KERN_DEBUG "Leak s=%u %d\n", |
| 3317 | tp->sacked_out, icsk->icsk_ca_state); | 3384 | tp->sacked_out, icsk->icsk_ca_state); |
| 3318 | tp->sacked_out = 0; | 3385 | tp->sacked_out = 0; |
| 3319 | } | 3386 | } |
| 3320 | if (tp->retrans_out) { | 3387 | if (tp->retrans_out) { |
| 3321 | pr_debug("Leak r=%u %d\n", | 3388 | printk(KERN_DEBUG "Leak r=%u %d\n", |
| 3322 | tp->retrans_out, icsk->icsk_ca_state); | 3389 | tp->retrans_out, icsk->icsk_ca_state); |
| 3323 | tp->retrans_out = 0; | 3390 | tp->retrans_out = 0; |
| 3324 | } | 3391 | } |
| 3325 | } | 3392 | } |
| @@ -3347,23 +3414,23 @@ static void tcp_ack_probe(struct sock *sk) | |||
| 3347 | } | 3414 | } |
| 3348 | } | 3415 | } |
| 3349 | 3416 | ||
| 3350 | static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) | 3417 | static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) |
| 3351 | { | 3418 | { |
| 3352 | return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || | 3419 | return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || |
| 3353 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; | 3420 | inet_csk(sk)->icsk_ca_state != TCP_CA_Open; |
| 3354 | } | 3421 | } |
| 3355 | 3422 | ||
| 3356 | static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) | 3423 | static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) |
| 3357 | { | 3424 | { |
| 3358 | const struct tcp_sock *tp = tcp_sk(sk); | 3425 | const struct tcp_sock *tp = tcp_sk(sk); |
| 3359 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && | 3426 | return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && |
| 3360 | !tcp_in_cwnd_reduction(sk); | 3427 | !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); |
| 3361 | } | 3428 | } |
| 3362 | 3429 | ||
| 3363 | /* Check that window update is acceptable. | 3430 | /* Check that window update is acceptable. |
| 3364 | * The function assumes that snd_una<=ack<=snd_next. | 3431 | * The function assumes that snd_una<=ack<=snd_next. |
| 3365 | */ | 3432 | */ |
| 3366 | static inline bool tcp_may_update_window(const struct tcp_sock *tp, | 3433 | static inline int tcp_may_update_window(const struct tcp_sock *tp, |
| 3367 | const u32 ack, const u32 ack_seq, | 3434 | const u32 ack, const u32 ack_seq, |
| 3368 | const u32 nwin) | 3435 | const u32 nwin) |
| 3369 | { | 3436 | { |
| @@ -3377,7 +3444,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, | |||
| 3377 | * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 | 3444 | * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 |
| 3378 | * and in FreeBSD. NetBSD's one is even worse.) is wrong. | 3445 | * and in FreeBSD. NetBSD's one is even worse.) is wrong. |
| 3379 | */ | 3446 | */ |
| 3380 | static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, | 3447 | static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, |
| 3381 | u32 ack_seq) | 3448 | u32 ack_seq) |
| 3382 | { | 3449 | { |
| 3383 | struct tcp_sock *tp = tcp_sk(sk); | 3450 | struct tcp_sock *tp = tcp_sk(sk); |
| @@ -3425,9 +3492,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) | |||
| 3425 | } | 3492 | } |
| 3426 | 3493 | ||
| 3427 | /* A conservative spurious RTO response algorithm: reduce cwnd using | 3494 | /* A conservative spurious RTO response algorithm: reduce cwnd using |
| 3428 | * PRR and continue in congestion avoidance. | 3495 | * rate halving and continue in congestion avoidance. |
| 3429 | */ | 3496 | */ |
| 3430 | static void tcp_cwr_spur_to_response(struct sock *sk) | 3497 | static void tcp_ratehalving_spur_to_response(struct sock *sk) |
| 3431 | { | 3498 | { |
| 3432 | tcp_enter_cwr(sk, 0); | 3499 | tcp_enter_cwr(sk, 0); |
| 3433 | } | 3500 | } |
| @@ -3435,7 +3502,7 @@ static void tcp_cwr_spur_to_response(struct sock *sk) | |||
| 3435 | static void tcp_undo_spur_to_response(struct sock *sk, int flag) | 3502 | static void tcp_undo_spur_to_response(struct sock *sk, int flag) |
| 3436 | { | 3503 | { |
| 3437 | if (flag & FLAG_ECE) | 3504 | if (flag & FLAG_ECE) |
| 3438 | tcp_cwr_spur_to_response(sk); | 3505 | tcp_ratehalving_spur_to_response(sk); |
| 3439 | else | 3506 | else |
| 3440 | tcp_undo_cwr(sk, true); | 3507 | tcp_undo_cwr(sk, true); |
| 3441 | } | 3508 | } |
| @@ -3470,7 +3537,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) | |||
| 3470 | * to prove that the RTO is indeed spurious. It transfers the control | 3537 | * to prove that the RTO is indeed spurious. It transfers the control |
| 3471 | * from F-RTO to the conventional RTO recovery | 3538 | * from F-RTO to the conventional RTO recovery |
| 3472 | */ | 3539 | */ |
| 3473 | static bool tcp_process_frto(struct sock *sk, int flag) | 3540 | static int tcp_process_frto(struct sock *sk, int flag) |
| 3474 | { | 3541 | { |
| 3475 | struct tcp_sock *tp = tcp_sk(sk); | 3542 | struct tcp_sock *tp = tcp_sk(sk); |
| 3476 | 3543 | ||
| @@ -3486,7 +3553,7 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
| 3486 | 3553 | ||
| 3487 | if (!before(tp->snd_una, tp->frto_highmark)) { | 3554 | if (!before(tp->snd_una, tp->frto_highmark)) { |
| 3488 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); | 3555 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); |
| 3489 | return true; | 3556 | return 1; |
| 3490 | } | 3557 | } |
| 3491 | 3558 | ||
| 3492 | if (!tcp_is_sackfrto(tp)) { | 3559 | if (!tcp_is_sackfrto(tp)) { |
| @@ -3495,19 +3562,19 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
| 3495 | * data, winupdate | 3562 | * data, winupdate |
| 3496 | */ | 3563 | */ |
| 3497 | if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) | 3564 | if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) |
| 3498 | return true; | 3565 | return 1; |
| 3499 | 3566 | ||
| 3500 | if (!(flag & FLAG_DATA_ACKED)) { | 3567 | if (!(flag & FLAG_DATA_ACKED)) { |
| 3501 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), | 3568 | tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), |
| 3502 | flag); | 3569 | flag); |
| 3503 | return true; | 3570 | return 1; |
| 3504 | } | 3571 | } |
| 3505 | } else { | 3572 | } else { |
| 3506 | if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { | 3573 | if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { |
| 3507 | /* Prevent sending of new data. */ | 3574 | /* Prevent sending of new data. */ |
| 3508 | tp->snd_cwnd = min(tp->snd_cwnd, | 3575 | tp->snd_cwnd = min(tp->snd_cwnd, |
| 3509 | tcp_packets_in_flight(tp)); | 3576 | tcp_packets_in_flight(tp)); |
| 3510 | return true; | 3577 | return 1; |
| 3511 | } | 3578 | } |
| 3512 | 3579 | ||
| 3513 | if ((tp->frto_counter >= 2) && | 3580 | if ((tp->frto_counter >= 2) && |
| @@ -3517,10 +3584,10 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
| 3517 | /* RFC4138 shortcoming (see comment above) */ | 3584 | /* RFC4138 shortcoming (see comment above) */ |
| 3518 | if (!(flag & FLAG_FORWARD_PROGRESS) && | 3585 | if (!(flag & FLAG_FORWARD_PROGRESS) && |
| 3519 | (flag & FLAG_NOT_DUP)) | 3586 | (flag & FLAG_NOT_DUP)) |
| 3520 | return true; | 3587 | return 1; |
| 3521 | 3588 | ||
| 3522 | tcp_enter_frto_loss(sk, 3, flag); | 3589 | tcp_enter_frto_loss(sk, 3, flag); |
| 3523 | return true; | 3590 | return 1; |
| 3524 | } | 3591 | } |
| 3525 | } | 3592 | } |
| 3526 | 3593 | ||
| @@ -3532,7 +3599,7 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
| 3532 | if (!tcp_may_send_now(sk)) | 3599 | if (!tcp_may_send_now(sk)) |
| 3533 | tcp_enter_frto_loss(sk, 2, flag); | 3600 | tcp_enter_frto_loss(sk, 2, flag); |
| 3534 | 3601 | ||
| 3535 | return true; | 3602 | return 1; |
| 3536 | } else { | 3603 | } else { |
| 3537 | switch (sysctl_tcp_frto_response) { | 3604 | switch (sysctl_tcp_frto_response) { |
| 3538 | case 2: | 3605 | case 2: |
| @@ -3542,61 +3609,34 @@ static bool tcp_process_frto(struct sock *sk, int flag) | |||
| 3542 | tcp_conservative_spur_to_response(tp); | 3609 | tcp_conservative_spur_to_response(tp); |
| 3543 | break; | 3610 | break; |
| 3544 | default: | 3611 | default: |
| 3545 | tcp_cwr_spur_to_response(sk); | 3612 | tcp_ratehalving_spur_to_response(sk); |
| 3546 | break; | 3613 | break; |
| 3547 | } | 3614 | } |
| 3548 | tp->frto_counter = 0; | 3615 | tp->frto_counter = 0; |
| 3549 | tp->undo_marker = 0; | 3616 | tp->undo_marker = 0; |
| 3550 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); | 3617 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); |
| 3551 | } | 3618 | } |
| 3552 | return false; | 3619 | return 0; |
| 3553 | } | ||
| 3554 | |||
| 3555 | /* RFC 5961 7 [ACK Throttling] */ | ||
| 3556 | static void tcp_send_challenge_ack(struct sock *sk) | ||
| 3557 | { | ||
| 3558 | /* unprotected vars, we dont care of overwrites */ | ||
| 3559 | static u32 challenge_timestamp; | ||
| 3560 | static unsigned int challenge_count; | ||
| 3561 | u32 now = jiffies / HZ; | ||
| 3562 | |||
| 3563 | if (now != challenge_timestamp) { | ||
| 3564 | challenge_timestamp = now; | ||
| 3565 | challenge_count = 0; | ||
| 3566 | } | ||
| 3567 | if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { | ||
| 3568 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); | ||
| 3569 | tcp_send_ack(sk); | ||
| 3570 | } | ||
| 3571 | } | 3620 | } |
| 3572 | 3621 | ||
| 3573 | /* This routine deals with incoming acks, but not outgoing ones. */ | 3622 | /* This routine deals with incoming acks, but not outgoing ones. */ |
| 3574 | static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | 3623 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
| 3575 | { | 3624 | { |
| 3576 | struct inet_connection_sock *icsk = inet_csk(sk); | 3625 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 3577 | struct tcp_sock *tp = tcp_sk(sk); | 3626 | struct tcp_sock *tp = tcp_sk(sk); |
| 3578 | u32 prior_snd_una = tp->snd_una; | 3627 | u32 prior_snd_una = tp->snd_una; |
| 3579 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 3628 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
| 3580 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3629 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| 3581 | bool is_dupack = false; | ||
| 3582 | u32 prior_in_flight; | 3630 | u32 prior_in_flight; |
| 3583 | u32 prior_fackets; | 3631 | u32 prior_fackets; |
| 3584 | int prior_packets; | 3632 | int prior_packets; |
| 3585 | int prior_sacked = tp->sacked_out; | 3633 | int frto_cwnd = 0; |
| 3586 | int pkts_acked = 0; | ||
| 3587 | bool frto_cwnd = false; | ||
| 3588 | 3634 | ||
| 3589 | /* If the ack is older than previous acks | 3635 | /* If the ack is older than previous acks |
| 3590 | * then we can probably ignore it. | 3636 | * then we can probably ignore it. |
| 3591 | */ | 3637 | */ |
| 3592 | if (before(ack, prior_snd_una)) { | 3638 | if (before(ack, prior_snd_una)) |
| 3593 | /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ | ||
| 3594 | if (before(ack, prior_snd_una - tp->max_window)) { | ||
| 3595 | tcp_send_challenge_ack(sk); | ||
| 3596 | return -1; | ||
| 3597 | } | ||
| 3598 | goto old_ack; | 3639 | goto old_ack; |
| 3599 | } | ||
| 3600 | 3640 | ||
| 3601 | /* If the ack includes data we haven't sent yet, discard | 3641 | /* If the ack includes data we haven't sent yet, discard |
| 3602 | * this segment (RFC793 Section 3.9). | 3642 | * this segment (RFC793 Section 3.9). |
| @@ -3604,9 +3644,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
| 3604 | if (after(ack, tp->snd_nxt)) | 3644 | if (after(ack, tp->snd_nxt)) |
| 3605 | goto invalid_ack; | 3645 | goto invalid_ack; |
| 3606 | 3646 | ||
| 3607 | if (tp->early_retrans_delayed) | ||
| 3608 | tcp_rearm_rto(sk); | ||
| 3609 | |||
| 3610 | if (after(ack, prior_snd_una)) | 3647 | if (after(ack, prior_snd_una)) |
| 3611 | flag |= FLAG_SND_UNA_ADVANCED; | 3648 | flag |= FLAG_SND_UNA_ADVANCED; |
| 3612 | 3649 | ||
| @@ -3664,8 +3701,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
| 3664 | /* See if we can take anything off of the retransmit queue. */ | 3701 | /* See if we can take anything off of the retransmit queue. */ |
| 3665 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); | 3702 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); |
| 3666 | 3703 | ||
| 3667 | pkts_acked = prior_packets - tp->packets_out; | ||
| 3668 | |||
| 3669 | if (tp->frto_counter) | 3704 | if (tp->frto_counter) |
| 3670 | frto_cwnd = tcp_process_frto(sk, flag); | 3705 | frto_cwnd = tcp_process_frto(sk, flag); |
| 3671 | /* Guarantee sacktag reordering detection against wrap-arounds */ | 3706 | /* Guarantee sacktag reordering detection against wrap-arounds */ |
| @@ -3677,26 +3712,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
| 3677 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && | 3712 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && |
| 3678 | tcp_may_raise_cwnd(sk, flag)) | 3713 | tcp_may_raise_cwnd(sk, flag)) |
| 3679 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3714 | tcp_cong_avoid(sk, ack, prior_in_flight); |
| 3680 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | 3715 | tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, |
| 3681 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, | 3716 | flag); |
| 3682 | is_dupack, flag); | ||
| 3683 | } else { | 3717 | } else { |
| 3684 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) | 3718 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) |
| 3685 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3719 | tcp_cong_avoid(sk, ack, prior_in_flight); |
| 3686 | } | 3720 | } |
| 3687 | 3721 | ||
| 3688 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { | 3722 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) |
| 3689 | struct dst_entry *dst = __sk_dst_get(sk); | 3723 | dst_confirm(__sk_dst_get(sk)); |
| 3690 | if (dst) | 3724 | |
| 3691 | dst_confirm(dst); | ||
| 3692 | } | ||
| 3693 | return 1; | 3725 | return 1; |
| 3694 | 3726 | ||
| 3695 | no_queue: | 3727 | no_queue: |
| 3696 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ | ||
| 3697 | if (flag & FLAG_DSACKING_ACK) | ||
| 3698 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, | ||
| 3699 | is_dupack, flag); | ||
| 3700 | /* If this ack opens up a zero window, clear backoff. It was | 3728 | /* If this ack opens up a zero window, clear backoff. It was |
| 3701 | * being used to time the probes, and is probably far higher than | 3729 | * being used to time the probes, and is probably far higher than |
| 3702 | * it needs to be for normal retransmission. | 3730 | * it needs to be for normal retransmission. |
| @@ -3710,13 +3738,10 @@ invalid_ack: | |||
| 3710 | return -1; | 3738 | return -1; |
| 3711 | 3739 | ||
| 3712 | old_ack: | 3740 | old_ack: |
| 3713 | /* If data was SACKed, tag it and see if we should send more data. | ||
| 3714 | * If data was DSACKed, see if we can undo a cwnd reduction. | ||
| 3715 | */ | ||
| 3716 | if (TCP_SKB_CB(skb)->sacked) { | 3741 | if (TCP_SKB_CB(skb)->sacked) { |
| 3717 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); | 3742 | tcp_sacktag_write_queue(sk, skb, prior_snd_una); |
| 3718 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, | 3743 | if (icsk->icsk_ca_state == TCP_CA_Open) |
| 3719 | is_dupack, flag); | 3744 | tcp_try_keep_open(sk); |
| 3720 | } | 3745 | } |
| 3721 | 3746 | ||
| 3722 | SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); | 3747 | SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
| @@ -3727,15 +3752,14 @@ old_ack: | |||
| 3727 | * But, this can also be called on packets in the established flow when | 3752 | * But, this can also be called on packets in the established flow when |
| 3728 | * the fast version below fails. | 3753 | * the fast version below fails. |
| 3729 | */ | 3754 | */ |
| 3730 | void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, | 3755 | void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, |
| 3731 | const u8 **hvpp, int estab, | 3756 | u8 **hvpp, int estab) |
| 3732 | struct tcp_fastopen_cookie *foc) | ||
| 3733 | { | 3757 | { |
| 3734 | const unsigned char *ptr; | 3758 | unsigned char *ptr; |
| 3735 | const struct tcphdr *th = tcp_hdr(skb); | 3759 | struct tcphdr *th = tcp_hdr(skb); |
| 3736 | int length = (th->doff * 4) - sizeof(struct tcphdr); | 3760 | int length = (th->doff * 4) - sizeof(struct tcphdr); |
| 3737 | 3761 | ||
| 3738 | ptr = (const unsigned char *)(th + 1); | 3762 | ptr = (unsigned char *)(th + 1); |
| 3739 | opt_rx->saw_tstamp = 0; | 3763 | opt_rx->saw_tstamp = 0; |
| 3740 | 3764 | ||
| 3741 | while (length > 0) { | 3765 | while (length > 0) { |
| @@ -3772,9 +3796,10 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
| 3772 | __u8 snd_wscale = *(__u8 *)ptr; | 3796 | __u8 snd_wscale = *(__u8 *)ptr; |
| 3773 | opt_rx->wscale_ok = 1; | 3797 | opt_rx->wscale_ok = 1; |
| 3774 | if (snd_wscale > 14) { | 3798 | if (snd_wscale > 14) { |
| 3775 | net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", | 3799 | if (net_ratelimit()) |
| 3776 | __func__, | 3800 | printk(KERN_INFO "tcp_parse_options: Illegal window " |
| 3777 | snd_wscale); | 3801 | "scaling value %d >14 received.\n", |
| 3802 | snd_wscale); | ||
| 3778 | snd_wscale = 14; | 3803 | snd_wscale = 14; |
| 3779 | } | 3804 | } |
| 3780 | opt_rx->snd_wscale = snd_wscale; | 3805 | opt_rx->snd_wscale = snd_wscale; |
| @@ -3792,7 +3817,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
| 3792 | case TCPOPT_SACK_PERM: | 3817 | case TCPOPT_SACK_PERM: |
| 3793 | if (opsize == TCPOLEN_SACK_PERM && th->syn && | 3818 | if (opsize == TCPOLEN_SACK_PERM && th->syn && |
| 3794 | !estab && sysctl_tcp_sack) { | 3819 | !estab && sysctl_tcp_sack) { |
| 3795 | opt_rx->sack_ok = TCP_SACK_SEEN; | 3820 | opt_rx->sack_ok = 1; |
| 3796 | tcp_sack_reset(opt_rx); | 3821 | tcp_sack_reset(opt_rx); |
| 3797 | } | 3822 | } |
| 3798 | break; | 3823 | break; |
| @@ -3836,25 +3861,8 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
| 3836 | break; | 3861 | break; |
| 3837 | } | 3862 | } |
| 3838 | break; | 3863 | break; |
| 3839 | |||
| 3840 | case TCPOPT_EXP: | ||
| 3841 | /* Fast Open option shares code 254 using a | ||
| 3842 | * 16 bits magic number. It's valid only in | ||
| 3843 | * SYN or SYN-ACK with an even size. | ||
| 3844 | */ | ||
| 3845 | if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || | ||
| 3846 | get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || | ||
| 3847 | foc == NULL || !th->syn || (opsize & 1)) | ||
| 3848 | break; | ||
| 3849 | foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; | ||
| 3850 | if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && | ||
| 3851 | foc->len <= TCP_FASTOPEN_COOKIE_MAX) | ||
| 3852 | memcpy(foc->val, ptr + 2, foc->len); | ||
| 3853 | else if (foc->len != 0) | ||
| 3854 | foc->len = -1; | ||
| 3855 | break; | ||
| 3856 | |||
| 3857 | } | 3864 | } |
| 3865 | |||
| 3858 | ptr += opsize-2; | 3866 | ptr += opsize-2; |
| 3859 | length -= opsize; | 3867 | length -= opsize; |
| 3860 | } | 3868 | } |
| @@ -3862,9 +3870,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o | |||
| 3862 | } | 3870 | } |
| 3863 | EXPORT_SYMBOL(tcp_parse_options); | 3871 | EXPORT_SYMBOL(tcp_parse_options); |
| 3864 | 3872 | ||
| 3865 | static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) | 3873 | static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) |
| 3866 | { | 3874 | { |
| 3867 | const __be32 *ptr = (const __be32 *)(th + 1); | 3875 | __be32 *ptr = (__be32 *)(th + 1); |
| 3868 | 3876 | ||
| 3869 | if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 3877 | if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
| 3870 | | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { | 3878 | | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { |
| @@ -3873,41 +3881,40 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr | |||
| 3873 | tp->rx_opt.rcv_tsval = ntohl(*ptr); | 3881 | tp->rx_opt.rcv_tsval = ntohl(*ptr); |
| 3874 | ++ptr; | 3882 | ++ptr; |
| 3875 | tp->rx_opt.rcv_tsecr = ntohl(*ptr); | 3883 | tp->rx_opt.rcv_tsecr = ntohl(*ptr); |
| 3876 | return true; | 3884 | return 1; |
| 3877 | } | 3885 | } |
| 3878 | return false; | 3886 | return 0; |
| 3879 | } | 3887 | } |
| 3880 | 3888 | ||
| 3881 | /* Fast parse options. This hopes to only see timestamps. | 3889 | /* Fast parse options. This hopes to only see timestamps. |
| 3882 | * If it is wrong it falls back on tcp_parse_options(). | 3890 | * If it is wrong it falls back on tcp_parse_options(). |
| 3883 | */ | 3891 | */ |
| 3884 | static bool tcp_fast_parse_options(const struct sk_buff *skb, | 3892 | static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, |
| 3885 | const struct tcphdr *th, | 3893 | struct tcp_sock *tp, u8 **hvpp) |
| 3886 | struct tcp_sock *tp, const u8 **hvpp) | ||
| 3887 | { | 3894 | { |
| 3888 | /* In the spirit of fast parsing, compare doff directly to constant | 3895 | /* In the spirit of fast parsing, compare doff directly to constant |
| 3889 | * values. Because equality is used, short doff can be ignored here. | 3896 | * values. Because equality is used, short doff can be ignored here. |
| 3890 | */ | 3897 | */ |
| 3891 | if (th->doff == (sizeof(*th) / 4)) { | 3898 | if (th->doff == (sizeof(*th) / 4)) { |
| 3892 | tp->rx_opt.saw_tstamp = 0; | 3899 | tp->rx_opt.saw_tstamp = 0; |
| 3893 | return false; | 3900 | return 0; |
| 3894 | } else if (tp->rx_opt.tstamp_ok && | 3901 | } else if (tp->rx_opt.tstamp_ok && |
| 3895 | th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { | 3902 | th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { |
| 3896 | if (tcp_parse_aligned_timestamp(tp, th)) | 3903 | if (tcp_parse_aligned_timestamp(tp, th)) |
| 3897 | return true; | 3904 | return 1; |
| 3898 | } | 3905 | } |
| 3899 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); | 3906 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); |
| 3900 | return true; | 3907 | return 1; |
| 3901 | } | 3908 | } |
| 3902 | 3909 | ||
| 3903 | #ifdef CONFIG_TCP_MD5SIG | 3910 | #ifdef CONFIG_TCP_MD5SIG |
| 3904 | /* | 3911 | /* |
| 3905 | * Parse MD5 Signature option | 3912 | * Parse MD5 Signature option |
| 3906 | */ | 3913 | */ |
| 3907 | const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) | 3914 | u8 *tcp_parse_md5sig_option(struct tcphdr *th) |
| 3908 | { | 3915 | { |
| 3909 | int length = (th->doff << 2) - sizeof(*th); | 3916 | int length = (th->doff << 2) - sizeof (*th); |
| 3910 | const u8 *ptr = (const u8 *)(th + 1); | 3917 | u8 *ptr = (u8*)(th + 1); |
| 3911 | 3918 | ||
| 3912 | /* If the TCP option is too short, we can short cut */ | 3919 | /* If the TCP option is too short, we can short cut */ |
| 3913 | if (length < TCPOLEN_MD5SIG) | 3920 | if (length < TCPOLEN_MD5SIG) |
| @@ -3984,8 +3991,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) | |||
| 3984 | 3991 | ||
| 3985 | static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) | 3992 | static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) |
| 3986 | { | 3993 | { |
| 3987 | const struct tcp_sock *tp = tcp_sk(sk); | 3994 | struct tcp_sock *tp = tcp_sk(sk); |
| 3988 | const struct tcphdr *th = tcp_hdr(skb); | 3995 | struct tcphdr *th = tcp_hdr(skb); |
| 3989 | u32 seq = TCP_SKB_CB(skb)->seq; | 3996 | u32 seq = TCP_SKB_CB(skb)->seq; |
| 3990 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3997 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| 3991 | 3998 | ||
| @@ -4002,7 +4009,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) | |||
| 4002 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); | 4009 | (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); |
| 4003 | } | 4010 | } |
| 4004 | 4011 | ||
| 4005 | static inline bool tcp_paws_discard(const struct sock *sk, | 4012 | static inline int tcp_paws_discard(const struct sock *sk, |
| 4006 | const struct sk_buff *skb) | 4013 | const struct sk_buff *skb) |
| 4007 | { | 4014 | { |
| 4008 | const struct tcp_sock *tp = tcp_sk(sk); | 4015 | const struct tcp_sock *tp = tcp_sk(sk); |
| @@ -4024,14 +4031,14 @@ static inline bool tcp_paws_discard(const struct sock *sk, | |||
| 4024 | * (borrowed from freebsd) | 4031 | * (borrowed from freebsd) |
| 4025 | */ | 4032 | */ |
| 4026 | 4033 | ||
| 4027 | static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) | 4034 | static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq) |
| 4028 | { | 4035 | { |
| 4029 | return !before(end_seq, tp->rcv_wup) && | 4036 | return !before(end_seq, tp->rcv_wup) && |
| 4030 | !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); | 4037 | !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); |
| 4031 | } | 4038 | } |
| 4032 | 4039 | ||
| 4033 | /* When we get a reset we do this. */ | 4040 | /* When we get a reset we do this. */ |
| 4034 | void tcp_reset(struct sock *sk) | 4041 | static void tcp_reset(struct sock *sk) |
| 4035 | { | 4042 | { |
| 4036 | /* We want the right error as BSD sees it (and indeed as we do). */ | 4043 | /* We want the right error as BSD sees it (and indeed as we do). */ |
| 4037 | switch (sk->sk_state) { | 4044 | switch (sk->sk_state) { |
| @@ -4069,7 +4076,7 @@ void tcp_reset(struct sock *sk) | |||
| 4069 | * | 4076 | * |
| 4070 | * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. | 4077 | * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. |
| 4071 | */ | 4078 | */ |
| 4072 | static void tcp_fin(struct sock *sk) | 4079 | static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) |
| 4073 | { | 4080 | { |
| 4074 | struct tcp_sock *tp = tcp_sk(sk); | 4081 | struct tcp_sock *tp = tcp_sk(sk); |
| 4075 | 4082 | ||
| @@ -4113,7 +4120,7 @@ static void tcp_fin(struct sock *sk) | |||
| 4113 | /* Only TCP_LISTEN and TCP_CLOSE are left, in these | 4120 | /* Only TCP_LISTEN and TCP_CLOSE are left, in these |
| 4114 | * cases we should never reach this piece of code. | 4121 | * cases we should never reach this piece of code. |
| 4115 | */ | 4122 | */ |
| 4116 | pr_err("%s: Impossible, sk->sk_state=%d\n", | 4123 | printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", |
| 4117 | __func__, sk->sk_state); | 4124 | __func__, sk->sk_state); |
| 4118 | break; | 4125 | break; |
| 4119 | } | 4126 | } |
| @@ -4138,7 +4145,7 @@ static void tcp_fin(struct sock *sk) | |||
| 4138 | } | 4145 | } |
| 4139 | } | 4146 | } |
| 4140 | 4147 | ||
| 4141 | static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, | 4148 | static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, |
| 4142 | u32 end_seq) | 4149 | u32 end_seq) |
| 4143 | { | 4150 | { |
| 4144 | if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { | 4151 | if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { |
| @@ -4146,9 +4153,9 @@ static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, | |||
| 4146 | sp->start_seq = seq; | 4153 | sp->start_seq = seq; |
| 4147 | if (after(end_seq, sp->end_seq)) | 4154 | if (after(end_seq, sp->end_seq)) |
| 4148 | sp->end_seq = end_seq; | 4155 | sp->end_seq = end_seq; |
| 4149 | return true; | 4156 | return 1; |
| 4150 | } | 4157 | } |
| 4151 | return false; | 4158 | return 0; |
| 4152 | } | 4159 | } |
| 4153 | 4160 | ||
| 4154 | static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) | 4161 | static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) |
| @@ -4181,7 +4188,7 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) | |||
| 4181 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); | 4188 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); |
| 4182 | } | 4189 | } |
| 4183 | 4190 | ||
| 4184 | static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) | 4191 | static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) |
| 4185 | { | 4192 | { |
| 4186 | struct tcp_sock *tp = tcp_sk(sk); | 4193 | struct tcp_sock *tp = tcp_sk(sk); |
| 4187 | 4194 | ||
| @@ -4340,258 +4347,37 @@ static void tcp_ofo_queue(struct sock *sk) | |||
| 4340 | __skb_queue_tail(&sk->sk_receive_queue, skb); | 4347 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
| 4341 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4348 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
| 4342 | if (tcp_hdr(skb)->fin) | 4349 | if (tcp_hdr(skb)->fin) |
| 4343 | tcp_fin(sk); | 4350 | tcp_fin(skb, sk, tcp_hdr(skb)); |
| 4344 | } | 4351 | } |
| 4345 | } | 4352 | } |
| 4346 | 4353 | ||
| 4347 | static bool tcp_prune_ofo_queue(struct sock *sk); | 4354 | static int tcp_prune_ofo_queue(struct sock *sk); |
| 4348 | static int tcp_prune_queue(struct sock *sk); | 4355 | static int tcp_prune_queue(struct sock *sk); |
| 4349 | 4356 | ||
| 4350 | static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, | 4357 | static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) |
| 4351 | unsigned int size) | ||
| 4352 | { | 4358 | { |
| 4353 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || | 4359 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || |
| 4354 | !sk_rmem_schedule(sk, skb, size)) { | 4360 | !sk_rmem_schedule(sk, size)) { |
| 4355 | 4361 | ||
| 4356 | if (tcp_prune_queue(sk) < 0) | 4362 | if (tcp_prune_queue(sk) < 0) |
| 4357 | return -1; | 4363 | return -1; |
| 4358 | 4364 | ||
| 4359 | if (!sk_rmem_schedule(sk, skb, size)) { | 4365 | if (!sk_rmem_schedule(sk, size)) { |
| 4360 | if (!tcp_prune_ofo_queue(sk)) | 4366 | if (!tcp_prune_ofo_queue(sk)) |
| 4361 | return -1; | 4367 | return -1; |
| 4362 | 4368 | ||
| 4363 | if (!sk_rmem_schedule(sk, skb, size)) | 4369 | if (!sk_rmem_schedule(sk, size)) |
| 4364 | return -1; | 4370 | return -1; |
| 4365 | } | 4371 | } |
| 4366 | } | 4372 | } |
| 4367 | return 0; | 4373 | return 0; |
| 4368 | } | 4374 | } |
| 4369 | 4375 | ||
| 4370 | /** | ||
| 4371 | * tcp_try_coalesce - try to merge skb to prior one | ||
| 4372 | * @sk: socket | ||
| 4373 | * @to: prior buffer | ||
| 4374 | * @from: buffer to add in queue | ||
| 4375 | * @fragstolen: pointer to boolean | ||
| 4376 | * | ||
| 4377 | * Before queueing skb @from after @to, try to merge them | ||
| 4378 | * to reduce overall memory use and queue lengths, if cost is small. | ||
| 4379 | * Packets in ofo or receive queues can stay a long time. | ||
| 4380 | * Better try to coalesce them right now to avoid future collapses. | ||
| 4381 | * Returns true if caller should free @from instead of queueing it | ||
| 4382 | */ | ||
| 4383 | static bool tcp_try_coalesce(struct sock *sk, | ||
| 4384 | struct sk_buff *to, | ||
| 4385 | struct sk_buff *from, | ||
| 4386 | bool *fragstolen) | ||
| 4387 | { | ||
| 4388 | int delta; | ||
| 4389 | |||
| 4390 | *fragstolen = false; | ||
| 4391 | |||
| 4392 | if (tcp_hdr(from)->fin) | ||
| 4393 | return false; | ||
| 4394 | |||
| 4395 | /* Its possible this segment overlaps with prior segment in queue */ | ||
| 4396 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) | ||
| 4397 | return false; | ||
| 4398 | |||
| 4399 | if (!skb_try_coalesce(to, from, fragstolen, &delta)) | ||
| 4400 | return false; | ||
| 4401 | |||
| 4402 | atomic_add(delta, &sk->sk_rmem_alloc); | ||
| 4403 | sk_mem_charge(sk, delta); | ||
| 4404 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); | ||
| 4405 | TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; | ||
| 4406 | TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; | ||
| 4407 | return true; | ||
| 4408 | } | ||
| 4409 | |||
| 4410 | static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) | ||
| 4411 | { | ||
| 4412 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 4413 | struct sk_buff *skb1; | ||
| 4414 | u32 seq, end_seq; | ||
| 4415 | |||
| 4416 | TCP_ECN_check_ce(tp, skb); | ||
| 4417 | |||
| 4418 | if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { | ||
| 4419 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); | ||
| 4420 | __kfree_skb(skb); | ||
| 4421 | return; | ||
| 4422 | } | ||
| 4423 | |||
| 4424 | /* Disable header prediction. */ | ||
| 4425 | tp->pred_flags = 0; | ||
| 4426 | inet_csk_schedule_ack(sk); | ||
| 4427 | |||
| 4428 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); | ||
| 4429 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", | ||
| 4430 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | ||
| 4431 | |||
| 4432 | skb1 = skb_peek_tail(&tp->out_of_order_queue); | ||
| 4433 | if (!skb1) { | ||
| 4434 | /* Initial out of order segment, build 1 SACK. */ | ||
| 4435 | if (tcp_is_sack(tp)) { | ||
| 4436 | tp->rx_opt.num_sacks = 1; | ||
| 4437 | tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; | ||
| 4438 | tp->selective_acks[0].end_seq = | ||
| 4439 | TCP_SKB_CB(skb)->end_seq; | ||
| 4440 | } | ||
| 4441 | __skb_queue_head(&tp->out_of_order_queue, skb); | ||
| 4442 | goto end; | ||
| 4443 | } | ||
| 4444 | |||
| 4445 | seq = TCP_SKB_CB(skb)->seq; | ||
| 4446 | end_seq = TCP_SKB_CB(skb)->end_seq; | ||
| 4447 | |||
| 4448 | if (seq == TCP_SKB_CB(skb1)->end_seq) { | ||
| 4449 | bool fragstolen; | ||
| 4450 | |||
| 4451 | if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { | ||
| 4452 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); | ||
| 4453 | } else { | ||
| 4454 | kfree_skb_partial(skb, fragstolen); | ||
| 4455 | skb = NULL; | ||
| 4456 | } | ||
| 4457 | |||
| 4458 | if (!tp->rx_opt.num_sacks || | ||
| 4459 | tp->selective_acks[0].end_seq != seq) | ||
| 4460 | goto add_sack; | ||
| 4461 | |||
| 4462 | /* Common case: data arrive in order after hole. */ | ||
| 4463 | tp->selective_acks[0].end_seq = end_seq; | ||
| 4464 | goto end; | ||
| 4465 | } | ||
| 4466 | |||
| 4467 | /* Find place to insert this segment. */ | ||
| 4468 | while (1) { | ||
| 4469 | if (!after(TCP_SKB_CB(skb1)->seq, seq)) | ||
| 4470 | break; | ||
| 4471 | if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { | ||
| 4472 | skb1 = NULL; | ||
| 4473 | break; | ||
| 4474 | } | ||
| 4475 | skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); | ||
| 4476 | } | ||
| 4477 | |||
| 4478 | /* Do skb overlap to previous one? */ | ||
| 4479 | if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
| 4480 | if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
| 4481 | /* All the bits are present. Drop. */ | ||
| 4482 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); | ||
| 4483 | __kfree_skb(skb); | ||
| 4484 | skb = NULL; | ||
| 4485 | tcp_dsack_set(sk, seq, end_seq); | ||
| 4486 | goto add_sack; | ||
| 4487 | } | ||
| 4488 | if (after(seq, TCP_SKB_CB(skb1)->seq)) { | ||
| 4489 | /* Partial overlap. */ | ||
| 4490 | tcp_dsack_set(sk, seq, | ||
| 4491 | TCP_SKB_CB(skb1)->end_seq); | ||
| 4492 | } else { | ||
| 4493 | if (skb_queue_is_first(&tp->out_of_order_queue, | ||
| 4494 | skb1)) | ||
| 4495 | skb1 = NULL; | ||
| 4496 | else | ||
| 4497 | skb1 = skb_queue_prev( | ||
| 4498 | &tp->out_of_order_queue, | ||
| 4499 | skb1); | ||
| 4500 | } | ||
| 4501 | } | ||
| 4502 | if (!skb1) | ||
| 4503 | __skb_queue_head(&tp->out_of_order_queue, skb); | ||
| 4504 | else | ||
| 4505 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); | ||
| 4506 | |||
| 4507 | /* And clean segments covered by new one as whole. */ | ||
| 4508 | while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { | ||
| 4509 | skb1 = skb_queue_next(&tp->out_of_order_queue, skb); | ||
| 4510 | |||
| 4511 | if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) | ||
| 4512 | break; | ||
| 4513 | if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
| 4514 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | ||
| 4515 | end_seq); | ||
| 4516 | break; | ||
| 4517 | } | ||
| 4518 | __skb_unlink(skb1, &tp->out_of_order_queue); | ||
| 4519 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | ||
| 4520 | TCP_SKB_CB(skb1)->end_seq); | ||
| 4521 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); | ||
| 4522 | __kfree_skb(skb1); | ||
| 4523 | } | ||
| 4524 | |||
| 4525 | add_sack: | ||
| 4526 | if (tcp_is_sack(tp)) | ||
| 4527 | tcp_sack_new_ofo_skb(sk, seq, end_seq); | ||
| 4528 | end: | ||
| 4529 | if (skb) | ||
| 4530 | skb_set_owner_r(skb, sk); | ||
| 4531 | } | ||
| 4532 | |||
| 4533 | static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, | ||
| 4534 | bool *fragstolen) | ||
| 4535 | { | ||
| 4536 | int eaten; | ||
| 4537 | struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); | ||
| 4538 | |||
| 4539 | __skb_pull(skb, hdrlen); | ||
| 4540 | eaten = (tail && | ||
| 4541 | tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; | ||
| 4542 | tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
| 4543 | if (!eaten) { | ||
| 4544 | __skb_queue_tail(&sk->sk_receive_queue, skb); | ||
| 4545 | skb_set_owner_r(skb, sk); | ||
| 4546 | } | ||
| 4547 | return eaten; | ||
| 4548 | } | ||
| 4549 | |||
| 4550 | int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) | ||
| 4551 | { | ||
| 4552 | struct sk_buff *skb = NULL; | ||
| 4553 | struct tcphdr *th; | ||
| 4554 | bool fragstolen; | ||
| 4555 | |||
| 4556 | if (size == 0) | ||
| 4557 | return 0; | ||
| 4558 | |||
| 4559 | skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); | ||
| 4560 | if (!skb) | ||
| 4561 | goto err; | ||
| 4562 | |||
| 4563 | if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) | ||
| 4564 | goto err_free; | ||
| 4565 | |||
| 4566 | th = (struct tcphdr *)skb_put(skb, sizeof(*th)); | ||
| 4567 | skb_reset_transport_header(skb); | ||
| 4568 | memset(th, 0, sizeof(*th)); | ||
| 4569 | |||
| 4570 | if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) | ||
| 4571 | goto err_free; | ||
| 4572 | |||
| 4573 | TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; | ||
| 4574 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; | ||
| 4575 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; | ||
| 4576 | |||
| 4577 | if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { | ||
| 4578 | WARN_ON_ONCE(fragstolen); /* should not happen */ | ||
| 4579 | __kfree_skb(skb); | ||
| 4580 | } | ||
| 4581 | return size; | ||
| 4582 | |||
| 4583 | err_free: | ||
| 4584 | kfree_skb(skb); | ||
| 4585 | err: | ||
| 4586 | return -ENOMEM; | ||
| 4587 | } | ||
| 4588 | |||
| 4589 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | 4376 | static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) |
| 4590 | { | 4377 | { |
| 4591 | const struct tcphdr *th = tcp_hdr(skb); | 4378 | struct tcphdr *th = tcp_hdr(skb); |
| 4592 | struct tcp_sock *tp = tcp_sk(sk); | 4379 | struct tcp_sock *tp = tcp_sk(sk); |
| 4593 | int eaten = -1; | 4380 | int eaten = -1; |
| 4594 | bool fragstolen = false; | ||
| 4595 | 4381 | ||
| 4596 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) | 4382 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) |
| 4597 | goto drop; | 4383 | goto drop; |
| @@ -4633,16 +4419,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | |||
| 4633 | if (eaten <= 0) { | 4419 | if (eaten <= 0) { |
| 4634 | queue_and_out: | 4420 | queue_and_out: |
| 4635 | if (eaten < 0 && | 4421 | if (eaten < 0 && |
| 4636 | tcp_try_rmem_schedule(sk, skb, skb->truesize)) | 4422 | tcp_try_rmem_schedule(sk, skb->truesize)) |
| 4637 | goto drop; | 4423 | goto drop; |
| 4638 | 4424 | ||
| 4639 | eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); | 4425 | skb_set_owner_r(skb, sk); |
| 4426 | __skb_queue_tail(&sk->sk_receive_queue, skb); | ||
| 4640 | } | 4427 | } |
| 4641 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4428 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
| 4642 | if (skb->len) | 4429 | if (skb->len) |
| 4643 | tcp_event_data_recv(sk, skb); | 4430 | tcp_event_data_recv(sk, skb); |
| 4644 | if (th->fin) | 4431 | if (th->fin) |
| 4645 | tcp_fin(sk); | 4432 | tcp_fin(skb, sk, th); |
| 4646 | 4433 | ||
| 4647 | if (!skb_queue_empty(&tp->out_of_order_queue)) { | 4434 | if (!skb_queue_empty(&tp->out_of_order_queue)) { |
| 4648 | tcp_ofo_queue(sk); | 4435 | tcp_ofo_queue(sk); |
| @@ -4660,8 +4447,8 @@ queue_and_out: | |||
| 4660 | tcp_fast_path_check(sk); | 4447 | tcp_fast_path_check(sk); |
| 4661 | 4448 | ||
| 4662 | if (eaten > 0) | 4449 | if (eaten > 0) |
| 4663 | kfree_skb_partial(skb, fragstolen); | 4450 | __kfree_skb(skb); |
| 4664 | if (!sock_flag(sk, SOCK_DEAD)) | 4451 | else if (!sock_flag(sk, SOCK_DEAD)) |
| 4665 | sk->sk_data_ready(sk, 0); | 4452 | sk->sk_data_ready(sk, 0); |
| 4666 | return; | 4453 | return; |
| 4667 | } | 4454 | } |
| @@ -4701,7 +4488,105 @@ drop: | |||
| 4701 | goto queue_and_out; | 4488 | goto queue_and_out; |
| 4702 | } | 4489 | } |
| 4703 | 4490 | ||
| 4704 | tcp_data_queue_ofo(sk, skb); | 4491 | TCP_ECN_check_ce(tp, skb); |
| 4492 | |||
| 4493 | if (tcp_try_rmem_schedule(sk, skb->truesize)) | ||
| 4494 | goto drop; | ||
| 4495 | |||
| 4496 | /* Disable header prediction. */ | ||
| 4497 | tp->pred_flags = 0; | ||
| 4498 | inet_csk_schedule_ack(sk); | ||
| 4499 | |||
| 4500 | SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", | ||
| 4501 | tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); | ||
| 4502 | |||
| 4503 | skb_set_owner_r(skb, sk); | ||
| 4504 | |||
| 4505 | if (!skb_peek(&tp->out_of_order_queue)) { | ||
| 4506 | /* Initial out of order segment, build 1 SACK. */ | ||
| 4507 | if (tcp_is_sack(tp)) { | ||
| 4508 | tp->rx_opt.num_sacks = 1; | ||
| 4509 | tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; | ||
| 4510 | tp->selective_acks[0].end_seq = | ||
| 4511 | TCP_SKB_CB(skb)->end_seq; | ||
| 4512 | } | ||
| 4513 | __skb_queue_head(&tp->out_of_order_queue, skb); | ||
| 4514 | } else { | ||
| 4515 | struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue); | ||
| 4516 | u32 seq = TCP_SKB_CB(skb)->seq; | ||
| 4517 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; | ||
| 4518 | |||
| 4519 | if (seq == TCP_SKB_CB(skb1)->end_seq) { | ||
| 4520 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); | ||
| 4521 | |||
| 4522 | if (!tp->rx_opt.num_sacks || | ||
| 4523 | tp->selective_acks[0].end_seq != seq) | ||
| 4524 | goto add_sack; | ||
| 4525 | |||
| 4526 | /* Common case: data arrive in order after hole. */ | ||
| 4527 | tp->selective_acks[0].end_seq = end_seq; | ||
| 4528 | return; | ||
| 4529 | } | ||
| 4530 | |||
| 4531 | /* Find place to insert this segment. */ | ||
| 4532 | while (1) { | ||
| 4533 | if (!after(TCP_SKB_CB(skb1)->seq, seq)) | ||
| 4534 | break; | ||
| 4535 | if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { | ||
| 4536 | skb1 = NULL; | ||
| 4537 | break; | ||
| 4538 | } | ||
| 4539 | skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); | ||
| 4540 | } | ||
| 4541 | |||
| 4542 | /* Do skb overlap to previous one? */ | ||
| 4543 | if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
| 4544 | if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
| 4545 | /* All the bits are present. Drop. */ | ||
| 4546 | __kfree_skb(skb); | ||
| 4547 | tcp_dsack_set(sk, seq, end_seq); | ||
| 4548 | goto add_sack; | ||
| 4549 | } | ||
| 4550 | if (after(seq, TCP_SKB_CB(skb1)->seq)) { | ||
| 4551 | /* Partial overlap. */ | ||
| 4552 | tcp_dsack_set(sk, seq, | ||
| 4553 | TCP_SKB_CB(skb1)->end_seq); | ||
| 4554 | } else { | ||
| 4555 | if (skb_queue_is_first(&tp->out_of_order_queue, | ||
| 4556 | skb1)) | ||
| 4557 | skb1 = NULL; | ||
| 4558 | else | ||
| 4559 | skb1 = skb_queue_prev( | ||
| 4560 | &tp->out_of_order_queue, | ||
| 4561 | skb1); | ||
| 4562 | } | ||
| 4563 | } | ||
| 4564 | if (!skb1) | ||
| 4565 | __skb_queue_head(&tp->out_of_order_queue, skb); | ||
| 4566 | else | ||
| 4567 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); | ||
| 4568 | |||
| 4569 | /* And clean segments covered by new one as whole. */ | ||
| 4570 | while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { | ||
| 4571 | skb1 = skb_queue_next(&tp->out_of_order_queue, skb); | ||
| 4572 | |||
| 4573 | if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) | ||
| 4574 | break; | ||
| 4575 | if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { | ||
| 4576 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | ||
| 4577 | end_seq); | ||
| 4578 | break; | ||
| 4579 | } | ||
| 4580 | __skb_unlink(skb1, &tp->out_of_order_queue); | ||
| 4581 | tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, | ||
| 4582 | TCP_SKB_CB(skb1)->end_seq); | ||
| 4583 | __kfree_skb(skb1); | ||
| 4584 | } | ||
| 4585 | |||
| 4586 | add_sack: | ||
| 4587 | if (tcp_is_sack(tp)) | ||
| 4588 | tcp_sack_new_ofo_skb(sk, seq, end_seq); | ||
| 4589 | } | ||
| 4705 | } | 4590 | } |
| 4706 | 4591 | ||
| 4707 | static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, | 4592 | static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, |
| @@ -4880,10 +4765,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk) | |||
| 4880 | * Purge the out-of-order queue. | 4765 | * Purge the out-of-order queue. |
| 4881 | * Return true if queue was pruned. | 4766 | * Return true if queue was pruned. |
| 4882 | */ | 4767 | */ |
| 4883 | static bool tcp_prune_ofo_queue(struct sock *sk) | 4768 | static int tcp_prune_ofo_queue(struct sock *sk) |
| 4884 | { | 4769 | { |
| 4885 | struct tcp_sock *tp = tcp_sk(sk); | 4770 | struct tcp_sock *tp = tcp_sk(sk); |
| 4886 | bool res = false; | 4771 | int res = 0; |
| 4887 | 4772 | ||
| 4888 | if (!skb_queue_empty(&tp->out_of_order_queue)) { | 4773 | if (!skb_queue_empty(&tp->out_of_order_queue)) { |
| 4889 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); | 4774 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); |
| @@ -4897,7 +4782,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk) | |||
| 4897 | if (tp->rx_opt.sack_ok) | 4782 | if (tp->rx_opt.sack_ok) |
| 4898 | tcp_sack_reset(&tp->rx_opt); | 4783 | tcp_sack_reset(&tp->rx_opt); |
| 4899 | sk_mem_reclaim(sk); | 4784 | sk_mem_reclaim(sk); |
| 4900 | res = true; | 4785 | res = 1; |
| 4901 | } | 4786 | } |
| 4902 | return res; | 4787 | return res; |
| 4903 | } | 4788 | } |
| @@ -4919,7 +4804,7 @@ static int tcp_prune_queue(struct sock *sk) | |||
| 4919 | 4804 | ||
| 4920 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) | 4805 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
| 4921 | tcp_clamp_window(sk); | 4806 | tcp_clamp_window(sk); |
| 4922 | else if (sk_under_memory_pressure(sk)) | 4807 | else if (tcp_memory_pressure) |
| 4923 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); | 4808 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); |
| 4924 | 4809 | ||
| 4925 | tcp_collapse_ofo_queue(sk); | 4810 | tcp_collapse_ofo_queue(sk); |
| @@ -4974,29 +4859,29 @@ void tcp_cwnd_application_limited(struct sock *sk) | |||
| 4974 | tp->snd_cwnd_stamp = tcp_time_stamp; | 4859 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 4975 | } | 4860 | } |
| 4976 | 4861 | ||
| 4977 | static bool tcp_should_expand_sndbuf(const struct sock *sk) | 4862 | static int tcp_should_expand_sndbuf(struct sock *sk) |
| 4978 | { | 4863 | { |
| 4979 | const struct tcp_sock *tp = tcp_sk(sk); | 4864 | struct tcp_sock *tp = tcp_sk(sk); |
| 4980 | 4865 | ||
| 4981 | /* If the user specified a specific send buffer setting, do | 4866 | /* If the user specified a specific send buffer setting, do |
| 4982 | * not modify it. | 4867 | * not modify it. |
| 4983 | */ | 4868 | */ |
| 4984 | if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) | 4869 | if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) |
| 4985 | return false; | 4870 | return 0; |
| 4986 | 4871 | ||
| 4987 | /* If we are under global TCP memory pressure, do not expand. */ | 4872 | /* If we are under global TCP memory pressure, do not expand. */ |
| 4988 | if (sk_under_memory_pressure(sk)) | 4873 | if (tcp_memory_pressure) |
| 4989 | return false; | 4874 | return 0; |
| 4990 | 4875 | ||
| 4991 | /* If we are under soft global TCP memory pressure, do not expand. */ | 4876 | /* If we are under soft global TCP memory pressure, do not expand. */ |
| 4992 | if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) | 4877 | if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) |
| 4993 | return false; | 4878 | return 0; |
| 4994 | 4879 | ||
| 4995 | /* If we filled the congestion window, do not expand. */ | 4880 | /* If we filled the congestion window, do not expand. */ |
| 4996 | if (tp->packets_out >= tp->snd_cwnd) | 4881 | if (tp->packets_out >= tp->snd_cwnd) |
| 4997 | return false; | 4882 | return 0; |
| 4998 | 4883 | ||
| 4999 | return true; | 4884 | return 1; |
| 5000 | } | 4885 | } |
| 5001 | 4886 | ||
| 5002 | /* When incoming ACK allowed to free some skb from write_queue, | 4887 | /* When incoming ACK allowed to free some skb from write_queue, |
| @@ -5010,10 +4895,8 @@ static void tcp_new_space(struct sock *sk) | |||
| 5010 | struct tcp_sock *tp = tcp_sk(sk); | 4895 | struct tcp_sock *tp = tcp_sk(sk); |
| 5011 | 4896 | ||
| 5012 | if (tcp_should_expand_sndbuf(sk)) { | 4897 | if (tcp_should_expand_sndbuf(sk)) { |
| 5013 | int sndmem = SKB_TRUESIZE(max_t(u32, | 4898 | int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + |
| 5014 | tp->rx_opt.mss_clamp, | 4899 | MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); |
| 5015 | tp->mss_cache) + | ||
| 5016 | MAX_TCP_HEADER); | ||
| 5017 | int demanded = max_t(unsigned int, tp->snd_cwnd, | 4900 | int demanded = max_t(unsigned int, tp->snd_cwnd, |
| 5018 | tp->reordering + 1); | 4901 | tp->reordering + 1); |
| 5019 | sndmem *= 2 * demanded; | 4902 | sndmem *= 2 * demanded; |
| @@ -5085,7 +4968,7 @@ static inline void tcp_ack_snd_check(struct sock *sk) | |||
| 5085 | * either form (or just set the sysctl tcp_stdurg). | 4968 | * either form (or just set the sysctl tcp_stdurg). |
| 5086 | */ | 4969 | */ |
| 5087 | 4970 | ||
| 5088 | static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) | 4971 | static void tcp_check_urg(struct sock *sk, struct tcphdr *th) |
| 5089 | { | 4972 | { |
| 5090 | struct tcp_sock *tp = tcp_sk(sk); | 4973 | struct tcp_sock *tp = tcp_sk(sk); |
| 5091 | u32 ptr = ntohs(th->urg_ptr); | 4974 | u32 ptr = ntohs(th->urg_ptr); |
| @@ -5151,7 +5034,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) | |||
| 5151 | } | 5034 | } |
| 5152 | 5035 | ||
| 5153 | /* This is the 'fast' part of urgent handling. */ | 5036 | /* This is the 'fast' part of urgent handling. */ |
| 5154 | static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) | 5037 | static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) |
| 5155 | { | 5038 | { |
| 5156 | struct tcp_sock *tp = tcp_sk(sk); | 5039 | struct tcp_sock *tp = tcp_sk(sk); |
| 5157 | 5040 | ||
| @@ -5214,7 +5097,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, | |||
| 5214 | return result; | 5097 | return result; |
| 5215 | } | 5098 | } |
| 5216 | 5099 | ||
| 5217 | static inline bool tcp_checksum_complete_user(struct sock *sk, | 5100 | static inline int tcp_checksum_complete_user(struct sock *sk, |
| 5218 | struct sk_buff *skb) | 5101 | struct sk_buff *skb) |
| 5219 | { | 5102 | { |
| 5220 | return !skb_csum_unnecessary(skb) && | 5103 | return !skb_csum_unnecessary(skb) && |
| @@ -5222,19 +5105,19 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, | |||
| 5222 | } | 5105 | } |
| 5223 | 5106 | ||
| 5224 | #ifdef CONFIG_NET_DMA | 5107 | #ifdef CONFIG_NET_DMA |
| 5225 | static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, | 5108 | static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, |
| 5226 | int hlen) | 5109 | int hlen) |
| 5227 | { | 5110 | { |
| 5228 | struct tcp_sock *tp = tcp_sk(sk); | 5111 | struct tcp_sock *tp = tcp_sk(sk); |
| 5229 | int chunk = skb->len - hlen; | 5112 | int chunk = skb->len - hlen; |
| 5230 | int dma_cookie; | 5113 | int dma_cookie; |
| 5231 | bool copied_early = false; | 5114 | int copied_early = 0; |
| 5232 | 5115 | ||
| 5233 | if (tp->ucopy.wakeup) | 5116 | if (tp->ucopy.wakeup) |
| 5234 | return false; | 5117 | return 0; |
| 5235 | 5118 | ||
| 5236 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | 5119 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
| 5237 | tp->ucopy.dma_chan = net_dma_find_channel(); | 5120 | tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); |
| 5238 | 5121 | ||
| 5239 | if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { | 5122 | if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { |
| 5240 | 5123 | ||
| @@ -5247,7 +5130,7 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, | |||
| 5247 | goto out; | 5130 | goto out; |
| 5248 | 5131 | ||
| 5249 | tp->ucopy.dma_cookie = dma_cookie; | 5132 | tp->ucopy.dma_cookie = dma_cookie; |
| 5250 | copied_early = true; | 5133 | copied_early = 1; |
| 5251 | 5134 | ||
| 5252 | tp->ucopy.len -= chunk; | 5135 | tp->ucopy.len -= chunk; |
| 5253 | tp->copied_seq += chunk; | 5136 | tp->copied_seq += chunk; |
| @@ -5271,10 +5154,10 @@ out: | |||
| 5271 | /* Does PAWS and seqno based validation of an incoming segment, flags will | 5154 | /* Does PAWS and seqno based validation of an incoming segment, flags will |
| 5272 | * play significant role here. | 5155 | * play significant role here. |
| 5273 | */ | 5156 | */ |
| 5274 | static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | 5157 | static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, |
| 5275 | const struct tcphdr *th, int syn_inerr) | 5158 | struct tcphdr *th, int syn_inerr) |
| 5276 | { | 5159 | { |
| 5277 | const u8 *hash_location; | 5160 | u8 *hash_location; |
| 5278 | struct tcp_sock *tp = tcp_sk(sk); | 5161 | struct tcp_sock *tp = tcp_sk(sk); |
| 5279 | 5162 | ||
| 5280 | /* RFC1323: H1. Apply PAWS check first. */ | 5163 | /* RFC1323: H1. Apply PAWS check first. */ |
| @@ -5297,48 +5180,38 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |||
| 5297 | * an acknowledgment should be sent in reply (unless the RST | 5180 | * an acknowledgment should be sent in reply (unless the RST |
| 5298 | * bit is set, if so drop the segment and return)". | 5181 | * bit is set, if so drop the segment and return)". |
| 5299 | */ | 5182 | */ |
| 5300 | if (!th->rst) { | 5183 | if (!th->rst) |
| 5301 | if (th->syn) | ||
| 5302 | goto syn_challenge; | ||
| 5303 | tcp_send_dupack(sk, skb); | 5184 | tcp_send_dupack(sk, skb); |
| 5304 | } | ||
| 5305 | goto discard; | 5185 | goto discard; |
| 5306 | } | 5186 | } |
| 5307 | 5187 | ||
| 5308 | /* Step 2: check RST bit */ | 5188 | /* Step 2: check RST bit */ |
| 5309 | if (th->rst) { | 5189 | if (th->rst) { |
| 5310 | /* RFC 5961 3.2 : | 5190 | tcp_reset(sk); |
| 5311 | * If sequence number exactly matches RCV.NXT, then | ||
| 5312 | * RESET the connection | ||
| 5313 | * else | ||
| 5314 | * Send a challenge ACK | ||
| 5315 | */ | ||
| 5316 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) | ||
| 5317 | tcp_reset(sk); | ||
| 5318 | else | ||
| 5319 | tcp_send_challenge_ack(sk); | ||
| 5320 | goto discard; | 5191 | goto discard; |
| 5321 | } | 5192 | } |
| 5322 | 5193 | ||
| 5194 | /* ts_recent update must be made after we are sure that the packet | ||
| 5195 | * is in window. | ||
| 5196 | */ | ||
| 5197 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
| 5198 | |||
| 5323 | /* step 3: check security and precedence [ignored] */ | 5199 | /* step 3: check security and precedence [ignored] */ |
| 5324 | 5200 | ||
| 5325 | /* step 4: Check for a SYN | 5201 | /* step 4: Check for a SYN in window. */ |
| 5326 | * RFC 5691 4.2 : Send a challenge ack | 5202 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
| 5327 | */ | ||
| 5328 | if (th->syn) { | ||
| 5329 | syn_challenge: | ||
| 5330 | if (syn_inerr) | 5203 | if (syn_inerr) |
| 5331 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); | 5204 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); |
| 5332 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); | 5205 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); |
| 5333 | tcp_send_challenge_ack(sk); | 5206 | tcp_reset(sk); |
| 5334 | goto discard; | 5207 | return -1; |
| 5335 | } | 5208 | } |
| 5336 | 5209 | ||
| 5337 | return true; | 5210 | return 1; |
| 5338 | 5211 | ||
| 5339 | discard: | 5212 | discard: |
| 5340 | __kfree_skb(skb); | 5213 | __kfree_skb(skb); |
| 5341 | return false; | 5214 | return 0; |
| 5342 | } | 5215 | } |
| 5343 | 5216 | ||
| 5344 | /* | 5217 | /* |
| @@ -5365,12 +5238,11 @@ discard: | |||
| 5365 | * tcp_data_queue when everything is OK. | 5238 | * tcp_data_queue when everything is OK. |
| 5366 | */ | 5239 | */ |
| 5367 | int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | 5240 | int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, |
| 5368 | const struct tcphdr *th, unsigned int len) | 5241 | struct tcphdr *th, unsigned len) |
| 5369 | { | 5242 | { |
| 5370 | struct tcp_sock *tp = tcp_sk(sk); | 5243 | struct tcp_sock *tp = tcp_sk(sk); |
| 5244 | int res; | ||
| 5371 | 5245 | ||
| 5372 | if (unlikely(sk->sk_rx_dst == NULL)) | ||
| 5373 | inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); | ||
| 5374 | /* | 5246 | /* |
| 5375 | * Header prediction. | 5247 | * Header prediction. |
| 5376 | * The code loosely follows the one in the famous | 5248 | * The code loosely follows the one in the famous |
| @@ -5450,14 +5322,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 5450 | } else { | 5322 | } else { |
| 5451 | int eaten = 0; | 5323 | int eaten = 0; |
| 5452 | int copied_early = 0; | 5324 | int copied_early = 0; |
| 5453 | bool fragstolen = false; | ||
| 5454 | 5325 | ||
| 5455 | if (tp->copied_seq == tp->rcv_nxt && | 5326 | if (tp->copied_seq == tp->rcv_nxt && |
| 5456 | len - tcp_header_len <= tp->ucopy.len) { | 5327 | len - tcp_header_len <= tp->ucopy.len) { |
| 5457 | #ifdef CONFIG_NET_DMA | 5328 | #ifdef CONFIG_NET_DMA |
| 5458 | if (tp->ucopy.task == current && | 5329 | if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { |
| 5459 | sock_owned_by_user(sk) && | ||
| 5460 | tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { | ||
| 5461 | copied_early = 1; | 5330 | copied_early = 1; |
| 5462 | eaten = 1; | 5331 | eaten = 1; |
| 5463 | } | 5332 | } |
| @@ -5510,8 +5379,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 5510 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); | 5379 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); |
| 5511 | 5380 | ||
| 5512 | /* Bulk data transfer: receiver */ | 5381 | /* Bulk data transfer: receiver */ |
| 5513 | eaten = tcp_queue_rcv(sk, skb, tcp_header_len, | 5382 | __skb_pull(skb, tcp_header_len); |
| 5514 | &fragstolen); | 5383 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
| 5384 | skb_set_owner_r(skb, sk); | ||
| 5385 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
| 5515 | } | 5386 | } |
| 5516 | 5387 | ||
| 5517 | tcp_event_data_recv(sk, skb); | 5388 | tcp_event_data_recv(sk, skb); |
| @@ -5533,8 +5404,9 @@ no_ack: | |||
| 5533 | else | 5404 | else |
| 5534 | #endif | 5405 | #endif |
| 5535 | if (eaten) | 5406 | if (eaten) |
| 5536 | kfree_skb_partial(skb, fragstolen); | 5407 | __kfree_skb(skb); |
| 5537 | sk->sk_data_ready(sk, 0); | 5408 | else |
| 5409 | sk->sk_data_ready(sk, 0); | ||
| 5538 | return 0; | 5410 | return 0; |
| 5539 | } | 5411 | } |
| 5540 | } | 5412 | } |
| @@ -5543,25 +5415,18 @@ slow_path: | |||
| 5543 | if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) | 5415 | if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) |
| 5544 | goto csum_error; | 5416 | goto csum_error; |
| 5545 | 5417 | ||
| 5546 | if (!th->ack && !th->rst) | ||
| 5547 | goto discard; | ||
| 5548 | |||
| 5549 | /* | 5418 | /* |
| 5550 | * Standard slow path. | 5419 | * Standard slow path. |
| 5551 | */ | 5420 | */ |
| 5552 | 5421 | ||
| 5553 | if (!tcp_validate_incoming(sk, skb, th, 1)) | 5422 | res = tcp_validate_incoming(sk, skb, th, 1); |
| 5554 | return 0; | 5423 | if (res <= 0) |
| 5424 | return -res; | ||
| 5555 | 5425 | ||
| 5556 | step5: | 5426 | step5: |
| 5557 | if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) | 5427 | if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) |
| 5558 | goto discard; | 5428 | goto discard; |
| 5559 | 5429 | ||
| 5560 | /* ts_recent update must be made after we are sure that the packet | ||
| 5561 | * is in window. | ||
| 5562 | */ | ||
| 5563 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
| 5564 | |||
| 5565 | tcp_rcv_rtt_measure_ts(sk, skb); | 5430 | tcp_rcv_rtt_measure_ts(sk, skb); |
| 5566 | 5431 | ||
| 5567 | /* Process urgent data. */ | 5432 | /* Process urgent data. */ |
| @@ -5583,101 +5448,16 @@ discard: | |||
| 5583 | } | 5448 | } |
| 5584 | EXPORT_SYMBOL(tcp_rcv_established); | 5449 | EXPORT_SYMBOL(tcp_rcv_established); |
| 5585 | 5450 | ||
| 5586 | void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) | ||
| 5587 | { | ||
| 5588 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 5589 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 5590 | |||
| 5591 | tcp_set_state(sk, TCP_ESTABLISHED); | ||
| 5592 | |||
| 5593 | if (skb != NULL) { | ||
| 5594 | icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); | ||
| 5595 | security_inet_conn_established(sk, skb); | ||
| 5596 | } | ||
| 5597 | |||
| 5598 | /* Make sure socket is routed, for correct metrics. */ | ||
| 5599 | icsk->icsk_af_ops->rebuild_header(sk); | ||
| 5600 | |||
| 5601 | tcp_init_metrics(sk); | ||
| 5602 | |||
| 5603 | tcp_init_congestion_control(sk); | ||
| 5604 | |||
| 5605 | /* Prevent spurious tcp_cwnd_restart() on first data | ||
| 5606 | * packet. | ||
| 5607 | */ | ||
| 5608 | tp->lsndtime = tcp_time_stamp; | ||
| 5609 | |||
| 5610 | tcp_init_buffer_space(sk); | ||
| 5611 | |||
| 5612 | if (sock_flag(sk, SOCK_KEEPOPEN)) | ||
| 5613 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); | ||
| 5614 | |||
| 5615 | if (!tp->rx_opt.snd_wscale) | ||
| 5616 | __tcp_fast_path_on(tp, tp->snd_wnd); | ||
| 5617 | else | ||
| 5618 | tp->pred_flags = 0; | ||
| 5619 | |||
| 5620 | if (!sock_flag(sk, SOCK_DEAD)) { | ||
| 5621 | sk->sk_state_change(sk); | ||
| 5622 | sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); | ||
| 5623 | } | ||
| 5624 | } | ||
| 5625 | |||
| 5626 | static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, | ||
| 5627 | struct tcp_fastopen_cookie *cookie) | ||
| 5628 | { | ||
| 5629 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 5630 | struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; | ||
| 5631 | u16 mss = tp->rx_opt.mss_clamp; | ||
| 5632 | bool syn_drop; | ||
| 5633 | |||
| 5634 | if (mss == tp->rx_opt.user_mss) { | ||
| 5635 | struct tcp_options_received opt; | ||
| 5636 | const u8 *hash_location; | ||
| 5637 | |||
| 5638 | /* Get original SYNACK MSS value if user MSS sets mss_clamp */ | ||
| 5639 | tcp_clear_options(&opt); | ||
| 5640 | opt.user_mss = opt.mss_clamp = 0; | ||
| 5641 | tcp_parse_options(synack, &opt, &hash_location, 0, NULL); | ||
| 5642 | mss = opt.mss_clamp; | ||
| 5643 | } | ||
| 5644 | |||
| 5645 | if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ | ||
| 5646 | cookie->len = -1; | ||
| 5647 | |||
| 5648 | /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably | ||
| 5649 | * the remote receives only the retransmitted (regular) SYNs: either | ||
| 5650 | * the original SYN-data or the corresponding SYN-ACK is lost. | ||
| 5651 | */ | ||
| 5652 | syn_drop = (cookie->len <= 0 && data && | ||
| 5653 | inet_csk(sk)->icsk_retransmits); | ||
| 5654 | |||
| 5655 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); | ||
| 5656 | |||
| 5657 | if (data) { /* Retransmit unacked data in SYN */ | ||
| 5658 | tcp_for_write_queue_from(data, sk) { | ||
| 5659 | if (data == tcp_send_head(sk) || | ||
| 5660 | __tcp_retransmit_skb(sk, data)) | ||
| 5661 | break; | ||
| 5662 | } | ||
| 5663 | tcp_rearm_rto(sk); | ||
| 5664 | return true; | ||
| 5665 | } | ||
| 5666 | tp->syn_data_acked = tp->syn_data; | ||
| 5667 | return false; | ||
| 5668 | } | ||
| 5669 | |||
| 5670 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | 5451 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
| 5671 | const struct tcphdr *th, unsigned int len) | 5452 | struct tcphdr *th, unsigned len) |
| 5672 | { | 5453 | { |
| 5673 | const u8 *hash_location; | 5454 | u8 *hash_location; |
| 5674 | struct inet_connection_sock *icsk = inet_csk(sk); | 5455 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 5675 | struct tcp_sock *tp = tcp_sk(sk); | 5456 | struct tcp_sock *tp = tcp_sk(sk); |
| 5676 | struct tcp_cookie_values *cvp = tp->cookie_values; | 5457 | struct tcp_cookie_values *cvp = tp->cookie_values; |
| 5677 | struct tcp_fastopen_cookie foc = { .len = -1 }; | ||
| 5678 | int saved_clamp = tp->rx_opt.mss_clamp; | 5458 | int saved_clamp = tp->rx_opt.mss_clamp; |
| 5679 | 5459 | ||
| 5680 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); | 5460 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); |
| 5681 | 5461 | ||
| 5682 | if (th->ack) { | 5462 | if (th->ack) { |
| 5683 | /* rfc793: | 5463 | /* rfc793: |
| @@ -5687,9 +5467,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5687 | * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send | 5467 | * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send |
| 5688 | * a reset (unless the RST bit is set, if so drop | 5468 | * a reset (unless the RST bit is set, if so drop |
| 5689 | * the segment and return)" | 5469 | * the segment and return)" |
| 5470 | * | ||
| 5471 | * We do not send data with SYN, so that RFC-correct | ||
| 5472 | * test reduces to: | ||
| 5690 | */ | 5473 | */ |
| 5691 | if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || | 5474 | if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) |
| 5692 | after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) | ||
| 5693 | goto reset_and_undo; | 5475 | goto reset_and_undo; |
| 5694 | 5476 | ||
| 5695 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 5477 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| @@ -5731,7 +5513,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5731 | 5513 | ||
| 5732 | TCP_ECN_rcv_synack(tp, th); | 5514 | TCP_ECN_rcv_synack(tp, th); |
| 5733 | 5515 | ||
| 5734 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); | 5516 | tp->snd_wl1 = TCP_SKB_CB(skb)->seq; |
| 5735 | tcp_ack(sk, skb, FLAG_SLOWPATH); | 5517 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
| 5736 | 5518 | ||
| 5737 | /* Ok.. it's good. Set up sequence numbers and | 5519 | /* Ok.. it's good. Set up sequence numbers and |
| @@ -5744,6 +5526,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5744 | * never scaled. | 5526 | * never scaled. |
| 5745 | */ | 5527 | */ |
| 5746 | tp->snd_wnd = ntohs(th->window); | 5528 | tp->snd_wnd = ntohs(th->window); |
| 5529 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); | ||
| 5747 | 5530 | ||
| 5748 | if (!tp->rx_opt.wscale_ok) { | 5531 | if (!tp->rx_opt.wscale_ok) { |
| 5749 | tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; | 5532 | tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; |
| @@ -5797,12 +5580,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5797 | } | 5580 | } |
| 5798 | 5581 | ||
| 5799 | smp_mb(); | 5582 | smp_mb(); |
| 5583 | tcp_set_state(sk, TCP_ESTABLISHED); | ||
| 5800 | 5584 | ||
| 5801 | tcp_finish_connect(sk, skb); | 5585 | security_inet_conn_established(sk, skb); |
| 5802 | 5586 | ||
| 5803 | if ((tp->syn_fastopen || tp->syn_data) && | 5587 | /* Make sure socket is routed, for correct metrics. */ |
| 5804 | tcp_rcv_fastopen_synack(sk, skb, &foc)) | 5588 | icsk->icsk_af_ops->rebuild_header(sk); |
| 5805 | return -1; | 5589 | |
| 5590 | tcp_init_metrics(sk); | ||
| 5591 | |||
| 5592 | tcp_init_congestion_control(sk); | ||
| 5593 | |||
| 5594 | /* Prevent spurious tcp_cwnd_restart() on first data | ||
| 5595 | * packet. | ||
| 5596 | */ | ||
| 5597 | tp->lsndtime = tcp_time_stamp; | ||
| 5598 | |||
| 5599 | tcp_init_buffer_space(sk); | ||
| 5600 | |||
| 5601 | if (sock_flag(sk, SOCK_KEEPOPEN)) | ||
| 5602 | inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); | ||
| 5603 | |||
| 5604 | if (!tp->rx_opt.snd_wscale) | ||
| 5605 | __tcp_fast_path_on(tp, tp->snd_wnd); | ||
| 5606 | else | ||
| 5607 | tp->pred_flags = 0; | ||
| 5608 | |||
| 5609 | if (!sock_flag(sk, SOCK_DEAD)) { | ||
| 5610 | sk->sk_state_change(sk); | ||
| 5611 | sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); | ||
| 5612 | } | ||
| 5806 | 5613 | ||
| 5807 | if (sk->sk_write_pending || | 5614 | if (sk->sk_write_pending || |
| 5808 | icsk->icsk_accept_queue.rskq_defer_accept || | 5615 | icsk->icsk_accept_queue.rskq_defer_accept || |
| @@ -5816,6 +5623,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5816 | */ | 5623 | */ |
| 5817 | inet_csk_schedule_ack(sk); | 5624 | inet_csk_schedule_ack(sk); |
| 5818 | icsk->icsk_ack.lrcvtime = tcp_time_stamp; | 5625 | icsk->icsk_ack.lrcvtime = tcp_time_stamp; |
| 5626 | icsk->icsk_ack.ato = TCP_ATO_MIN; | ||
| 5627 | tcp_incr_quickack(sk); | ||
| 5819 | tcp_enter_quickack_mode(sk); | 5628 | tcp_enter_quickack_mode(sk); |
| 5820 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, | 5629 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, |
| 5821 | TCP_DELACK_MAX, TCP_RTO_MAX); | 5630 | TCP_DELACK_MAX, TCP_RTO_MAX); |
| @@ -5881,9 +5690,7 @@ discard: | |||
| 5881 | tcp_send_synack(sk); | 5690 | tcp_send_synack(sk); |
| 5882 | #if 0 | 5691 | #if 0 |
| 5883 | /* Note, we could accept data and URG from this segment. | 5692 | /* Note, we could accept data and URG from this segment. |
| 5884 | * There are no obstacles to make this (except that we must | 5693 | * There are no obstacles to make this. |
| 5885 | * either change tcp_recvmsg() to prevent it from returning data | ||
| 5886 | * before 3WHS completes per RFC793, or employ TCP Fast Open). | ||
| 5887 | * | 5694 | * |
| 5888 | * However, if we ignore data in ACKless segments sometimes, | 5695 | * However, if we ignore data in ACKless segments sometimes, |
| 5889 | * we have no reasons to accept it sometimes. | 5696 | * we have no reasons to accept it sometimes. |
| @@ -5919,12 +5726,12 @@ reset_and_undo: | |||
| 5919 | */ | 5726 | */ |
| 5920 | 5727 | ||
| 5921 | int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | 5728 | int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, |
| 5922 | const struct tcphdr *th, unsigned int len) | 5729 | struct tcphdr *th, unsigned len) |
| 5923 | { | 5730 | { |
| 5924 | struct tcp_sock *tp = tcp_sk(sk); | 5731 | struct tcp_sock *tp = tcp_sk(sk); |
| 5925 | struct inet_connection_sock *icsk = inet_csk(sk); | 5732 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 5926 | struct request_sock *req; | ||
| 5927 | int queued = 0; | 5733 | int queued = 0; |
| 5734 | int res; | ||
| 5928 | 5735 | ||
| 5929 | tp->rx_opt.saw_tstamp = 0; | 5736 | tp->rx_opt.saw_tstamp = 0; |
| 5930 | 5737 | ||
| @@ -5940,8 +5747,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5940 | goto discard; | 5747 | goto discard; |
| 5941 | 5748 | ||
| 5942 | if (th->syn) { | 5749 | if (th->syn) { |
| 5943 | if (th->fin) | ||
| 5944 | goto discard; | ||
| 5945 | if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) | 5750 | if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) |
| 5946 | return 1; | 5751 | return 1; |
| 5947 | 5752 | ||
| @@ -5979,47 +5784,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5979 | return 0; | 5784 | return 0; |
| 5980 | } | 5785 | } |
| 5981 | 5786 | ||
| 5982 | req = tp->fastopen_rsk; | 5787 | res = tcp_validate_incoming(sk, skb, th, 0); |
| 5983 | if (req != NULL) { | 5788 | if (res <= 0) |
| 5984 | WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && | 5789 | return -res; |
| 5985 | sk->sk_state != TCP_FIN_WAIT1); | ||
| 5986 | |||
| 5987 | if (tcp_check_req(sk, skb, req, NULL, true) == NULL) | ||
| 5988 | goto discard; | ||
| 5989 | } | ||
| 5990 | |||
| 5991 | if (!th->ack && !th->rst) | ||
| 5992 | goto discard; | ||
| 5993 | |||
| 5994 | if (!tcp_validate_incoming(sk, skb, th, 0)) | ||
| 5995 | return 0; | ||
| 5996 | 5790 | ||
| 5997 | /* step 5: check the ACK field */ | 5791 | /* step 5: check the ACK field */ |
| 5998 | if (true) { | 5792 | if (th->ack) { |
| 5999 | int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; | 5793 | int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; |
| 6000 | 5794 | ||
| 6001 | switch (sk->sk_state) { | 5795 | switch (sk->sk_state) { |
| 6002 | case TCP_SYN_RECV: | 5796 | case TCP_SYN_RECV: |
| 6003 | if (acceptable) { | 5797 | if (acceptable) { |
| 6004 | /* Once we leave TCP_SYN_RECV, we no longer | 5798 | tp->copied_seq = tp->rcv_nxt; |
| 6005 | * need req so release it. | ||
| 6006 | */ | ||
| 6007 | if (req) { | ||
| 6008 | tcp_synack_rtt_meas(sk, req); | ||
| 6009 | tp->total_retrans = req->num_retrans; | ||
| 6010 | |||
| 6011 | reqsk_fastopen_remove(sk, req, false); | ||
| 6012 | } else { | ||
| 6013 | /* Make sure socket is routed, for | ||
| 6014 | * correct metrics. | ||
| 6015 | */ | ||
| 6016 | icsk->icsk_af_ops->rebuild_header(sk); | ||
| 6017 | tcp_init_congestion_control(sk); | ||
| 6018 | |||
| 6019 | tcp_mtup_init(sk); | ||
| 6020 | tcp_init_buffer_space(sk); | ||
| 6021 | tp->copied_seq = tp->rcv_nxt; | ||
| 6022 | } | ||
| 6023 | smp_mb(); | 5799 | smp_mb(); |
| 6024 | tcp_set_state(sk, TCP_ESTABLISHED); | 5800 | tcp_set_state(sk, TCP_ESTABLISHED); |
| 6025 | sk->sk_state_change(sk); | 5801 | sk->sk_state_change(sk); |
| @@ -6041,27 +5817,23 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 6041 | if (tp->rx_opt.tstamp_ok) | 5817 | if (tp->rx_opt.tstamp_ok) |
| 6042 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 5818 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
| 6043 | 5819 | ||
| 6044 | if (req) { | 5820 | /* Make sure socket is routed, for |
| 6045 | /* Re-arm the timer because data may | 5821 | * correct metrics. |
| 6046 | * have been sent out. This is similar | 5822 | */ |
| 6047 | * to the regular data transmission case | 5823 | icsk->icsk_af_ops->rebuild_header(sk); |
| 6048 | * when new data has just been ack'ed. | 5824 | |
| 6049 | * | 5825 | tcp_init_metrics(sk); |
| 6050 | * (TFO) - we could try to be more | 5826 | |
| 6051 | * aggressive and retranmitting any data | 5827 | tcp_init_congestion_control(sk); |
| 6052 | * sooner based on when they were sent | ||
| 6053 | * out. | ||
| 6054 | */ | ||
| 6055 | tcp_rearm_rto(sk); | ||
| 6056 | } else | ||
| 6057 | tcp_init_metrics(sk); | ||
| 6058 | 5828 | ||
| 6059 | /* Prevent spurious tcp_cwnd_restart() on | 5829 | /* Prevent spurious tcp_cwnd_restart() on |
| 6060 | * first data packet. | 5830 | * first data packet. |
| 6061 | */ | 5831 | */ |
| 6062 | tp->lsndtime = tcp_time_stamp; | 5832 | tp->lsndtime = tcp_time_stamp; |
| 6063 | 5833 | ||
| 5834 | tcp_mtup_init(sk); | ||
| 6064 | tcp_initialize_rcv_mss(sk); | 5835 | tcp_initialize_rcv_mss(sk); |
| 5836 | tcp_init_buffer_space(sk); | ||
| 6065 | tcp_fast_path_on(tp); | 5837 | tcp_fast_path_on(tp); |
| 6066 | } else { | 5838 | } else { |
| 6067 | return 1; | 5839 | return 1; |
| @@ -6069,33 +5841,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 6069 | break; | 5841 | break; |
| 6070 | 5842 | ||
| 6071 | case TCP_FIN_WAIT1: | 5843 | case TCP_FIN_WAIT1: |
| 6072 | /* If we enter the TCP_FIN_WAIT1 state and we are a | ||
| 6073 | * Fast Open socket and this is the first acceptable | ||
| 6074 | * ACK we have received, this would have acknowledged | ||
| 6075 | * our SYNACK so stop the SYNACK timer. | ||
| 6076 | */ | ||
| 6077 | if (req != NULL) { | ||
| 6078 | /* Return RST if ack_seq is invalid. | ||
| 6079 | * Note that RFC793 only says to generate a | ||
| 6080 | * DUPACK for it but for TCP Fast Open it seems | ||
| 6081 | * better to treat this case like TCP_SYN_RECV | ||
| 6082 | * above. | ||
| 6083 | */ | ||
| 6084 | if (!acceptable) | ||
| 6085 | return 1; | ||
| 6086 | /* We no longer need the request sock. */ | ||
| 6087 | reqsk_fastopen_remove(sk, req, false); | ||
| 6088 | tcp_rearm_rto(sk); | ||
| 6089 | } | ||
| 6090 | if (tp->snd_una == tp->write_seq) { | 5844 | if (tp->snd_una == tp->write_seq) { |
| 6091 | struct dst_entry *dst; | ||
| 6092 | |||
| 6093 | tcp_set_state(sk, TCP_FIN_WAIT2); | 5845 | tcp_set_state(sk, TCP_FIN_WAIT2); |
| 6094 | sk->sk_shutdown |= SEND_SHUTDOWN; | 5846 | sk->sk_shutdown |= SEND_SHUTDOWN; |
| 6095 | 5847 | dst_confirm(__sk_dst_get(sk)); | |
| 6096 | dst = __sk_dst_get(sk); | ||
| 6097 | if (dst) | ||
| 6098 | dst_confirm(dst); | ||
| 6099 | 5848 | ||
| 6100 | if (!sock_flag(sk, SOCK_DEAD)) | 5849 | if (!sock_flag(sk, SOCK_DEAD)) |
| 6101 | /* Wake up lingering close() */ | 5850 | /* Wake up lingering close() */ |
| @@ -6145,12 +5894,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 6145 | } | 5894 | } |
| 6146 | break; | 5895 | break; |
| 6147 | } | 5896 | } |
| 6148 | } | 5897 | } else |
| 6149 | 5898 | goto discard; | |
| 6150 | /* ts_recent update must be made after we are sure that the packet | ||
| 6151 | * is in window. | ||
| 6152 | */ | ||
| 6153 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
| 6154 | 5899 | ||
| 6155 | /* step 6: check the URG bit */ | 5900 | /* step 6: check the URG bit */ |
| 6156 | tcp_urg(sk, skb, th); | 5901 | tcp_urg(sk, skb, th); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 54139fa514e..6cdf6a28f6b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
| @@ -50,7 +50,6 @@ | |||
| 50 | * a single port at the same time. | 50 | * a single port at the same time. |
| 51 | */ | 51 | */ |
| 52 | 52 | ||
| 53 | #define pr_fmt(fmt) "TCP: " fmt | ||
| 54 | 53 | ||
| 55 | #include <linux/bottom_half.h> | 54 | #include <linux/bottom_half.h> |
| 56 | #include <linux/types.h> | 55 | #include <linux/types.h> |
| @@ -74,7 +73,6 @@ | |||
| 74 | #include <net/xfrm.h> | 73 | #include <net/xfrm.h> |
| 75 | #include <net/netdma.h> | 74 | #include <net/netdma.h> |
| 76 | #include <net/secure_seq.h> | 75 | #include <net/secure_seq.h> |
| 77 | #include <net/tcp_memcontrol.h> | ||
| 78 | 76 | ||
| 79 | #include <linux/inet.h> | 77 | #include <linux/inet.h> |
| 80 | #include <linux/ipv6.h> | 78 | #include <linux/ipv6.h> |
| @@ -91,14 +89,22 @@ EXPORT_SYMBOL(sysctl_tcp_low_latency); | |||
| 91 | 89 | ||
| 92 | 90 | ||
| 93 | #ifdef CONFIG_TCP_MD5SIG | 91 | #ifdef CONFIG_TCP_MD5SIG |
| 94 | static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, | 92 | static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, |
| 95 | __be32 daddr, __be32 saddr, const struct tcphdr *th); | 93 | __be32 addr); |
| 94 | static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, | ||
| 95 | __be32 daddr, __be32 saddr, struct tcphdr *th); | ||
| 96 | #else | ||
| 97 | static inline | ||
| 98 | struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) | ||
| 99 | { | ||
| 100 | return NULL; | ||
| 101 | } | ||
| 96 | #endif | 102 | #endif |
| 97 | 103 | ||
| 98 | struct inet_hashinfo tcp_hashinfo; | 104 | struct inet_hashinfo tcp_hashinfo; |
| 99 | EXPORT_SYMBOL(tcp_hashinfo); | 105 | EXPORT_SYMBOL(tcp_hashinfo); |
| 100 | 106 | ||
| 101 | static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) | 107 | static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) |
| 102 | { | 108 | { |
| 103 | return secure_tcp_sequence_number(ip_hdr(skb)->daddr, | 109 | return secure_tcp_sequence_number(ip_hdr(skb)->daddr, |
| 104 | ip_hdr(skb)->saddr, | 110 | ip_hdr(skb)->saddr, |
| @@ -196,13 +202,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
| 196 | /* Reset inherited state */ | 202 | /* Reset inherited state */ |
| 197 | tp->rx_opt.ts_recent = 0; | 203 | tp->rx_opt.ts_recent = 0; |
| 198 | tp->rx_opt.ts_recent_stamp = 0; | 204 | tp->rx_opt.ts_recent_stamp = 0; |
| 199 | if (likely(!tp->repair)) | 205 | tp->write_seq = 0; |
| 200 | tp->write_seq = 0; | ||
| 201 | } | 206 | } |
| 202 | 207 | ||
| 203 | if (tcp_death_row.sysctl_tw_recycle && | 208 | if (tcp_death_row.sysctl_tw_recycle && |
| 204 | !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) | 209 | !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { |
| 205 | tcp_fetch_timewait_stamp(sk, &rt->dst); | 210 | struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); |
| 211 | /* | ||
| 212 | * VJ's idea. We save last timestamp seen from | ||
| 213 | * the destination in peer table, when entering state | ||
| 214 | * TIME-WAIT * and initialize rx_opt.ts_recent from it, | ||
| 215 | * when trying new connection. | ||
| 216 | */ | ||
| 217 | if (peer) { | ||
| 218 | inet_peer_refcheck(peer); | ||
| 219 | if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { | ||
| 220 | tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; | ||
| 221 | tp->rx_opt.ts_recent = peer->tcp_ts; | ||
| 222 | } | ||
| 223 | } | ||
| 224 | } | ||
| 206 | 225 | ||
| 207 | inet->inet_dport = usin->sin_port; | 226 | inet->inet_dport = usin->sin_port; |
| 208 | inet->inet_daddr = daddr; | 227 | inet->inet_daddr = daddr; |
| @@ -234,7 +253,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
| 234 | sk->sk_gso_type = SKB_GSO_TCPV4; | 253 | sk->sk_gso_type = SKB_GSO_TCPV4; |
| 235 | sk_setup_caps(sk, &rt->dst); | 254 | sk_setup_caps(sk, &rt->dst); |
| 236 | 255 | ||
| 237 | if (!tp->write_seq && likely(!tp->repair)) | 256 | if (!tp->write_seq) |
| 238 | tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, | 257 | tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, |
| 239 | inet->inet_daddr, | 258 | inet->inet_daddr, |
| 240 | inet->inet_sport, | 259 | inet->inet_sport, |
| @@ -243,7 +262,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |||
| 243 | inet->inet_id = tp->write_seq ^ jiffies; | 262 | inet->inet_id = tp->write_seq ^ jiffies; |
| 244 | 263 | ||
| 245 | err = tcp_connect(sk); | 264 | err = tcp_connect(sk); |
| 246 | |||
| 247 | rt = NULL; | 265 | rt = NULL; |
| 248 | if (err) | 266 | if (err) |
| 249 | goto failure; | 267 | goto failure; |
| @@ -264,15 +282,12 @@ failure: | |||
| 264 | EXPORT_SYMBOL(tcp_v4_connect); | 282 | EXPORT_SYMBOL(tcp_v4_connect); |
| 265 | 283 | ||
| 266 | /* | 284 | /* |
| 267 | * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. | 285 | * This routine does path mtu discovery as defined in RFC1191. |
| 268 | * It can be called through tcp_release_cb() if socket was owned by user | ||
| 269 | * at the time tcp_v4_err() was called to handle ICMP message. | ||
| 270 | */ | 286 | */ |
| 271 | static void tcp_v4_mtu_reduced(struct sock *sk) | 287 | static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) |
| 272 | { | 288 | { |
| 273 | struct dst_entry *dst; | 289 | struct dst_entry *dst; |
| 274 | struct inet_sock *inet = inet_sk(sk); | 290 | struct inet_sock *inet = inet_sk(sk); |
| 275 | u32 mtu = tcp_sk(sk)->mtu_info; | ||
| 276 | 291 | ||
| 277 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs | 292 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs |
| 278 | * send out by Linux are always <576bytes so they should go through | 293 | * send out by Linux are always <576bytes so they should go through |
| @@ -281,10 +296,17 @@ static void tcp_v4_mtu_reduced(struct sock *sk) | |||
| 281 | if (sk->sk_state == TCP_LISTEN) | 296 | if (sk->sk_state == TCP_LISTEN) |
| 282 | return; | 297 | return; |
| 283 | 298 | ||
| 284 | dst = inet_csk_update_pmtu(sk, mtu); | 299 | /* We don't check in the destentry if pmtu discovery is forbidden |
| 285 | if (!dst) | 300 | * on this route. We just assume that no packet_to_big packets |
| 301 | * are send back when pmtu discovery is not active. | ||
| 302 | * There is a small race when the user changes this flag in the | ||
| 303 | * route, but I think that's acceptable. | ||
| 304 | */ | ||
| 305 | if ((dst = __sk_dst_check(sk, 0)) == NULL) | ||
| 286 | return; | 306 | return; |
| 287 | 307 | ||
| 308 | dst->ops->update_pmtu(dst, mtu); | ||
| 309 | |||
| 288 | /* Something is about to be wrong... Remember soft error | 310 | /* Something is about to be wrong... Remember soft error |
| 289 | * for the case, if this connection will not able to recover. | 311 | * for the case, if this connection will not able to recover. |
| 290 | */ | 312 | */ |
| @@ -306,14 +328,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk) | |||
| 306 | } /* else let the usual retransmit timer handle it */ | 328 | } /* else let the usual retransmit timer handle it */ |
| 307 | } | 329 | } |
| 308 | 330 | ||
| 309 | static void do_redirect(struct sk_buff *skb, struct sock *sk) | ||
| 310 | { | ||
| 311 | struct dst_entry *dst = __sk_dst_check(sk, 0); | ||
| 312 | |||
| 313 | if (dst) | ||
| 314 | dst->ops->redirect(dst, sk, skb); | ||
| 315 | } | ||
| 316 | |||
| 317 | /* | 331 | /* |
| 318 | * This routine is called by the ICMP module when it gets some | 332 | * This routine is called by the ICMP module when it gets some |
| 319 | * sort of error condition. If err < 0 then the socket should | 333 | * sort of error condition. If err < 0 then the socket should |
| @@ -341,7 +355,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
| 341 | const int code = icmp_hdr(icmp_skb)->code; | 355 | const int code = icmp_hdr(icmp_skb)->code; |
| 342 | struct sock *sk; | 356 | struct sock *sk; |
| 343 | struct sk_buff *skb; | 357 | struct sk_buff *skb; |
| 344 | struct request_sock *req; | ||
| 345 | __u32 seq; | 358 | __u32 seq; |
| 346 | __u32 remaining; | 359 | __u32 remaining; |
| 347 | int err; | 360 | int err; |
| @@ -366,12 +379,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
| 366 | bh_lock_sock(sk); | 379 | bh_lock_sock(sk); |
| 367 | /* If too many ICMPs get dropped on busy | 380 | /* If too many ICMPs get dropped on busy |
| 368 | * servers this needs to be solved differently. | 381 | * servers this needs to be solved differently. |
| 369 | * We do take care of PMTU discovery (RFC1191) special case : | ||
| 370 | * we can receive locally generated ICMP messages while socket is held. | ||
| 371 | */ | 382 | */ |
| 372 | if (sock_owned_by_user(sk) && | 383 | if (sock_owned_by_user(sk)) |
| 373 | type != ICMP_DEST_UNREACH && | ||
| 374 | code != ICMP_FRAG_NEEDED) | ||
| 375 | NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); | 384 | NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); |
| 376 | 385 | ||
| 377 | if (sk->sk_state == TCP_CLOSE) | 386 | if (sk->sk_state == TCP_CLOSE) |
| @@ -384,20 +393,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
| 384 | 393 | ||
| 385 | icsk = inet_csk(sk); | 394 | icsk = inet_csk(sk); |
| 386 | tp = tcp_sk(sk); | 395 | tp = tcp_sk(sk); |
| 387 | req = tp->fastopen_rsk; | ||
| 388 | seq = ntohl(th->seq); | 396 | seq = ntohl(th->seq); |
| 389 | if (sk->sk_state != TCP_LISTEN && | 397 | if (sk->sk_state != TCP_LISTEN && |
| 390 | !between(seq, tp->snd_una, tp->snd_nxt) && | 398 | !between(seq, tp->snd_una, tp->snd_nxt)) { |
| 391 | (req == NULL || seq != tcp_rsk(req)->snt_isn)) { | ||
| 392 | /* For a Fast Open socket, allow seq to be snt_isn. */ | ||
| 393 | NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); | 399 | NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); |
| 394 | goto out; | 400 | goto out; |
| 395 | } | 401 | } |
| 396 | 402 | ||
| 397 | switch (type) { | 403 | switch (type) { |
| 398 | case ICMP_REDIRECT: | ||
| 399 | do_redirect(icmp_skb, sk); | ||
| 400 | goto out; | ||
| 401 | case ICMP_SOURCE_QUENCH: | 404 | case ICMP_SOURCE_QUENCH: |
| 402 | /* Just silently ignore these. */ | 405 | /* Just silently ignore these. */ |
| 403 | goto out; | 406 | goto out; |
| @@ -409,13 +412,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
| 409 | goto out; | 412 | goto out; |
| 410 | 413 | ||
| 411 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ | 414 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ |
| 412 | tp->mtu_info = info; | 415 | if (!sock_owned_by_user(sk)) |
| 413 | if (!sock_owned_by_user(sk)) { | 416 | do_pmtu_discovery(sk, iph, info); |
| 414 | tcp_v4_mtu_reduced(sk); | ||
| 415 | } else { | ||
| 416 | if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) | ||
| 417 | sock_hold(sk); | ||
| 418 | } | ||
| 419 | goto out; | 417 | goto out; |
| 420 | } | 418 | } |
| 421 | 419 | ||
| @@ -428,8 +426,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
| 428 | !icsk->icsk_backoff) | 426 | !icsk->icsk_backoff) |
| 429 | break; | 427 | break; |
| 430 | 428 | ||
| 431 | /* XXX (TFO) - revisit the following logic for TFO */ | ||
| 432 | |||
| 433 | if (sock_owned_by_user(sk)) | 429 | if (sock_owned_by_user(sk)) |
| 434 | break; | 430 | break; |
| 435 | 431 | ||
| @@ -461,14 +457,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
| 461 | goto out; | 457 | goto out; |
| 462 | } | 458 | } |
| 463 | 459 | ||
| 464 | /* XXX (TFO) - if it's a TFO socket and has been accepted, rather | ||
| 465 | * than following the TCP_SYN_RECV case and closing the socket, | ||
| 466 | * we ignore the ICMP error and keep trying like a fully established | ||
| 467 | * socket. Is this the right thing to do? | ||
| 468 | */ | ||
| 469 | if (req && req->sk == NULL) | ||
| 470 | goto out; | ||
| 471 | |||
| 472 | switch (sk->sk_state) { | 460 | switch (sk->sk_state) { |
| 473 | struct request_sock *req, **prev; | 461 | struct request_sock *req, **prev; |
| 474 | case TCP_LISTEN: | 462 | case TCP_LISTEN: |
| @@ -501,8 +489,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
| 501 | 489 | ||
| 502 | case TCP_SYN_SENT: | 490 | case TCP_SYN_SENT: |
| 503 | case TCP_SYN_RECV: /* Cannot happen. | 491 | case TCP_SYN_RECV: /* Cannot happen. |
| 504 | It can f.e. if SYNs crossed, | 492 | It can f.e. if SYNs crossed. |
| 505 | or Fast Open. | ||
| 506 | */ | 493 | */ |
| 507 | if (!sock_owned_by_user(sk)) { | 494 | if (!sock_owned_by_user(sk)) { |
| 508 | sk->sk_err = err; | 495 | sk->sk_err = err; |
| @@ -565,7 +552,7 @@ static void __tcp_v4_send_check(struct sk_buff *skb, | |||
| 565 | /* This routine computes an IPv4 TCP checksum. */ | 552 | /* This routine computes an IPv4 TCP checksum. */ |
| 566 | void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) | 553 | void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) |
| 567 | { | 554 | { |
| 568 | const struct inet_sock *inet = inet_sk(sk); | 555 | struct inet_sock *inet = inet_sk(sk); |
| 569 | 556 | ||
| 570 | __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); | 557 | __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); |
| 571 | } | 558 | } |
| @@ -603,7 +590,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb) | |||
| 603 | 590 | ||
| 604 | static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | 591 | static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) |
| 605 | { | 592 | { |
| 606 | const struct tcphdr *th = tcp_hdr(skb); | 593 | struct tcphdr *th = tcp_hdr(skb); |
| 607 | struct { | 594 | struct { |
| 608 | struct tcphdr th; | 595 | struct tcphdr th; |
| 609 | #ifdef CONFIG_TCP_MD5SIG | 596 | #ifdef CONFIG_TCP_MD5SIG |
| @@ -613,10 +600,6 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
| 613 | struct ip_reply_arg arg; | 600 | struct ip_reply_arg arg; |
| 614 | #ifdef CONFIG_TCP_MD5SIG | 601 | #ifdef CONFIG_TCP_MD5SIG |
| 615 | struct tcp_md5sig_key *key; | 602 | struct tcp_md5sig_key *key; |
| 616 | const __u8 *hash_location = NULL; | ||
| 617 | unsigned char newhash[16]; | ||
| 618 | int genhash; | ||
| 619 | struct sock *sk1 = NULL; | ||
| 620 | #endif | 603 | #endif |
| 621 | struct net *net; | 604 | struct net *net; |
| 622 | 605 | ||
| @@ -647,36 +630,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
| 647 | arg.iov[0].iov_len = sizeof(rep.th); | 630 | arg.iov[0].iov_len = sizeof(rep.th); |
| 648 | 631 | ||
| 649 | #ifdef CONFIG_TCP_MD5SIG | 632 | #ifdef CONFIG_TCP_MD5SIG |
| 650 | hash_location = tcp_parse_md5sig_option(th); | 633 | key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL; |
| 651 | if (!sk && hash_location) { | ||
| 652 | /* | ||
| 653 | * active side is lost. Try to find listening socket through | ||
| 654 | * source port, and then find md5 key through listening socket. | ||
| 655 | * we are not loose security here: | ||
| 656 | * Incoming packet is checked with md5 hash with finding key, | ||
| 657 | * no RST generated if md5 hash doesn't match. | ||
| 658 | */ | ||
| 659 | sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), | ||
| 660 | &tcp_hashinfo, ip_hdr(skb)->daddr, | ||
| 661 | ntohs(th->source), inet_iif(skb)); | ||
| 662 | /* don't send rst if it can't find key */ | ||
| 663 | if (!sk1) | ||
| 664 | return; | ||
| 665 | rcu_read_lock(); | ||
| 666 | key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) | ||
| 667 | &ip_hdr(skb)->saddr, AF_INET); | ||
| 668 | if (!key) | ||
| 669 | goto release_sk1; | ||
| 670 | |||
| 671 | genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb); | ||
| 672 | if (genhash || memcmp(hash_location, newhash, 16) != 0) | ||
| 673 | goto release_sk1; | ||
| 674 | } else { | ||
| 675 | key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *) | ||
| 676 | &ip_hdr(skb)->saddr, | ||
| 677 | AF_INET) : NULL; | ||
| 678 | } | ||
| 679 | |||
| 680 | if (key) { | 634 | if (key) { |
| 681 | rep.opt[0] = htonl((TCPOPT_NOP << 24) | | 635 | rep.opt[0] = htonl((TCPOPT_NOP << 24) | |
| 682 | (TCPOPT_NOP << 16) | | 636 | (TCPOPT_NOP << 16) | |
| @@ -696,28 +650,13 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
| 696 | arg.iov[0].iov_len, IPPROTO_TCP, 0); | 650 | arg.iov[0].iov_len, IPPROTO_TCP, 0); |
| 697 | arg.csumoffset = offsetof(struct tcphdr, check) / 2; | 651 | arg.csumoffset = offsetof(struct tcphdr, check) / 2; |
| 698 | arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; | 652 | arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; |
| 699 | /* When socket is gone, all binding information is lost. | ||
| 700 | * routing might fail in this case. No choice here, if we choose to force | ||
| 701 | * input interface, we will misroute in case of asymmetric route. | ||
| 702 | */ | ||
| 703 | if (sk) | ||
| 704 | arg.bound_dev_if = sk->sk_bound_dev_if; | ||
| 705 | 653 | ||
| 706 | net = dev_net(skb_dst(skb)->dev); | 654 | net = dev_net(skb_dst(skb)->dev); |
| 707 | arg.tos = ip_hdr(skb)->tos; | 655 | ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, |
| 708 | ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, | 656 | &arg, arg.iov[0].iov_len); |
| 709 | ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); | ||
| 710 | 657 | ||
| 711 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 658 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
| 712 | TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); | 659 | TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); |
| 713 | |||
| 714 | #ifdef CONFIG_TCP_MD5SIG | ||
| 715 | release_sk1: | ||
| 716 | if (sk1) { | ||
| 717 | rcu_read_unlock(); | ||
| 718 | sock_put(sk1); | ||
| 719 | } | ||
| 720 | #endif | ||
| 721 | } | 660 | } |
| 722 | 661 | ||
| 723 | /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states | 662 | /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states |
| @@ -727,9 +666,9 @@ release_sk1: | |||
| 727 | static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | 666 | static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, |
| 728 | u32 win, u32 ts, int oif, | 667 | u32 win, u32 ts, int oif, |
| 729 | struct tcp_md5sig_key *key, | 668 | struct tcp_md5sig_key *key, |
| 730 | int reply_flags, u8 tos) | 669 | int reply_flags) |
| 731 | { | 670 | { |
| 732 | const struct tcphdr *th = tcp_hdr(skb); | 671 | struct tcphdr *th = tcp_hdr(skb); |
| 733 | struct { | 672 | struct { |
| 734 | struct tcphdr th; | 673 | struct tcphdr th; |
| 735 | __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) | 674 | __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) |
| @@ -787,9 +726,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
| 787 | arg.csumoffset = offsetof(struct tcphdr, check) / 2; | 726 | arg.csumoffset = offsetof(struct tcphdr, check) / 2; |
| 788 | if (oif) | 727 | if (oif) |
| 789 | arg.bound_dev_if = oif; | 728 | arg.bound_dev_if = oif; |
| 790 | arg.tos = tos; | 729 | |
| 791 | ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, | 730 | ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, |
| 792 | ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); | 731 | &arg, arg.iov[0].iov_len); |
| 793 | 732 | ||
| 794 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); | 733 | TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); |
| 795 | } | 734 | } |
| @@ -804,8 +743,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) | |||
| 804 | tcptw->tw_ts_recent, | 743 | tcptw->tw_ts_recent, |
| 805 | tw->tw_bound_dev_if, | 744 | tw->tw_bound_dev_if, |
| 806 | tcp_twsk_md5_key(tcptw), | 745 | tcp_twsk_md5_key(tcptw), |
| 807 | tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, | 746 | tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 |
| 808 | tw->tw_tos | ||
| 809 | ); | 747 | ); |
| 810 | 748 | ||
| 811 | inet_twsk_put(tw); | 749 | inet_twsk_put(tw); |
| @@ -814,18 +752,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) | |||
| 814 | static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | 752 | static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, |
| 815 | struct request_sock *req) | 753 | struct request_sock *req) |
| 816 | { | 754 | { |
| 817 | /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV | 755 | tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, |
| 818 | * sk->sk_state == TCP_SYN_RECV -> for Fast Open. | 756 | tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, |
| 819 | */ | ||
| 820 | tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? | ||
| 821 | tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, | ||
| 822 | tcp_rsk(req)->rcv_nxt, req->rcv_wnd, | ||
| 823 | req->ts_recent, | 757 | req->ts_recent, |
| 824 | 0, | 758 | 0, |
| 825 | tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, | 759 | tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), |
| 826 | AF_INET), | 760 | inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); |
| 827 | inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, | ||
| 828 | ip_hdr(skb)->tos); | ||
| 829 | } | 761 | } |
| 830 | 762 | ||
| 831 | /* | 763 | /* |
| @@ -835,9 +767,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
| 835 | */ | 767 | */ |
| 836 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | 768 | static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, |
| 837 | struct request_sock *req, | 769 | struct request_sock *req, |
| 838 | struct request_values *rvp, | 770 | struct request_values *rvp) |
| 839 | u16 queue_mapping, | ||
| 840 | bool nocache) | ||
| 841 | { | 771 | { |
| 842 | const struct inet_request_sock *ireq = inet_rsk(req); | 772 | const struct inet_request_sock *ireq = inet_rsk(req); |
| 843 | struct flowi4 fl4; | 773 | struct flowi4 fl4; |
| @@ -848,31 +778,26 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
| 848 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) | 778 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) |
| 849 | return -1; | 779 | return -1; |
| 850 | 780 | ||
| 851 | skb = tcp_make_synack(sk, dst, req, rvp, NULL); | 781 | skb = tcp_make_synack(sk, dst, req, rvp); |
| 852 | 782 | ||
| 853 | if (skb) { | 783 | if (skb) { |
| 854 | __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); | 784 | __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); |
| 855 | 785 | ||
| 856 | skb_set_queue_mapping(skb, queue_mapping); | ||
| 857 | err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, | 786 | err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, |
| 858 | ireq->rmt_addr, | 787 | ireq->rmt_addr, |
| 859 | ireq->opt); | 788 | ireq->opt); |
| 860 | err = net_xmit_eval(err); | 789 | err = net_xmit_eval(err); |
| 861 | if (!tcp_rsk(req)->snt_synack && !err) | ||
| 862 | tcp_rsk(req)->snt_synack = tcp_time_stamp; | ||
| 863 | } | 790 | } |
| 864 | 791 | ||
| 792 | dst_release(dst); | ||
| 865 | return err; | 793 | return err; |
| 866 | } | 794 | } |
| 867 | 795 | ||
| 868 | static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, | 796 | static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, |
| 869 | struct request_values *rvp) | 797 | struct request_values *rvp) |
| 870 | { | 798 | { |
| 871 | int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); | 799 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); |
| 872 | 800 | return tcp_v4_send_synack(sk, NULL, req, rvp); | |
| 873 | if (!res) | ||
| 874 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); | ||
| 875 | return res; | ||
| 876 | } | 801 | } |
| 877 | 802 | ||
| 878 | /* | 803 | /* |
| @@ -884,14 +809,14 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) | |||
| 884 | } | 809 | } |
| 885 | 810 | ||
| 886 | /* | 811 | /* |
| 887 | * Return true if a syncookie should be sent | 812 | * Return 1 if a syncookie should be sent |
| 888 | */ | 813 | */ |
| 889 | bool tcp_syn_flood_action(struct sock *sk, | 814 | int tcp_syn_flood_action(struct sock *sk, |
| 890 | const struct sk_buff *skb, | 815 | const struct sk_buff *skb, |
| 891 | const char *proto) | 816 | const char *proto) |
| 892 | { | 817 | { |
| 893 | const char *msg = "Dropping request"; | 818 | const char *msg = "Dropping request"; |
| 894 | bool want_cookie = false; | 819 | int want_cookie = 0; |
| 895 | struct listen_sock *lopt; | 820 | struct listen_sock *lopt; |
| 896 | 821 | ||
| 897 | 822 | ||
| @@ -899,7 +824,7 @@ bool tcp_syn_flood_action(struct sock *sk, | |||
| 899 | #ifdef CONFIG_SYN_COOKIES | 824 | #ifdef CONFIG_SYN_COOKIES |
| 900 | if (sysctl_tcp_syncookies) { | 825 | if (sysctl_tcp_syncookies) { |
| 901 | msg = "Sending cookies"; | 826 | msg = "Sending cookies"; |
| 902 | want_cookie = true; | 827 | want_cookie = 1; |
| 903 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); | 828 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); |
| 904 | } else | 829 | } else |
| 905 | #endif | 830 | #endif |
| @@ -908,7 +833,8 @@ bool tcp_syn_flood_action(struct sock *sk, | |||
| 908 | lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; | 833 | lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; |
| 909 | if (!lopt->synflood_warned) { | 834 | if (!lopt->synflood_warned) { |
| 910 | lopt->synflood_warned = 1; | 835 | lopt->synflood_warned = 1; |
| 911 | pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", | 836 | pr_info("%s: Possible SYN flooding on port %d. %s. " |
| 837 | " Check SNMP counters.\n", | ||
| 912 | proto, ntohs(tcp_hdr(skb)->dest), msg); | 838 | proto, ntohs(tcp_hdr(skb)->dest), msg); |
| 913 | } | 839 | } |
| 914 | return want_cookie; | 840 | return want_cookie; |
| @@ -918,7 +844,8 @@ EXPORT_SYMBOL(tcp_syn_flood_action); | |||
| 918 | /* | 844 | /* |
| 919 | * Save and compile IPv4 options into the request_sock if needed. | 845 | * Save and compile IPv4 options into the request_sock if needed. |
| 920 | */ | 846 | */ |
| 921 | static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) | 847 | static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, |
| 848 | struct sk_buff *skb) | ||
| 922 | { | 849 | { |
| 923 | const struct ip_options *opt = &(IPCB(skb)->opt); | 850 | const struct ip_options *opt = &(IPCB(skb)->opt); |
| 924 | struct ip_options_rcu *dopt = NULL; | 851 | struct ip_options_rcu *dopt = NULL; |
| @@ -945,138 +872,153 @@ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) | |||
| 945 | */ | 872 | */ |
| 946 | 873 | ||
| 947 | /* Find the Key structure for an address. */ | 874 | /* Find the Key structure for an address. */ |
| 948 | struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, | 875 | static struct tcp_md5sig_key * |
| 949 | const union tcp_md5_addr *addr, | 876 | tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) |
| 950 | int family) | ||
| 951 | { | 877 | { |
| 952 | struct tcp_sock *tp = tcp_sk(sk); | 878 | struct tcp_sock *tp = tcp_sk(sk); |
| 953 | struct tcp_md5sig_key *key; | 879 | int i; |
| 954 | struct hlist_node *pos; | 880 | |
| 955 | unsigned int size = sizeof(struct in_addr); | 881 | if (!tp->md5sig_info || !tp->md5sig_info->entries4) |
| 956 | struct tcp_md5sig_info *md5sig; | ||
| 957 | |||
| 958 | /* caller either holds rcu_read_lock() or socket lock */ | ||
| 959 | md5sig = rcu_dereference_check(tp->md5sig_info, | ||
| 960 | sock_owned_by_user(sk) || | ||
| 961 | lockdep_is_held(&sk->sk_lock.slock)); | ||
| 962 | if (!md5sig) | ||
| 963 | return NULL; | 882 | return NULL; |
| 964 | #if IS_ENABLED(CONFIG_IPV6) | 883 | for (i = 0; i < tp->md5sig_info->entries4; i++) { |
| 965 | if (family == AF_INET6) | 884 | if (tp->md5sig_info->keys4[i].addr == addr) |
| 966 | size = sizeof(struct in6_addr); | 885 | return &tp->md5sig_info->keys4[i].base; |
| 967 | #endif | ||
| 968 | hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) { | ||
| 969 | if (key->family != family) | ||
| 970 | continue; | ||
| 971 | if (!memcmp(&key->addr, addr, size)) | ||
| 972 | return key; | ||
| 973 | } | 886 | } |
| 974 | return NULL; | 887 | return NULL; |
| 975 | } | 888 | } |
| 976 | EXPORT_SYMBOL(tcp_md5_do_lookup); | ||
| 977 | 889 | ||
| 978 | struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, | 890 | struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, |
| 979 | struct sock *addr_sk) | 891 | struct sock *addr_sk) |
| 980 | { | 892 | { |
| 981 | union tcp_md5_addr *addr; | 893 | return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); |
| 982 | |||
| 983 | addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr; | ||
| 984 | return tcp_md5_do_lookup(sk, addr, AF_INET); | ||
| 985 | } | 894 | } |
| 986 | EXPORT_SYMBOL(tcp_v4_md5_lookup); | 895 | EXPORT_SYMBOL(tcp_v4_md5_lookup); |
| 987 | 896 | ||
| 988 | static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, | 897 | static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, |
| 989 | struct request_sock *req) | 898 | struct request_sock *req) |
| 990 | { | 899 | { |
| 991 | union tcp_md5_addr *addr; | 900 | return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr); |
| 992 | |||
| 993 | addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr; | ||
| 994 | return tcp_md5_do_lookup(sk, addr, AF_INET); | ||
| 995 | } | 901 | } |
| 996 | 902 | ||
| 997 | /* This can be called on a newly created socket, from other files */ | 903 | /* This can be called on a newly created socket, from other files */ |
| 998 | int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, | 904 | int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, |
| 999 | int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) | 905 | u8 *newkey, u8 newkeylen) |
| 1000 | { | 906 | { |
| 1001 | /* Add Key to the list */ | 907 | /* Add Key to the list */ |
| 1002 | struct tcp_md5sig_key *key; | 908 | struct tcp_md5sig_key *key; |
| 1003 | struct tcp_sock *tp = tcp_sk(sk); | 909 | struct tcp_sock *tp = tcp_sk(sk); |
| 1004 | struct tcp_md5sig_info *md5sig; | 910 | struct tcp4_md5sig_key *keys; |
| 1005 | 911 | ||
| 1006 | key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET); | 912 | key = tcp_v4_md5_do_lookup(sk, addr); |
| 1007 | if (key) { | 913 | if (key) { |
| 1008 | /* Pre-existing entry - just update that one. */ | 914 | /* Pre-existing entry - just update that one. */ |
| 1009 | memcpy(key->key, newkey, newkeylen); | 915 | kfree(key->key); |
| 916 | key->key = newkey; | ||
| 1010 | key->keylen = newkeylen; | 917 | key->keylen = newkeylen; |
| 1011 | return 0; | 918 | } else { |
| 1012 | } | 919 | struct tcp_md5sig_info *md5sig; |
| 920 | |||
| 921 | if (!tp->md5sig_info) { | ||
| 922 | tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), | ||
| 923 | GFP_ATOMIC); | ||
| 924 | if (!tp->md5sig_info) { | ||
| 925 | kfree(newkey); | ||
| 926 | return -ENOMEM; | ||
| 927 | } | ||
| 928 | sk_nocaps_add(sk, NETIF_F_GSO_MASK); | ||
| 929 | } | ||
| 1013 | 930 | ||
| 1014 | md5sig = rcu_dereference_protected(tp->md5sig_info, | 931 | md5sig = tp->md5sig_info; |
| 1015 | sock_owned_by_user(sk)); | 932 | if (md5sig->entries4 == 0 && |
| 1016 | if (!md5sig) { | 933 | tcp_alloc_md5sig_pool(sk) == NULL) { |
| 1017 | md5sig = kmalloc(sizeof(*md5sig), gfp); | 934 | kfree(newkey); |
| 1018 | if (!md5sig) | ||
| 1019 | return -ENOMEM; | 935 | return -ENOMEM; |
| 936 | } | ||
| 1020 | 937 | ||
| 1021 | sk_nocaps_add(sk, NETIF_F_GSO_MASK); | 938 | if (md5sig->alloced4 == md5sig->entries4) { |
| 1022 | INIT_HLIST_HEAD(&md5sig->head); | 939 | keys = kmalloc((sizeof(*keys) * |
| 1023 | rcu_assign_pointer(tp->md5sig_info, md5sig); | 940 | (md5sig->entries4 + 1)), GFP_ATOMIC); |
| 1024 | } | 941 | if (!keys) { |
| 942 | kfree(newkey); | ||
| 943 | if (md5sig->entries4 == 0) | ||
| 944 | tcp_free_md5sig_pool(); | ||
| 945 | return -ENOMEM; | ||
| 946 | } | ||
| 1025 | 947 | ||
| 1026 | key = sock_kmalloc(sk, sizeof(*key), gfp); | 948 | if (md5sig->entries4) |
| 1027 | if (!key) | 949 | memcpy(keys, md5sig->keys4, |
| 1028 | return -ENOMEM; | 950 | sizeof(*keys) * md5sig->entries4); |
| 1029 | if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) { | ||
| 1030 | sock_kfree_s(sk, key, sizeof(*key)); | ||
| 1031 | return -ENOMEM; | ||
| 1032 | } | ||
| 1033 | 951 | ||
| 1034 | memcpy(key->key, newkey, newkeylen); | 952 | /* Free old key list, and reference new one */ |
| 1035 | key->keylen = newkeylen; | 953 | kfree(md5sig->keys4); |
| 1036 | key->family = family; | 954 | md5sig->keys4 = keys; |
| 1037 | memcpy(&key->addr, addr, | 955 | md5sig->alloced4++; |
| 1038 | (family == AF_INET6) ? sizeof(struct in6_addr) : | 956 | } |
| 1039 | sizeof(struct in_addr)); | 957 | md5sig->entries4++; |
| 1040 | hlist_add_head_rcu(&key->node, &md5sig->head); | 958 | md5sig->keys4[md5sig->entries4 - 1].addr = addr; |
| 959 | md5sig->keys4[md5sig->entries4 - 1].base.key = newkey; | ||
| 960 | md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen; | ||
| 961 | } | ||
| 1041 | return 0; | 962 | return 0; |
| 1042 | } | 963 | } |
| 1043 | EXPORT_SYMBOL(tcp_md5_do_add); | 964 | EXPORT_SYMBOL(tcp_v4_md5_do_add); |
| 1044 | 965 | ||
| 1045 | int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) | 966 | static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, |
| 967 | u8 *newkey, u8 newkeylen) | ||
| 1046 | { | 968 | { |
| 1047 | struct tcp_sock *tp = tcp_sk(sk); | 969 | return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr, |
| 1048 | struct tcp_md5sig_key *key; | 970 | newkey, newkeylen); |
| 1049 | struct tcp_md5sig_info *md5sig; | ||
| 1050 | |||
| 1051 | key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET); | ||
| 1052 | if (!key) | ||
| 1053 | return -ENOENT; | ||
| 1054 | hlist_del_rcu(&key->node); | ||
| 1055 | atomic_sub(sizeof(*key), &sk->sk_omem_alloc); | ||
| 1056 | kfree_rcu(key, rcu); | ||
| 1057 | md5sig = rcu_dereference_protected(tp->md5sig_info, | ||
| 1058 | sock_owned_by_user(sk)); | ||
| 1059 | if (hlist_empty(&md5sig->head)) | ||
| 1060 | tcp_free_md5sig_pool(); | ||
| 1061 | return 0; | ||
| 1062 | } | 971 | } |
| 1063 | EXPORT_SYMBOL(tcp_md5_do_del); | ||
| 1064 | 972 | ||
| 1065 | static void tcp_clear_md5_list(struct sock *sk) | 973 | int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) |
| 1066 | { | 974 | { |
| 1067 | struct tcp_sock *tp = tcp_sk(sk); | 975 | struct tcp_sock *tp = tcp_sk(sk); |
| 1068 | struct tcp_md5sig_key *key; | 976 | int i; |
| 1069 | struct hlist_node *pos, *n; | 977 | |
| 1070 | struct tcp_md5sig_info *md5sig; | 978 | for (i = 0; i < tp->md5sig_info->entries4; i++) { |
| 979 | if (tp->md5sig_info->keys4[i].addr == addr) { | ||
| 980 | /* Free the key */ | ||
| 981 | kfree(tp->md5sig_info->keys4[i].base.key); | ||
| 982 | tp->md5sig_info->entries4--; | ||
| 983 | |||
| 984 | if (tp->md5sig_info->entries4 == 0) { | ||
| 985 | kfree(tp->md5sig_info->keys4); | ||
| 986 | tp->md5sig_info->keys4 = NULL; | ||
| 987 | tp->md5sig_info->alloced4 = 0; | ||
| 988 | tcp_free_md5sig_pool(); | ||
| 989 | } else if (tp->md5sig_info->entries4 != i) { | ||
| 990 | /* Need to do some manipulation */ | ||
| 991 | memmove(&tp->md5sig_info->keys4[i], | ||
| 992 | &tp->md5sig_info->keys4[i+1], | ||
| 993 | (tp->md5sig_info->entries4 - i) * | ||
| 994 | sizeof(struct tcp4_md5sig_key)); | ||
| 995 | } | ||
| 996 | return 0; | ||
| 997 | } | ||
| 998 | } | ||
| 999 | return -ENOENT; | ||
| 1000 | } | ||
| 1001 | EXPORT_SYMBOL(tcp_v4_md5_do_del); | ||
| 1071 | 1002 | ||
| 1072 | md5sig = rcu_dereference_protected(tp->md5sig_info, 1); | 1003 | static void tcp_v4_clear_md5_list(struct sock *sk) |
| 1004 | { | ||
| 1005 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1073 | 1006 | ||
| 1074 | if (!hlist_empty(&md5sig->head)) | 1007 | /* Free each key, then the set of key keys, |
| 1008 | * the crypto element, and then decrement our | ||
| 1009 | * hold on the last resort crypto. | ||
| 1010 | */ | ||
| 1011 | if (tp->md5sig_info->entries4) { | ||
| 1012 | int i; | ||
| 1013 | for (i = 0; i < tp->md5sig_info->entries4; i++) | ||
| 1014 | kfree(tp->md5sig_info->keys4[i].base.key); | ||
| 1015 | tp->md5sig_info->entries4 = 0; | ||
| 1075 | tcp_free_md5sig_pool(); | 1016 | tcp_free_md5sig_pool(); |
| 1076 | hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) { | 1017 | } |
| 1077 | hlist_del_rcu(&key->node); | 1018 | if (tp->md5sig_info->keys4) { |
| 1078 | atomic_sub(sizeof(*key), &sk->sk_omem_alloc); | 1019 | kfree(tp->md5sig_info->keys4); |
| 1079 | kfree_rcu(key, rcu); | 1020 | tp->md5sig_info->keys4 = NULL; |
| 1021 | tp->md5sig_info->alloced4 = 0; | ||
| 1080 | } | 1022 | } |
| 1081 | } | 1023 | } |
| 1082 | 1024 | ||
| @@ -1085,6 +1027,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, | |||
| 1085 | { | 1027 | { |
| 1086 | struct tcp_md5sig cmd; | 1028 | struct tcp_md5sig cmd; |
| 1087 | struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; | 1029 | struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; |
| 1030 | u8 *newkey; | ||
| 1088 | 1031 | ||
| 1089 | if (optlen < sizeof(cmd)) | 1032 | if (optlen < sizeof(cmd)) |
| 1090 | return -EINVAL; | 1033 | return -EINVAL; |
| @@ -1095,16 +1038,32 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, | |||
| 1095 | if (sin->sin_family != AF_INET) | 1038 | if (sin->sin_family != AF_INET) |
| 1096 | return -EINVAL; | 1039 | return -EINVAL; |
| 1097 | 1040 | ||
| 1098 | if (!cmd.tcpm_key || !cmd.tcpm_keylen) | 1041 | if (!cmd.tcpm_key || !cmd.tcpm_keylen) { |
| 1099 | return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, | 1042 | if (!tcp_sk(sk)->md5sig_info) |
| 1100 | AF_INET); | 1043 | return -ENOENT; |
| 1044 | return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr); | ||
| 1045 | } | ||
| 1101 | 1046 | ||
| 1102 | if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) | 1047 | if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) |
| 1103 | return -EINVAL; | 1048 | return -EINVAL; |
| 1104 | 1049 | ||
| 1105 | return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, | 1050 | if (!tcp_sk(sk)->md5sig_info) { |
| 1106 | AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, | 1051 | struct tcp_sock *tp = tcp_sk(sk); |
| 1107 | GFP_KERNEL); | 1052 | struct tcp_md5sig_info *p; |
| 1053 | |||
| 1054 | p = kzalloc(sizeof(*p), sk->sk_allocation); | ||
| 1055 | if (!p) | ||
| 1056 | return -EINVAL; | ||
| 1057 | |||
| 1058 | tp->md5sig_info = p; | ||
| 1059 | sk_nocaps_add(sk, NETIF_F_GSO_MASK); | ||
| 1060 | } | ||
| 1061 | |||
| 1062 | newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation); | ||
| 1063 | if (!newkey) | ||
| 1064 | return -ENOMEM; | ||
| 1065 | return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr, | ||
| 1066 | newkey, cmd.tcpm_keylen); | ||
| 1108 | } | 1067 | } |
| 1109 | 1068 | ||
| 1110 | static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, | 1069 | static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, |
| @@ -1130,8 +1089,8 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, | |||
| 1130 | return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); | 1089 | return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); |
| 1131 | } | 1090 | } |
| 1132 | 1091 | ||
| 1133 | static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, | 1092 | static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, |
| 1134 | __be32 daddr, __be32 saddr, const struct tcphdr *th) | 1093 | __be32 daddr, __be32 saddr, struct tcphdr *th) |
| 1135 | { | 1094 | { |
| 1136 | struct tcp_md5sig_pool *hp; | 1095 | struct tcp_md5sig_pool *hp; |
| 1137 | struct hash_desc *desc; | 1096 | struct hash_desc *desc; |
| @@ -1163,12 +1122,12 @@ clear_hash_noput: | |||
| 1163 | } | 1122 | } |
| 1164 | 1123 | ||
| 1165 | int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, | 1124 | int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, |
| 1166 | const struct sock *sk, const struct request_sock *req, | 1125 | struct sock *sk, struct request_sock *req, |
| 1167 | const struct sk_buff *skb) | 1126 | struct sk_buff *skb) |
| 1168 | { | 1127 | { |
| 1169 | struct tcp_md5sig_pool *hp; | 1128 | struct tcp_md5sig_pool *hp; |
| 1170 | struct hash_desc *desc; | 1129 | struct hash_desc *desc; |
| 1171 | const struct tcphdr *th = tcp_hdr(skb); | 1130 | struct tcphdr *th = tcp_hdr(skb); |
| 1172 | __be32 saddr, daddr; | 1131 | __be32 saddr, daddr; |
| 1173 | 1132 | ||
| 1174 | if (sk) { | 1133 | if (sk) { |
| @@ -1213,7 +1172,7 @@ clear_hash_noput: | |||
| 1213 | } | 1172 | } |
| 1214 | EXPORT_SYMBOL(tcp_v4_md5_hash_skb); | 1173 | EXPORT_SYMBOL(tcp_v4_md5_hash_skb); |
| 1215 | 1174 | ||
| 1216 | static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) | 1175 | static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) |
| 1217 | { | 1176 | { |
| 1218 | /* | 1177 | /* |
| 1219 | * This gets called for each TCP segment that arrives | 1178 | * This gets called for each TCP segment that arrives |
| @@ -1223,29 +1182,28 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) | |||
| 1223 | * o MD5 hash and we're not expecting one. | 1182 | * o MD5 hash and we're not expecting one. |
| 1224 | * o MD5 hash and its wrong. | 1183 | * o MD5 hash and its wrong. |
| 1225 | */ | 1184 | */ |
| 1226 | const __u8 *hash_location = NULL; | 1185 | __u8 *hash_location = NULL; |
| 1227 | struct tcp_md5sig_key *hash_expected; | 1186 | struct tcp_md5sig_key *hash_expected; |
| 1228 | const struct iphdr *iph = ip_hdr(skb); | 1187 | const struct iphdr *iph = ip_hdr(skb); |
| 1229 | const struct tcphdr *th = tcp_hdr(skb); | 1188 | struct tcphdr *th = tcp_hdr(skb); |
| 1230 | int genhash; | 1189 | int genhash; |
| 1231 | unsigned char newhash[16]; | 1190 | unsigned char newhash[16]; |
| 1232 | 1191 | ||
| 1233 | hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, | 1192 | hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr); |
| 1234 | AF_INET); | ||
| 1235 | hash_location = tcp_parse_md5sig_option(th); | 1193 | hash_location = tcp_parse_md5sig_option(th); |
| 1236 | 1194 | ||
| 1237 | /* We've parsed the options - do we have a hash? */ | 1195 | /* We've parsed the options - do we have a hash? */ |
| 1238 | if (!hash_expected && !hash_location) | 1196 | if (!hash_expected && !hash_location) |
| 1239 | return false; | 1197 | return 0; |
| 1240 | 1198 | ||
| 1241 | if (hash_expected && !hash_location) { | 1199 | if (hash_expected && !hash_location) { |
| 1242 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); | 1200 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); |
| 1243 | return true; | 1201 | return 1; |
| 1244 | } | 1202 | } |
| 1245 | 1203 | ||
| 1246 | if (!hash_expected && hash_location) { | 1204 | if (!hash_expected && hash_location) { |
| 1247 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); | 1205 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); |
| 1248 | return true; | 1206 | return 1; |
| 1249 | } | 1207 | } |
| 1250 | 1208 | ||
| 1251 | /* Okay, so this is hash_expected and hash_location - | 1209 | /* Okay, so this is hash_expected and hash_location - |
| @@ -1256,14 +1214,15 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) | |||
| 1256 | NULL, NULL, skb); | 1214 | NULL, NULL, skb); |
| 1257 | 1215 | ||
| 1258 | if (genhash || memcmp(hash_location, newhash, 16) != 0) { | 1216 | if (genhash || memcmp(hash_location, newhash, 16) != 0) { |
| 1259 | net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", | 1217 | if (net_ratelimit()) { |
| 1260 | &iph->saddr, ntohs(th->source), | 1218 | printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", |
| 1261 | &iph->daddr, ntohs(th->dest), | 1219 | &iph->saddr, ntohs(th->source), |
| 1262 | genhash ? " tcp_v4_calc_md5_hash failed" | 1220 | &iph->daddr, ntohs(th->dest), |
| 1263 | : ""); | 1221 | genhash ? " tcp_v4_calc_md5_hash failed" : ""); |
| 1264 | return true; | 1222 | } |
| 1223 | return 1; | ||
| 1265 | } | 1224 | } |
| 1266 | return false; | 1225 | return 0; |
| 1267 | } | 1226 | } |
| 1268 | 1227 | ||
| 1269 | #endif | 1228 | #endif |
| @@ -1285,189 +1244,11 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { | |||
| 1285 | }; | 1244 | }; |
| 1286 | #endif | 1245 | #endif |
| 1287 | 1246 | ||
| 1288 | static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, | ||
| 1289 | struct request_sock *req, | ||
| 1290 | struct tcp_fastopen_cookie *foc, | ||
| 1291 | struct tcp_fastopen_cookie *valid_foc) | ||
| 1292 | { | ||
| 1293 | bool skip_cookie = false; | ||
| 1294 | struct fastopen_queue *fastopenq; | ||
| 1295 | |||
| 1296 | if (likely(!fastopen_cookie_present(foc))) { | ||
| 1297 | /* See include/net/tcp.h for the meaning of these knobs */ | ||
| 1298 | if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || | ||
| 1299 | ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && | ||
| 1300 | (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) | ||
| 1301 | skip_cookie = true; /* no cookie to validate */ | ||
| 1302 | else | ||
| 1303 | return false; | ||
| 1304 | } | ||
| 1305 | fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; | ||
| 1306 | /* A FO option is present; bump the counter. */ | ||
| 1307 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); | ||
| 1308 | |||
| 1309 | /* Make sure the listener has enabled fastopen, and we don't | ||
| 1310 | * exceed the max # of pending TFO requests allowed before trying | ||
| 1311 | * to validating the cookie in order to avoid burning CPU cycles | ||
| 1312 | * unnecessarily. | ||
| 1313 | * | ||
| 1314 | * XXX (TFO) - The implication of checking the max_qlen before | ||
| 1315 | * processing a cookie request is that clients can't differentiate | ||
| 1316 | * between qlen overflow causing Fast Open to be disabled | ||
| 1317 | * temporarily vs a server not supporting Fast Open at all. | ||
| 1318 | */ | ||
| 1319 | if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || | ||
| 1320 | fastopenq == NULL || fastopenq->max_qlen == 0) | ||
| 1321 | return false; | ||
| 1322 | |||
| 1323 | if (fastopenq->qlen >= fastopenq->max_qlen) { | ||
| 1324 | struct request_sock *req1; | ||
| 1325 | spin_lock(&fastopenq->lock); | ||
| 1326 | req1 = fastopenq->rskq_rst_head; | ||
| 1327 | if ((req1 == NULL) || time_after(req1->expires, jiffies)) { | ||
| 1328 | spin_unlock(&fastopenq->lock); | ||
| 1329 | NET_INC_STATS_BH(sock_net(sk), | ||
| 1330 | LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); | ||
| 1331 | /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/ | ||
| 1332 | foc->len = -1; | ||
| 1333 | return false; | ||
| 1334 | } | ||
| 1335 | fastopenq->rskq_rst_head = req1->dl_next; | ||
| 1336 | fastopenq->qlen--; | ||
| 1337 | spin_unlock(&fastopenq->lock); | ||
| 1338 | reqsk_free(req1); | ||
| 1339 | } | ||
| 1340 | if (skip_cookie) { | ||
| 1341 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
| 1342 | return true; | ||
| 1343 | } | ||
| 1344 | if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { | ||
| 1345 | if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { | ||
| 1346 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | ||
| 1347 | if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || | ||
| 1348 | memcmp(&foc->val[0], &valid_foc->val[0], | ||
| 1349 | TCP_FASTOPEN_COOKIE_SIZE) != 0) | ||
| 1350 | return false; | ||
| 1351 | valid_foc->len = -1; | ||
| 1352 | } | ||
| 1353 | /* Acknowledge the data received from the peer. */ | ||
| 1354 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
| 1355 | return true; | ||
| 1356 | } else if (foc->len == 0) { /* Client requesting a cookie */ | ||
| 1357 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | ||
| 1358 | NET_INC_STATS_BH(sock_net(sk), | ||
| 1359 | LINUX_MIB_TCPFASTOPENCOOKIEREQD); | ||
| 1360 | } else { | ||
| 1361 | /* Client sent a cookie with wrong size. Treat it | ||
| 1362 | * the same as invalid and return a valid one. | ||
| 1363 | */ | ||
| 1364 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | ||
| 1365 | } | ||
| 1366 | return false; | ||
| 1367 | } | ||
| 1368 | |||
| 1369 | static int tcp_v4_conn_req_fastopen(struct sock *sk, | ||
| 1370 | struct sk_buff *skb, | ||
| 1371 | struct sk_buff *skb_synack, | ||
| 1372 | struct request_sock *req, | ||
| 1373 | struct request_values *rvp) | ||
| 1374 | { | ||
| 1375 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 1376 | struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; | ||
| 1377 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
| 1378 | struct sock *child; | ||
| 1379 | int err; | ||
| 1380 | |||
| 1381 | req->num_retrans = 0; | ||
| 1382 | req->num_timeout = 0; | ||
| 1383 | req->sk = NULL; | ||
| 1384 | |||
| 1385 | child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); | ||
| 1386 | if (child == NULL) { | ||
| 1387 | NET_INC_STATS_BH(sock_net(sk), | ||
| 1388 | LINUX_MIB_TCPFASTOPENPASSIVEFAIL); | ||
| 1389 | kfree_skb(skb_synack); | ||
| 1390 | return -1; | ||
| 1391 | } | ||
| 1392 | err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, | ||
| 1393 | ireq->rmt_addr, ireq->opt); | ||
| 1394 | err = net_xmit_eval(err); | ||
| 1395 | if (!err) | ||
| 1396 | tcp_rsk(req)->snt_synack = tcp_time_stamp; | ||
| 1397 | /* XXX (TFO) - is it ok to ignore error and continue? */ | ||
| 1398 | |||
| 1399 | spin_lock(&queue->fastopenq->lock); | ||
| 1400 | queue->fastopenq->qlen++; | ||
| 1401 | spin_unlock(&queue->fastopenq->lock); | ||
| 1402 | |||
| 1403 | /* Initialize the child socket. Have to fix some values to take | ||
| 1404 | * into account the child is a Fast Open socket and is created | ||
| 1405 | * only out of the bits carried in the SYN packet. | ||
| 1406 | */ | ||
| 1407 | tp = tcp_sk(child); | ||
| 1408 | |||
| 1409 | tp->fastopen_rsk = req; | ||
| 1410 | /* Do a hold on the listner sk so that if the listener is being | ||
| 1411 | * closed, the child that has been accepted can live on and still | ||
| 1412 | * access listen_lock. | ||
| 1413 | */ | ||
| 1414 | sock_hold(sk); | ||
| 1415 | tcp_rsk(req)->listener = sk; | ||
| 1416 | |||
| 1417 | /* RFC1323: The window in SYN & SYN/ACK segments is never | ||
| 1418 | * scaled. So correct it appropriately. | ||
| 1419 | */ | ||
| 1420 | tp->snd_wnd = ntohs(tcp_hdr(skb)->window); | ||
| 1421 | |||
| 1422 | /* Activate the retrans timer so that SYNACK can be retransmitted. | ||
| 1423 | * The request socket is not added to the SYN table of the parent | ||
| 1424 | * because it's been added to the accept queue directly. | ||
| 1425 | */ | ||
| 1426 | inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, | ||
| 1427 | TCP_TIMEOUT_INIT, TCP_RTO_MAX); | ||
| 1428 | |||
| 1429 | /* Add the child socket directly into the accept queue */ | ||
| 1430 | inet_csk_reqsk_queue_add(sk, req, child); | ||
| 1431 | |||
| 1432 | /* Now finish processing the fastopen child socket. */ | ||
| 1433 | inet_csk(child)->icsk_af_ops->rebuild_header(child); | ||
| 1434 | tcp_init_congestion_control(child); | ||
| 1435 | tcp_mtup_init(child); | ||
| 1436 | tcp_init_buffer_space(child); | ||
| 1437 | tcp_init_metrics(child); | ||
| 1438 | |||
| 1439 | /* Queue the data carried in the SYN packet. We need to first | ||
| 1440 | * bump skb's refcnt because the caller will attempt to free it. | ||
| 1441 | * | ||
| 1442 | * XXX (TFO) - we honor a zero-payload TFO request for now. | ||
| 1443 | * (Any reason not to?) | ||
| 1444 | */ | ||
| 1445 | if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { | ||
| 1446 | /* Don't queue the skb if there is no payload in SYN. | ||
| 1447 | * XXX (TFO) - How about SYN+FIN? | ||
| 1448 | */ | ||
| 1449 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
| 1450 | } else { | ||
| 1451 | skb = skb_get(skb); | ||
| 1452 | skb_dst_drop(skb); | ||
| 1453 | __skb_pull(skb, tcp_hdr(skb)->doff * 4); | ||
| 1454 | skb_set_owner_r(skb, child); | ||
| 1455 | __skb_queue_tail(&child->sk_receive_queue, skb); | ||
| 1456 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
| 1457 | tp->syn_data_acked = 1; | ||
| 1458 | } | ||
| 1459 | sk->sk_data_ready(sk, 0); | ||
| 1460 | bh_unlock_sock(child); | ||
| 1461 | sock_put(child); | ||
| 1462 | WARN_ON(req->sk == NULL); | ||
| 1463 | return 0; | ||
| 1464 | } | ||
| 1465 | |||
| 1466 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | 1247 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) |
| 1467 | { | 1248 | { |
| 1468 | struct tcp_extend_values tmp_ext; | 1249 | struct tcp_extend_values tmp_ext; |
| 1469 | struct tcp_options_received tmp_opt; | 1250 | struct tcp_options_received tmp_opt; |
| 1470 | const u8 *hash_location; | 1251 | u8 *hash_location; |
| 1471 | struct request_sock *req; | 1252 | struct request_sock *req; |
| 1472 | struct inet_request_sock *ireq; | 1253 | struct inet_request_sock *ireq; |
| 1473 | struct tcp_sock *tp = tcp_sk(sk); | 1254 | struct tcp_sock *tp = tcp_sk(sk); |
| @@ -1475,12 +1256,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1475 | __be32 saddr = ip_hdr(skb)->saddr; | 1256 | __be32 saddr = ip_hdr(skb)->saddr; |
| 1476 | __be32 daddr = ip_hdr(skb)->daddr; | 1257 | __be32 daddr = ip_hdr(skb)->daddr; |
| 1477 | __u32 isn = TCP_SKB_CB(skb)->when; | 1258 | __u32 isn = TCP_SKB_CB(skb)->when; |
| 1478 | bool want_cookie = false; | 1259 | int want_cookie = 0; |
| 1479 | struct flowi4 fl4; | ||
| 1480 | struct tcp_fastopen_cookie foc = { .len = -1 }; | ||
| 1481 | struct tcp_fastopen_cookie valid_foc = { .len = -1 }; | ||
| 1482 | struct sk_buff *skb_synack; | ||
| 1483 | int do_fastopen; | ||
| 1484 | 1260 | ||
| 1485 | /* Never answer to SYNs send to broadcast or multicast */ | 1261 | /* Never answer to SYNs send to broadcast or multicast */ |
| 1486 | if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) | 1262 | if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) |
| @@ -1515,8 +1291,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1515 | tcp_clear_options(&tmp_opt); | 1291 | tcp_clear_options(&tmp_opt); |
| 1516 | tmp_opt.mss_clamp = TCP_MSS_DEFAULT; | 1292 | tmp_opt.mss_clamp = TCP_MSS_DEFAULT; |
| 1517 | tmp_opt.user_mss = tp->rx_opt.user_mss; | 1293 | tmp_opt.user_mss = tp->rx_opt.user_mss; |
| 1518 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, | 1294 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0); |
| 1519 | want_cookie ? NULL : &foc); | ||
| 1520 | 1295 | ||
| 1521 | if (tmp_opt.cookie_plus > 0 && | 1296 | if (tmp_opt.cookie_plus > 0 && |
| 1522 | tmp_opt.saw_tstamp && | 1297 | tmp_opt.saw_tstamp && |
| @@ -1540,7 +1315,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1540 | while (l-- > 0) | 1315 | while (l-- > 0) |
| 1541 | *c++ ^= *hash_location++; | 1316 | *c++ ^= *hash_location++; |
| 1542 | 1317 | ||
| 1543 | want_cookie = false; /* not our kind of cookie */ | 1318 | want_cookie = 0; /* not our kind of cookie */ |
| 1544 | tmp_ext.cookie_out_never = 0; /* false */ | 1319 | tmp_ext.cookie_out_never = 0; /* false */ |
| 1545 | tmp_ext.cookie_plus = tmp_opt.cookie_plus; | 1320 | tmp_ext.cookie_plus = tmp_opt.cookie_plus; |
| 1546 | } else if (!tp->rx_opt.cookie_in_always) { | 1321 | } else if (!tp->rx_opt.cookie_in_always) { |
| @@ -1562,18 +1337,21 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1562 | ireq->loc_addr = daddr; | 1337 | ireq->loc_addr = daddr; |
| 1563 | ireq->rmt_addr = saddr; | 1338 | ireq->rmt_addr = saddr; |
| 1564 | ireq->no_srccheck = inet_sk(sk)->transparent; | 1339 | ireq->no_srccheck = inet_sk(sk)->transparent; |
| 1565 | ireq->opt = tcp_v4_save_options(skb); | 1340 | ireq->opt = tcp_v4_save_options(sk, skb); |
| 1566 | 1341 | ||
| 1567 | if (security_inet_conn_request(sk, skb, req)) | 1342 | if (security_inet_conn_request(sk, skb, req)) |
| 1568 | goto drop_and_free; | 1343 | goto drop_and_free; |
| 1569 | 1344 | ||
| 1570 | if (!want_cookie || tmp_opt.tstamp_ok) | 1345 | if (!want_cookie || tmp_opt.tstamp_ok) |
| 1571 | TCP_ECN_create_request(req, skb); | 1346 | TCP_ECN_create_request(req, tcp_hdr(skb)); |
| 1572 | 1347 | ||
| 1573 | if (want_cookie) { | 1348 | if (want_cookie) { |
| 1574 | isn = cookie_v4_init_sequence(sk, skb, &req->mss); | 1349 | isn = cookie_v4_init_sequence(sk, skb, &req->mss); |
| 1575 | req->cookie_ts = tmp_opt.tstamp_ok; | 1350 | req->cookie_ts = tmp_opt.tstamp_ok; |
| 1576 | } else if (!isn) { | 1351 | } else if (!isn) { |
| 1352 | struct inet_peer *peer = NULL; | ||
| 1353 | struct flowi4 fl4; | ||
| 1354 | |||
| 1577 | /* VJ's idea. We save last timestamp seen | 1355 | /* VJ's idea. We save last timestamp seen |
| 1578 | * from the destination in peer table, when entering | 1356 | * from the destination in peer table, when entering |
| 1579 | * state TIME-WAIT, and check against it before | 1357 | * state TIME-WAIT, and check against it before |
| @@ -1586,8 +1364,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1586 | if (tmp_opt.saw_tstamp && | 1364 | if (tmp_opt.saw_tstamp && |
| 1587 | tcp_death_row.sysctl_tw_recycle && | 1365 | tcp_death_row.sysctl_tw_recycle && |
| 1588 | (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && | 1366 | (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && |
| 1589 | fl4.daddr == saddr) { | 1367 | fl4.daddr == saddr && |
| 1590 | if (!tcp_peer_is_proven(req, dst, true)) { | 1368 | (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { |
| 1369 | inet_peer_refcheck(peer); | ||
| 1370 | if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && | ||
| 1371 | (s32)(peer->tcp_ts - req->ts_recent) > | ||
| 1372 | TCP_PAWS_WINDOW) { | ||
| 1591 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); | 1373 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); |
| 1592 | goto drop_and_release; | 1374 | goto drop_and_release; |
| 1593 | } | 1375 | } |
| @@ -1596,7 +1378,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1596 | else if (!sysctl_tcp_syncookies && | 1378 | else if (!sysctl_tcp_syncookies && |
| 1597 | (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < | 1379 | (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
| 1598 | (sysctl_max_syn_backlog >> 2)) && | 1380 | (sysctl_max_syn_backlog >> 2)) && |
| 1599 | !tcp_peer_is_proven(req, dst, false)) { | 1381 | (!peer || !peer->tcp_ts_stamp) && |
| 1382 | (!dst || !dst_metric(dst, RTAX_RTT))) { | ||
| 1600 | /* Without syncookies last quarter of | 1383 | /* Without syncookies last quarter of |
| 1601 | * backlog is filled with destinations, | 1384 | * backlog is filled with destinations, |
| 1602 | * proven to be alive. | 1385 | * proven to be alive. |
| @@ -1604,7 +1387,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1604 | * to destinations, already remembered | 1387 | * to destinations, already remembered |
| 1605 | * to the moment of synflood. | 1388 | * to the moment of synflood. |
| 1606 | */ | 1389 | */ |
| 1607 | LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), | 1390 | LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n", |
| 1608 | &saddr, ntohs(tcp_hdr(skb)->source)); | 1391 | &saddr, ntohs(tcp_hdr(skb)->source)); |
| 1609 | goto drop_and_release; | 1392 | goto drop_and_release; |
| 1610 | } | 1393 | } |
| @@ -1612,54 +1395,14 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
| 1612 | isn = tcp_v4_init_sequence(skb); | 1395 | isn = tcp_v4_init_sequence(skb); |
| 1613 | } | 1396 | } |
| 1614 | tcp_rsk(req)->snt_isn = isn; | 1397 | tcp_rsk(req)->snt_isn = isn; |
| 1398 | tcp_rsk(req)->snt_synack = tcp_time_stamp; | ||
| 1615 | 1399 | ||
| 1616 | if (dst == NULL) { | 1400 | if (tcp_v4_send_synack(sk, dst, req, |
| 1617 | dst = inet_csk_route_req(sk, &fl4, req); | 1401 | (struct request_values *)&tmp_ext) || |
| 1618 | if (dst == NULL) | 1402 | want_cookie) |
| 1619 | goto drop_and_free; | ||
| 1620 | } | ||
| 1621 | do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc); | ||
| 1622 | |||
| 1623 | /* We don't call tcp_v4_send_synack() directly because we need | ||
| 1624 | * to make sure a child socket can be created successfully before | ||
| 1625 | * sending back synack! | ||
| 1626 | * | ||
| 1627 | * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack() | ||
| 1628 | * (or better yet, call tcp_send_synack() in the child context | ||
| 1629 | * directly, but will have to fix bunch of other code first) | ||
| 1630 | * after syn_recv_sock() except one will need to first fix the | ||
| 1631 | * latter to remove its dependency on the current implementation | ||
| 1632 | * of tcp_v4_send_synack()->tcp_select_initial_window(). | ||
| 1633 | */ | ||
| 1634 | skb_synack = tcp_make_synack(sk, dst, req, | ||
| 1635 | (struct request_values *)&tmp_ext, | ||
| 1636 | fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); | ||
| 1637 | |||
| 1638 | if (skb_synack) { | ||
| 1639 | __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr); | ||
| 1640 | skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb)); | ||
| 1641 | } else | ||
| 1642 | goto drop_and_free; | ||
| 1643 | |||
| 1644 | if (likely(!do_fastopen)) { | ||
| 1645 | int err; | ||
| 1646 | err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, | ||
| 1647 | ireq->rmt_addr, ireq->opt); | ||
| 1648 | err = net_xmit_eval(err); | ||
| 1649 | if (err || want_cookie) | ||
| 1650 | goto drop_and_free; | ||
| 1651 | |||
| 1652 | tcp_rsk(req)->snt_synack = tcp_time_stamp; | ||
| 1653 | tcp_rsk(req)->listener = NULL; | ||
| 1654 | /* Add the request_sock to the SYN table */ | ||
| 1655 | inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); | ||
| 1656 | if (fastopen_cookie_present(&foc) && foc.len != 0) | ||
| 1657 | NET_INC_STATS_BH(sock_net(sk), | ||
| 1658 | LINUX_MIB_TCPFASTOPENPASSIVEFAIL); | ||
| 1659 | } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req, | ||
| 1660 | (struct request_values *)&tmp_ext)) | ||
| 1661 | goto drop_and_free; | 1403 | goto drop_and_free; |
| 1662 | 1404 | ||
| 1405 | inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); | ||
| 1663 | return 0; | 1406 | return 0; |
| 1664 | 1407 | ||
| 1665 | drop_and_release: | 1408 | drop_and_release: |
| @@ -1697,7 +1440,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
| 1697 | goto exit_nonewsk; | 1440 | goto exit_nonewsk; |
| 1698 | 1441 | ||
| 1699 | newsk->sk_gso_type = SKB_GSO_TCPV4; | 1442 | newsk->sk_gso_type = SKB_GSO_TCPV4; |
| 1700 | inet_sk_rx_dst_set(newsk, skb); | ||
| 1701 | 1443 | ||
| 1702 | newtp = tcp_sk(newsk); | 1444 | newtp = tcp_sk(newsk); |
| 1703 | newinet = inet_sk(newsk); | 1445 | newinet = inet_sk(newsk); |
| @@ -1710,19 +1452,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
| 1710 | ireq->opt = NULL; | 1452 | ireq->opt = NULL; |
| 1711 | newinet->mc_index = inet_iif(skb); | 1453 | newinet->mc_index = inet_iif(skb); |
| 1712 | newinet->mc_ttl = ip_hdr(skb)->ttl; | 1454 | newinet->mc_ttl = ip_hdr(skb)->ttl; |
| 1713 | newinet->rcv_tos = ip_hdr(skb)->tos; | ||
| 1714 | inet_csk(newsk)->icsk_ext_hdr_len = 0; | 1455 | inet_csk(newsk)->icsk_ext_hdr_len = 0; |
| 1715 | if (inet_opt) | 1456 | if (inet_opt) |
| 1716 | inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; | 1457 | inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; |
| 1717 | newinet->inet_id = newtp->write_seq ^ jiffies; | 1458 | newinet->inet_id = newtp->write_seq ^ jiffies; |
| 1718 | 1459 | ||
| 1719 | if (!dst) { | 1460 | if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) |
| 1720 | dst = inet_csk_route_child_sock(sk, newsk, req); | 1461 | goto put_and_exit; |
| 1721 | if (!dst) | 1462 | |
| 1722 | goto put_and_exit; | ||
| 1723 | } else { | ||
| 1724 | /* syncookie case : see end of cookie_v4_check() */ | ||
| 1725 | } | ||
| 1726 | sk_setup_caps(newsk, dst); | 1463 | sk_setup_caps(newsk, dst); |
| 1727 | 1464 | ||
| 1728 | tcp_mtup_init(newsk); | 1465 | tcp_mtup_init(newsk); |
| @@ -1733,13 +1470,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
| 1733 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; | 1470 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; |
| 1734 | 1471 | ||
| 1735 | tcp_initialize_rcv_mss(newsk); | 1472 | tcp_initialize_rcv_mss(newsk); |
| 1736 | tcp_synack_rtt_meas(newsk, req); | 1473 | if (tcp_rsk(req)->snt_synack) |
| 1737 | newtp->total_retrans = req->num_retrans; | 1474 | tcp_valid_rtt_meas(newsk, |
| 1475 | tcp_time_stamp - tcp_rsk(req)->snt_synack); | ||
| 1476 | newtp->total_retrans = req->retrans; | ||
| 1738 | 1477 | ||
| 1739 | #ifdef CONFIG_TCP_MD5SIG | 1478 | #ifdef CONFIG_TCP_MD5SIG |
| 1740 | /* Copy over the MD5 key from the original socket */ | 1479 | /* Copy over the MD5 key from the original socket */ |
| 1741 | key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, | 1480 | key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr); |
| 1742 | AF_INET); | ||
| 1743 | if (key != NULL) { | 1481 | if (key != NULL) { |
| 1744 | /* | 1482 | /* |
| 1745 | * We're using one, so create a matching key | 1483 | * We're using one, so create a matching key |
| @@ -1747,8 +1485,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
| 1747 | * memory, then we end up not copying the key | 1485 | * memory, then we end up not copying the key |
| 1748 | * across. Shucks. | 1486 | * across. Shucks. |
| 1749 | */ | 1487 | */ |
| 1750 | tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, | 1488 | char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); |
| 1751 | AF_INET, key->key, key->keylen, GFP_ATOMIC); | 1489 | if (newkey != NULL) |
| 1490 | tcp_v4_md5_do_add(newsk, newinet->inet_daddr, | ||
| 1491 | newkey, key->keylen); | ||
| 1752 | sk_nocaps_add(newsk, NETIF_F_GSO_MASK); | 1492 | sk_nocaps_add(newsk, NETIF_F_GSO_MASK); |
| 1753 | } | 1493 | } |
| 1754 | #endif | 1494 | #endif |
| @@ -1767,8 +1507,7 @@ exit: | |||
| 1767 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); | 1507 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); |
| 1768 | return NULL; | 1508 | return NULL; |
| 1769 | put_and_exit: | 1509 | put_and_exit: |
| 1770 | inet_csk_prepare_forced_close(newsk); | 1510 | sock_put(newsk); |
| 1771 | tcp_done(newsk); | ||
| 1772 | goto exit; | 1511 | goto exit; |
| 1773 | } | 1512 | } |
| 1774 | EXPORT_SYMBOL(tcp_v4_syn_recv_sock); | 1513 | EXPORT_SYMBOL(tcp_v4_syn_recv_sock); |
| @@ -1783,7 +1522,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) | |||
| 1783 | struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, | 1522 | struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, |
| 1784 | iph->saddr, iph->daddr); | 1523 | iph->saddr, iph->daddr); |
| 1785 | if (req) | 1524 | if (req) |
| 1786 | return tcp_check_req(sk, skb, req, prev, false); | 1525 | return tcp_check_req(sk, skb, req, prev); |
| 1787 | 1526 | ||
| 1788 | nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, | 1527 | nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, |
| 1789 | th->source, iph->daddr, th->dest, inet_iif(skb)); | 1528 | th->source, iph->daddr, th->dest, inet_iif(skb)); |
| @@ -1849,16 +1588,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
| 1849 | #endif | 1588 | #endif |
| 1850 | 1589 | ||
| 1851 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ | 1590 | if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ |
| 1852 | struct dst_entry *dst = sk->sk_rx_dst; | 1591 | sock_rps_save_rxhash(sk, skb->rxhash); |
| 1853 | |||
| 1854 | sock_rps_save_rxhash(sk, skb); | ||
| 1855 | if (dst) { | ||
| 1856 | if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || | ||
| 1857 | dst->ops->check(dst, 0) == NULL) { | ||
| 1858 | dst_release(dst); | ||
| 1859 | sk->sk_rx_dst = NULL; | ||
| 1860 | } | ||
| 1861 | } | ||
| 1862 | if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { | 1592 | if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { |
| 1863 | rsk = sk; | 1593 | rsk = sk; |
| 1864 | goto reset; | 1594 | goto reset; |
| @@ -1875,7 +1605,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
| 1875 | goto discard; | 1605 | goto discard; |
| 1876 | 1606 | ||
| 1877 | if (nsk != sk) { | 1607 | if (nsk != sk) { |
| 1878 | sock_rps_save_rxhash(nsk, skb); | 1608 | sock_rps_save_rxhash(nsk, skb->rxhash); |
| 1879 | if (tcp_child_process(sk, nsk, skb)) { | 1609 | if (tcp_child_process(sk, nsk, skb)) { |
| 1880 | rsk = nsk; | 1610 | rsk = nsk; |
| 1881 | goto reset; | 1611 | goto reset; |
| @@ -1883,7 +1613,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) | |||
| 1883 | return 0; | 1613 | return 0; |
| 1884 | } | 1614 | } |
| 1885 | } else | 1615 | } else |
| 1886 | sock_rps_save_rxhash(sk, skb); | 1616 | sock_rps_save_rxhash(sk, skb->rxhash); |
| 1887 | 1617 | ||
| 1888 | if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { | 1618 | if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { |
| 1889 | rsk = sk; | 1619 | rsk = sk; |
| @@ -1908,43 +1638,6 @@ csum_err: | |||
| 1908 | } | 1638 | } |
| 1909 | EXPORT_SYMBOL(tcp_v4_do_rcv); | 1639 | EXPORT_SYMBOL(tcp_v4_do_rcv); |
| 1910 | 1640 | ||
| 1911 | void tcp_v4_early_demux(struct sk_buff *skb) | ||
| 1912 | { | ||
| 1913 | const struct iphdr *iph; | ||
| 1914 | const struct tcphdr *th; | ||
| 1915 | struct sock *sk; | ||
| 1916 | |||
| 1917 | if (skb->pkt_type != PACKET_HOST) | ||
| 1918 | return; | ||
| 1919 | |||
| 1920 | if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) | ||
| 1921 | return; | ||
| 1922 | |||
| 1923 | iph = ip_hdr(skb); | ||
| 1924 | th = tcp_hdr(skb); | ||
| 1925 | |||
| 1926 | if (th->doff < sizeof(struct tcphdr) / 4) | ||
| 1927 | return; | ||
| 1928 | |||
| 1929 | sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, | ||
| 1930 | iph->saddr, th->source, | ||
| 1931 | iph->daddr, ntohs(th->dest), | ||
| 1932 | skb->skb_iif); | ||
| 1933 | if (sk) { | ||
| 1934 | skb->sk = sk; | ||
| 1935 | skb->destructor = sock_edemux; | ||
| 1936 | if (sk->sk_state != TCP_TIME_WAIT) { | ||
| 1937 | struct dst_entry *dst = sk->sk_rx_dst; | ||
| 1938 | |||
| 1939 | if (dst) | ||
| 1940 | dst = dst_check(dst, 0); | ||
| 1941 | if (dst && | ||
| 1942 | inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) | ||
| 1943 | skb_dst_set_noref(skb, dst); | ||
| 1944 | } | ||
| 1945 | } | ||
| 1946 | } | ||
| 1947 | |||
| 1948 | /* | 1641 | /* |
| 1949 | * From tcp_input.c | 1642 | * From tcp_input.c |
| 1950 | */ | 1643 | */ |
| @@ -1952,7 +1645,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) | |||
| 1952 | int tcp_v4_rcv(struct sk_buff *skb) | 1645 | int tcp_v4_rcv(struct sk_buff *skb) |
| 1953 | { | 1646 | { |
| 1954 | const struct iphdr *iph; | 1647 | const struct iphdr *iph; |
| 1955 | const struct tcphdr *th; | 1648 | struct tcphdr *th; |
| 1956 | struct sock *sk; | 1649 | struct sock *sk; |
| 1957 | int ret; | 1650 | int ret; |
| 1958 | struct net *net = dev_net(skb->dev); | 1651 | struct net *net = dev_net(skb->dev); |
| @@ -1987,7 +1680,7 @@ int tcp_v4_rcv(struct sk_buff *skb) | |||
| 1987 | skb->len - th->doff * 4); | 1680 | skb->len - th->doff * 4); |
| 1988 | TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); | 1681 | TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); |
| 1989 | TCP_SKB_CB(skb)->when = 0; | 1682 | TCP_SKB_CB(skb)->when = 0; |
| 1990 | TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); | 1683 | TCP_SKB_CB(skb)->flags = iph->tos; |
| 1991 | TCP_SKB_CB(skb)->sacked = 0; | 1684 | TCP_SKB_CB(skb)->sacked = 0; |
| 1992 | 1685 | ||
| 1993 | sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); | 1686 | sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); |
| @@ -2018,7 +1711,7 @@ process: | |||
| 2018 | #ifdef CONFIG_NET_DMA | 1711 | #ifdef CONFIG_NET_DMA |
| 2019 | struct tcp_sock *tp = tcp_sk(sk); | 1712 | struct tcp_sock *tp = tcp_sk(sk); |
| 2020 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | 1713 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
| 2021 | tp->ucopy.dma_chan = net_dma_find_channel(); | 1714 | tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); |
| 2022 | if (tp->ucopy.dma_chan) | 1715 | if (tp->ucopy.dma_chan) |
| 2023 | ret = tcp_v4_do_rcv(sk, skb); | 1716 | ret = tcp_v4_do_rcv(sk, skb); |
| 2024 | else | 1717 | else |
| @@ -2027,8 +1720,7 @@ process: | |||
| 2027 | if (!tcp_prequeue(sk, skb)) | 1720 | if (!tcp_prequeue(sk, skb)) |
| 2028 | ret = tcp_v4_do_rcv(sk, skb); | 1721 | ret = tcp_v4_do_rcv(sk, skb); |
| 2029 | } | 1722 | } |
| 2030 | } else if (unlikely(sk_add_backlog(sk, skb, | 1723 | } else if (unlikely(sk_add_backlog(sk, skb))) { |
| 2031 | sk->sk_rcvbuf + sk->sk_sndbuf))) { | ||
| 2032 | bh_unlock_sock(sk); | 1724 | bh_unlock_sock(sk); |
| 2033 | NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); | 1725 | NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); |
| 2034 | goto discard_and_relse; | 1726 | goto discard_and_relse; |
| @@ -2094,29 +1786,49 @@ do_time_wait: | |||
| 2094 | goto discard_it; | 1786 | goto discard_it; |
| 2095 | } | 1787 | } |
| 2096 | 1788 | ||
| 1789 | struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) | ||
| 1790 | { | ||
| 1791 | struct rtable *rt = (struct rtable *) __sk_dst_get(sk); | ||
| 1792 | struct inet_sock *inet = inet_sk(sk); | ||
| 1793 | struct inet_peer *peer; | ||
| 1794 | |||
| 1795 | if (!rt || | ||
| 1796 | inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { | ||
| 1797 | peer = inet_getpeer_v4(inet->inet_daddr, 1); | ||
| 1798 | *release_it = true; | ||
| 1799 | } else { | ||
| 1800 | if (!rt->peer) | ||
| 1801 | rt_bind_peer(rt, inet->inet_daddr, 1); | ||
| 1802 | peer = rt->peer; | ||
| 1803 | *release_it = false; | ||
| 1804 | } | ||
| 1805 | |||
| 1806 | return peer; | ||
| 1807 | } | ||
| 1808 | EXPORT_SYMBOL(tcp_v4_get_peer); | ||
| 1809 | |||
| 1810 | void *tcp_v4_tw_get_peer(struct sock *sk) | ||
| 1811 | { | ||
| 1812 | struct inet_timewait_sock *tw = inet_twsk(sk); | ||
| 1813 | |||
| 1814 | return inet_getpeer_v4(tw->tw_daddr, 1); | ||
| 1815 | } | ||
| 1816 | EXPORT_SYMBOL(tcp_v4_tw_get_peer); | ||
| 1817 | |||
| 2097 | static struct timewait_sock_ops tcp_timewait_sock_ops = { | 1818 | static struct timewait_sock_ops tcp_timewait_sock_ops = { |
| 2098 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), | 1819 | .twsk_obj_size = sizeof(struct tcp_timewait_sock), |
| 2099 | .twsk_unique = tcp_twsk_unique, | 1820 | .twsk_unique = tcp_twsk_unique, |
| 2100 | .twsk_destructor= tcp_twsk_destructor, | 1821 | .twsk_destructor= tcp_twsk_destructor, |
| 1822 | .twsk_getpeer = tcp_v4_tw_get_peer, | ||
| 2101 | }; | 1823 | }; |
| 2102 | 1824 | ||
| 2103 | void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) | ||
| 2104 | { | ||
| 2105 | struct dst_entry *dst = skb_dst(skb); | ||
| 2106 | |||
| 2107 | dst_hold(dst); | ||
| 2108 | sk->sk_rx_dst = dst; | ||
| 2109 | inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; | ||
| 2110 | } | ||
| 2111 | EXPORT_SYMBOL(inet_sk_rx_dst_set); | ||
| 2112 | |||
| 2113 | const struct inet_connection_sock_af_ops ipv4_specific = { | 1825 | const struct inet_connection_sock_af_ops ipv4_specific = { |
| 2114 | .queue_xmit = ip_queue_xmit, | 1826 | .queue_xmit = ip_queue_xmit, |
| 2115 | .send_check = tcp_v4_send_check, | 1827 | .send_check = tcp_v4_send_check, |
| 2116 | .rebuild_header = inet_sk_rebuild_header, | 1828 | .rebuild_header = inet_sk_rebuild_header, |
| 2117 | .sk_rx_dst_set = inet_sk_rx_dst_set, | ||
| 2118 | .conn_request = tcp_v4_conn_request, | 1829 | .conn_request = tcp_v4_conn_request, |
| 2119 | .syn_recv_sock = tcp_v4_syn_recv_sock, | 1830 | .syn_recv_sock = tcp_v4_syn_recv_sock, |
| 1831 | .get_peer = tcp_v4_get_peer, | ||
| 2120 | .net_header_len = sizeof(struct iphdr), | 1832 | .net_header_len = sizeof(struct iphdr), |
| 2121 | .setsockopt = ip_setsockopt, | 1833 | .setsockopt = ip_setsockopt, |
| 2122 | .getsockopt = ip_getsockopt, | 1834 | .getsockopt = ip_getsockopt, |
| @@ -2134,6 +1846,7 @@ EXPORT_SYMBOL(ipv4_specific); | |||
| 2134 | static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { | 1846 | static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { |
| 2135 | .md5_lookup = tcp_v4_md5_lookup, | 1847 | .md5_lookup = tcp_v4_md5_lookup, |
| 2136 | .calc_md5_hash = tcp_v4_md5_hash_skb, | 1848 | .calc_md5_hash = tcp_v4_md5_hash_skb, |
| 1849 | .md5_add = tcp_v4_md5_add_func, | ||
| 2137 | .md5_parse = tcp_v4_parse_md5_keys, | 1850 | .md5_parse = tcp_v4_parse_md5_keys, |
| 2138 | }; | 1851 | }; |
| 2139 | #endif | 1852 | #endif |
| @@ -2144,15 +1857,63 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { | |||
| 2144 | static int tcp_v4_init_sock(struct sock *sk) | 1857 | static int tcp_v4_init_sock(struct sock *sk) |
| 2145 | { | 1858 | { |
| 2146 | struct inet_connection_sock *icsk = inet_csk(sk); | 1859 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 1860 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2147 | 1861 | ||
| 2148 | tcp_init_sock(sk); | 1862 | skb_queue_head_init(&tp->out_of_order_queue); |
| 1863 | tcp_init_xmit_timers(sk); | ||
| 1864 | tcp_prequeue_init(tp); | ||
| 2149 | 1865 | ||
| 2150 | icsk->icsk_af_ops = &ipv4_specific; | 1866 | icsk->icsk_rto = TCP_TIMEOUT_INIT; |
| 1867 | tp->mdev = TCP_TIMEOUT_INIT; | ||
| 1868 | |||
| 1869 | /* So many TCP implementations out there (incorrectly) count the | ||
| 1870 | * initial SYN frame in their delayed-ACK and congestion control | ||
| 1871 | * algorithms that we must have the following bandaid to talk | ||
| 1872 | * efficiently to them. -DaveM | ||
| 1873 | */ | ||
| 1874 | tp->snd_cwnd = TCP_INIT_CWND; | ||
| 1875 | |||
| 1876 | /* See draft-stevens-tcpca-spec-01 for discussion of the | ||
| 1877 | * initialization of these values. | ||
| 1878 | */ | ||
| 1879 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
| 1880 | tp->snd_cwnd_clamp = ~0; | ||
| 1881 | tp->mss_cache = TCP_MSS_DEFAULT; | ||
| 1882 | |||
| 1883 | tp->reordering = sysctl_tcp_reordering; | ||
| 1884 | icsk->icsk_ca_ops = &tcp_init_congestion_ops; | ||
| 2151 | 1885 | ||
| 1886 | sk->sk_state = TCP_CLOSE; | ||
| 1887 | |||
| 1888 | sk->sk_write_space = sk_stream_write_space; | ||
| 1889 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); | ||
| 1890 | |||
| 1891 | icsk->icsk_af_ops = &ipv4_specific; | ||
| 1892 | icsk->icsk_sync_mss = tcp_sync_mss; | ||
| 2152 | #ifdef CONFIG_TCP_MD5SIG | 1893 | #ifdef CONFIG_TCP_MD5SIG |
| 2153 | tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; | 1894 | tp->af_specific = &tcp_sock_ipv4_specific; |
| 2154 | #endif | 1895 | #endif |
| 2155 | 1896 | ||
| 1897 | /* TCP Cookie Transactions */ | ||
| 1898 | if (sysctl_tcp_cookie_size > 0) { | ||
| 1899 | /* Default, cookies without s_data_payload. */ | ||
| 1900 | tp->cookie_values = | ||
| 1901 | kzalloc(sizeof(*tp->cookie_values), | ||
| 1902 | sk->sk_allocation); | ||
| 1903 | if (tp->cookie_values != NULL) | ||
| 1904 | kref_init(&tp->cookie_values->kref); | ||
| 1905 | } | ||
| 1906 | /* Presumed zeroed, in order of appearance: | ||
| 1907 | * cookie_in_always, cookie_out_never, | ||
| 1908 | * s_data_constant, s_data_in, s_data_out | ||
| 1909 | */ | ||
| 1910 | sk->sk_sndbuf = sysctl_tcp_wmem[1]; | ||
| 1911 | sk->sk_rcvbuf = sysctl_tcp_rmem[1]; | ||
| 1912 | |||
| 1913 | local_bh_disable(); | ||
| 1914 | percpu_counter_inc(&tcp_sockets_allocated); | ||
| 1915 | local_bh_enable(); | ||
| 1916 | |||
| 2156 | return 0; | 1917 | return 0; |
| 2157 | } | 1918 | } |
| 2158 | 1919 | ||
| @@ -2173,8 +1934,8 @@ void tcp_v4_destroy_sock(struct sock *sk) | |||
| 2173 | #ifdef CONFIG_TCP_MD5SIG | 1934 | #ifdef CONFIG_TCP_MD5SIG |
| 2174 | /* Clean up the MD5 key list, if any */ | 1935 | /* Clean up the MD5 key list, if any */ |
| 2175 | if (tp->md5sig_info) { | 1936 | if (tp->md5sig_info) { |
| 2176 | tcp_clear_md5_list(sk); | 1937 | tcp_v4_clear_md5_list(sk); |
| 2177 | kfree_rcu(tp->md5sig_info, rcu); | 1938 | kfree(tp->md5sig_info); |
| 2178 | tp->md5sig_info = NULL; | 1939 | tp->md5sig_info = NULL; |
| 2179 | } | 1940 | } |
| 2180 | #endif | 1941 | #endif |
| @@ -2191,19 +1952,22 @@ void tcp_v4_destroy_sock(struct sock *sk) | |||
| 2191 | if (inet_csk(sk)->icsk_bind_hash) | 1952 | if (inet_csk(sk)->icsk_bind_hash) |
| 2192 | inet_put_port(sk); | 1953 | inet_put_port(sk); |
| 2193 | 1954 | ||
| 1955 | /* | ||
| 1956 | * If sendmsg cached page exists, toss it. | ||
| 1957 | */ | ||
| 1958 | if (sk->sk_sndmsg_page) { | ||
| 1959 | __free_page(sk->sk_sndmsg_page); | ||
| 1960 | sk->sk_sndmsg_page = NULL; | ||
| 1961 | } | ||
| 1962 | |||
| 2194 | /* TCP Cookie Transactions */ | 1963 | /* TCP Cookie Transactions */ |
| 2195 | if (tp->cookie_values != NULL) { | 1964 | if (tp->cookie_values != NULL) { |
| 2196 | kref_put(&tp->cookie_values->kref, | 1965 | kref_put(&tp->cookie_values->kref, |
| 2197 | tcp_cookie_values_release); | 1966 | tcp_cookie_values_release); |
| 2198 | tp->cookie_values = NULL; | 1967 | tp->cookie_values = NULL; |
| 2199 | } | 1968 | } |
| 2200 | BUG_ON(tp->fastopen_rsk != NULL); | ||
| 2201 | 1969 | ||
| 2202 | /* If socket is aborted during connect operation */ | 1970 | percpu_counter_dec(&tcp_sockets_allocated); |
| 2203 | tcp_free_fastopen_req(tp); | ||
| 2204 | |||
| 2205 | sk_sockets_allocated_dec(sk); | ||
| 2206 | sock_release_memcg(sk); | ||
| 2207 | } | 1971 | } |
| 2208 | EXPORT_SYMBOL(tcp_v4_destroy_sock); | 1972 | EXPORT_SYMBOL(tcp_v4_destroy_sock); |
| 2209 | 1973 | ||
| @@ -2325,7 +2089,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) | |||
| 2325 | return rc; | 2089 | return rc; |
| 2326 | } | 2090 | } |
| 2327 | 2091 | ||
| 2328 | static inline bool empty_bucket(struct tcp_iter_state *st) | 2092 | static inline int empty_bucket(struct tcp_iter_state *st) |
| 2329 | { | 2093 | { |
| 2330 | return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && | 2094 | return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && |
| 2331 | hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); | 2095 | hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); |
| @@ -2572,7 +2336,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) | |||
| 2572 | } | 2336 | } |
| 2573 | } | 2337 | } |
| 2574 | 2338 | ||
| 2575 | int tcp_seq_open(struct inode *inode, struct file *file) | 2339 | static int tcp_seq_open(struct inode *inode, struct file *file) |
| 2576 | { | 2340 | { |
| 2577 | struct tcp_seq_afinfo *afinfo = PDE(inode)->data; | 2341 | struct tcp_seq_afinfo *afinfo = PDE(inode)->data; |
| 2578 | struct tcp_iter_state *s; | 2342 | struct tcp_iter_state *s; |
| @@ -2588,19 +2352,23 @@ int tcp_seq_open(struct inode *inode, struct file *file) | |||
| 2588 | s->last_pos = 0; | 2352 | s->last_pos = 0; |
| 2589 | return 0; | 2353 | return 0; |
| 2590 | } | 2354 | } |
| 2591 | EXPORT_SYMBOL(tcp_seq_open); | ||
| 2592 | 2355 | ||
| 2593 | int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) | 2356 | int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) |
| 2594 | { | 2357 | { |
| 2595 | int rc = 0; | 2358 | int rc = 0; |
| 2596 | struct proc_dir_entry *p; | 2359 | struct proc_dir_entry *p; |
| 2597 | 2360 | ||
| 2361 | afinfo->seq_fops.open = tcp_seq_open; | ||
| 2362 | afinfo->seq_fops.read = seq_read; | ||
| 2363 | afinfo->seq_fops.llseek = seq_lseek; | ||
| 2364 | afinfo->seq_fops.release = seq_release_net; | ||
| 2365 | |||
| 2598 | afinfo->seq_ops.start = tcp_seq_start; | 2366 | afinfo->seq_ops.start = tcp_seq_start; |
| 2599 | afinfo->seq_ops.next = tcp_seq_next; | 2367 | afinfo->seq_ops.next = tcp_seq_next; |
| 2600 | afinfo->seq_ops.stop = tcp_seq_stop; | 2368 | afinfo->seq_ops.stop = tcp_seq_stop; |
| 2601 | 2369 | ||
| 2602 | p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, | 2370 | p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, |
| 2603 | afinfo->seq_fops, afinfo); | 2371 | &afinfo->seq_fops, afinfo); |
| 2604 | if (!p) | 2372 | if (!p) |
| 2605 | rc = -ENOMEM; | 2373 | rc = -ENOMEM; |
| 2606 | return rc; | 2374 | return rc; |
| @@ -2613,11 +2381,11 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) | |||
| 2613 | } | 2381 | } |
| 2614 | EXPORT_SYMBOL(tcp_proc_unregister); | 2382 | EXPORT_SYMBOL(tcp_proc_unregister); |
| 2615 | 2383 | ||
| 2616 | static void get_openreq4(const struct sock *sk, const struct request_sock *req, | 2384 | static void get_openreq4(struct sock *sk, struct request_sock *req, |
| 2617 | struct seq_file *f, int i, kuid_t uid, int *len) | 2385 | struct seq_file *f, int i, int uid, int *len) |
| 2618 | { | 2386 | { |
| 2619 | const struct inet_request_sock *ireq = inet_rsk(req); | 2387 | const struct inet_request_sock *ireq = inet_rsk(req); |
| 2620 | long delta = req->expires - jiffies; | 2388 | int ttd = req->expires - jiffies; |
| 2621 | 2389 | ||
| 2622 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" | 2390 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" |
| 2623 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", | 2391 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", |
| @@ -2629,9 +2397,9 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, | |||
| 2629 | TCP_SYN_RECV, | 2397 | TCP_SYN_RECV, |
| 2630 | 0, 0, /* could print option size, but that is af dependent. */ | 2398 | 0, 0, /* could print option size, but that is af dependent. */ |
| 2631 | 1, /* timers active (only the expire timer) */ | 2399 | 1, /* timers active (only the expire timer) */ |
| 2632 | jiffies_delta_to_clock_t(delta), | 2400 | jiffies_to_clock_t(ttd), |
| 2633 | req->num_timeout, | 2401 | req->retrans, |
| 2634 | from_kuid_munged(seq_user_ns(f), uid), | 2402 | uid, |
| 2635 | 0, /* non standard timer */ | 2403 | 0, /* non standard timer */ |
| 2636 | 0, /* open_requests have no inode */ | 2404 | 0, /* open_requests have no inode */ |
| 2637 | atomic_read(&sk->sk_refcnt), | 2405 | atomic_read(&sk->sk_refcnt), |
| @@ -2643,10 +2411,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
| 2643 | { | 2411 | { |
| 2644 | int timer_active; | 2412 | int timer_active; |
| 2645 | unsigned long timer_expires; | 2413 | unsigned long timer_expires; |
| 2646 | const struct tcp_sock *tp = tcp_sk(sk); | 2414 | struct tcp_sock *tp = tcp_sk(sk); |
| 2647 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2415 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 2648 | const struct inet_sock *inet = inet_sk(sk); | 2416 | struct inet_sock *inet = inet_sk(sk); |
| 2649 | struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; | ||
| 2650 | __be32 dest = inet->inet_daddr; | 2417 | __be32 dest = inet->inet_daddr; |
| 2651 | __be32 src = inet->inet_rcv_saddr; | 2418 | __be32 src = inet->inet_rcv_saddr; |
| 2652 | __u16 destp = ntohs(inet->inet_dport); | 2419 | __u16 destp = ntohs(inet->inet_dport); |
| @@ -2681,9 +2448,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
| 2681 | tp->write_seq - tp->snd_una, | 2448 | tp->write_seq - tp->snd_una, |
| 2682 | rx_queue, | 2449 | rx_queue, |
| 2683 | timer_active, | 2450 | timer_active, |
| 2684 | jiffies_delta_to_clock_t(timer_expires - jiffies), | 2451 | jiffies_to_clock_t(timer_expires - jiffies), |
| 2685 | icsk->icsk_retransmits, | 2452 | icsk->icsk_retransmits, |
| 2686 | from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), | 2453 | sock_i_uid(sk), |
| 2687 | icsk->icsk_probes_out, | 2454 | icsk->icsk_probes_out, |
| 2688 | sock_i_ino(sk), | 2455 | sock_i_ino(sk), |
| 2689 | atomic_read(&sk->sk_refcnt), sk, | 2456 | atomic_read(&sk->sk_refcnt), sk, |
| @@ -2691,18 +2458,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
| 2691 | jiffies_to_clock_t(icsk->icsk_ack.ato), | 2458 | jiffies_to_clock_t(icsk->icsk_ack.ato), |
| 2692 | (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, | 2459 | (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, |
| 2693 | tp->snd_cwnd, | 2460 | tp->snd_cwnd, |
| 2694 | sk->sk_state == TCP_LISTEN ? | 2461 | tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, |
| 2695 | (fastopenq ? fastopenq->max_qlen : 0) : | ||
| 2696 | (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh), | ||
| 2697 | len); | 2462 | len); |
| 2698 | } | 2463 | } |
| 2699 | 2464 | ||
| 2700 | static void get_timewait4_sock(const struct inet_timewait_sock *tw, | 2465 | static void get_timewait4_sock(struct inet_timewait_sock *tw, |
| 2701 | struct seq_file *f, int i, int *len) | 2466 | struct seq_file *f, int i, int *len) |
| 2702 | { | 2467 | { |
| 2703 | __be32 dest, src; | 2468 | __be32 dest, src; |
| 2704 | __u16 destp, srcp; | 2469 | __u16 destp, srcp; |
| 2705 | long delta = tw->tw_ttd - jiffies; | 2470 | int ttd = tw->tw_ttd - jiffies; |
| 2471 | |||
| 2472 | if (ttd < 0) | ||
| 2473 | ttd = 0; | ||
| 2706 | 2474 | ||
| 2707 | dest = tw->tw_daddr; | 2475 | dest = tw->tw_daddr; |
| 2708 | src = tw->tw_rcv_saddr; | 2476 | src = tw->tw_rcv_saddr; |
| @@ -2712,7 +2480,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, | |||
| 2712 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" | 2480 | seq_printf(f, "%4d: %08X:%04X %08X:%04X" |
| 2713 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", | 2481 | " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", |
| 2714 | i, src, srcp, dest, destp, tw->tw_substate, 0, 0, | 2482 | i, src, srcp, dest, destp, tw->tw_substate, 0, 0, |
| 2715 | 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, | 2483 | 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, |
| 2716 | atomic_read(&tw->tw_refcnt), tw, len); | 2484 | atomic_read(&tw->tw_refcnt), tw, len); |
| 2717 | } | 2485 | } |
| 2718 | 2486 | ||
| @@ -2749,18 +2517,12 @@ out: | |||
| 2749 | return 0; | 2517 | return 0; |
| 2750 | } | 2518 | } |
| 2751 | 2519 | ||
| 2752 | static const struct file_operations tcp_afinfo_seq_fops = { | ||
| 2753 | .owner = THIS_MODULE, | ||
| 2754 | .open = tcp_seq_open, | ||
| 2755 | .read = seq_read, | ||
| 2756 | .llseek = seq_lseek, | ||
| 2757 | .release = seq_release_net | ||
| 2758 | }; | ||
| 2759 | |||
| 2760 | static struct tcp_seq_afinfo tcp4_seq_afinfo = { | 2520 | static struct tcp_seq_afinfo tcp4_seq_afinfo = { |
| 2761 | .name = "tcp", | 2521 | .name = "tcp", |
| 2762 | .family = AF_INET, | 2522 | .family = AF_INET, |
| 2763 | .seq_fops = &tcp_afinfo_seq_fops, | 2523 | .seq_fops = { |
| 2524 | .owner = THIS_MODULE, | ||
| 2525 | }, | ||
| 2764 | .seq_ops = { | 2526 | .seq_ops = { |
| 2765 | .show = tcp4_seq_show, | 2527 | .show = tcp4_seq_show, |
| 2766 | }, | 2528 | }, |
| @@ -2795,8 +2557,6 @@ void tcp4_proc_exit(void) | |||
| 2795 | struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) | 2557 | struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) |
| 2796 | { | 2558 | { |
| 2797 | const struct iphdr *iph = skb_gro_network_header(skb); | 2559 | const struct iphdr *iph = skb_gro_network_header(skb); |
| 2798 | __wsum wsum; | ||
| 2799 | __sum16 sum; | ||
| 2800 | 2560 | ||
| 2801 | switch (skb->ip_summed) { | 2561 | switch (skb->ip_summed) { |
| 2802 | case CHECKSUM_COMPLETE: | 2562 | case CHECKSUM_COMPLETE: |
| @@ -2805,22 +2565,11 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) | |||
| 2805 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 2565 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
| 2806 | break; | 2566 | break; |
| 2807 | } | 2567 | } |
| 2808 | flush: | ||
| 2809 | NAPI_GRO_CB(skb)->flush = 1; | ||
| 2810 | return NULL; | ||
| 2811 | 2568 | ||
| 2569 | /* fall through */ | ||
| 2812 | case CHECKSUM_NONE: | 2570 | case CHECKSUM_NONE: |
| 2813 | wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, | 2571 | NAPI_GRO_CB(skb)->flush = 1; |
| 2814 | skb_gro_len(skb), IPPROTO_TCP, 0); | 2572 | return NULL; |
| 2815 | sum = csum_fold(skb_checksum(skb, | ||
| 2816 | skb_gro_offset(skb), | ||
| 2817 | skb_gro_len(skb), | ||
| 2818 | wsum)); | ||
| 2819 | if (sum) | ||
| 2820 | goto flush; | ||
| 2821 | |||
| 2822 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 2823 | break; | ||
| 2824 | } | 2573 | } |
| 2825 | 2574 | ||
| 2826 | return tcp_gro_receive(head, skb); | 2575 | return tcp_gro_receive(head, skb); |
| @@ -2855,8 +2604,6 @@ struct proto tcp_prot = { | |||
| 2855 | .sendmsg = tcp_sendmsg, | 2604 | .sendmsg = tcp_sendmsg, |
| 2856 | .sendpage = tcp_sendpage, | 2605 | .sendpage = tcp_sendpage, |
| 2857 | .backlog_rcv = tcp_v4_do_rcv, | 2606 | .backlog_rcv = tcp_v4_do_rcv, |
| 2858 | .release_cb = tcp_release_cb, | ||
| 2859 | .mtu_reduced = tcp_v4_mtu_reduced, | ||
| 2860 | .hash = inet_hash, | 2607 | .hash = inet_hash, |
| 2861 | .unhash = inet_unhash, | 2608 | .unhash = inet_unhash, |
| 2862 | .get_port = inet_csk_get_port, | 2609 | .get_port = inet_csk_get_port, |
| @@ -2865,6 +2612,7 @@ struct proto tcp_prot = { | |||
| 2865 | .orphan_count = &tcp_orphan_count, | 2612 | .orphan_count = &tcp_orphan_count, |
| 2866 | .memory_allocated = &tcp_memory_allocated, | 2613 | .memory_allocated = &tcp_memory_allocated, |
| 2867 | .memory_pressure = &tcp_memory_pressure, | 2614 | .memory_pressure = &tcp_memory_pressure, |
| 2615 | .sysctl_mem = sysctl_tcp_mem, | ||
| 2868 | .sysctl_wmem = sysctl_tcp_wmem, | 2616 | .sysctl_wmem = sysctl_tcp_wmem, |
| 2869 | .sysctl_rmem = sysctl_tcp_rmem, | 2617 | .sysctl_rmem = sysctl_tcp_rmem, |
| 2870 | .max_header = MAX_TCP_HEADER, | 2618 | .max_header = MAX_TCP_HEADER, |
| @@ -2878,21 +2626,19 @@ struct proto tcp_prot = { | |||
| 2878 | .compat_setsockopt = compat_tcp_setsockopt, | 2626 | .compat_setsockopt = compat_tcp_setsockopt, |
| 2879 | .compat_getsockopt = compat_tcp_getsockopt, | 2627 | .compat_getsockopt = compat_tcp_getsockopt, |
| 2880 | #endif | 2628 | #endif |
| 2881 | #ifdef CONFIG_MEMCG_KMEM | ||
| 2882 | .init_cgroup = tcp_init_cgroup, | ||
| 2883 | .destroy_cgroup = tcp_destroy_cgroup, | ||
| 2884 | .proto_cgroup = tcp_proto_cgroup, | ||
| 2885 | #endif | ||
| 2886 | }; | 2629 | }; |
| 2887 | EXPORT_SYMBOL(tcp_prot); | 2630 | EXPORT_SYMBOL(tcp_prot); |
| 2888 | 2631 | ||
| 2632 | |||
| 2889 | static int __net_init tcp_sk_init(struct net *net) | 2633 | static int __net_init tcp_sk_init(struct net *net) |
| 2890 | { | 2634 | { |
| 2891 | return 0; | 2635 | return inet_ctl_sock_create(&net->ipv4.tcp_sock, |
| 2636 | PF_INET, SOCK_RAW, IPPROTO_TCP, net); | ||
| 2892 | } | 2637 | } |
| 2893 | 2638 | ||
| 2894 | static void __net_exit tcp_sk_exit(struct net *net) | 2639 | static void __net_exit tcp_sk_exit(struct net *net) |
| 2895 | { | 2640 | { |
| 2641 | inet_ctl_sock_destroy(net->ipv4.tcp_sock); | ||
| 2896 | } | 2642 | } |
| 2897 | 2643 | ||
| 2898 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) | 2644 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) |
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c deleted file mode 100644 index b6f3583ddfe..00000000000 --- a/net/ipv4/tcp_memcontrol.c +++ /dev/null | |||
| @@ -1,291 +0,0 @@ | |||
| 1 | #include <net/tcp.h> | ||
| 2 | #include <net/tcp_memcontrol.h> | ||
| 3 | #include <net/sock.h> | ||
| 4 | #include <net/ip.h> | ||
| 5 | #include <linux/nsproxy.h> | ||
| 6 | #include <linux/memcontrol.h> | ||
| 7 | #include <linux/module.h> | ||
| 8 | |||
| 9 | static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) | ||
| 10 | { | ||
| 11 | return container_of(cg_proto, struct tcp_memcontrol, cg_proto); | ||
| 12 | } | ||
| 13 | |||
| 14 | static void memcg_tcp_enter_memory_pressure(struct sock *sk) | ||
| 15 | { | ||
| 16 | if (sk->sk_cgrp->memory_pressure) | ||
| 17 | *sk->sk_cgrp->memory_pressure = 1; | ||
| 18 | } | ||
| 19 | EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); | ||
| 20 | |||
| 21 | int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | ||
| 22 | { | ||
| 23 | /* | ||
| 24 | * The root cgroup does not use res_counters, but rather, | ||
| 25 | * rely on the data already collected by the network | ||
| 26 | * subsystem | ||
| 27 | */ | ||
| 28 | struct res_counter *res_parent = NULL; | ||
| 29 | struct cg_proto *cg_proto, *parent_cg; | ||
| 30 | struct tcp_memcontrol *tcp; | ||
| 31 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | ||
| 32 | struct net *net = current->nsproxy->net_ns; | ||
| 33 | |||
| 34 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
| 35 | if (!cg_proto) | ||
| 36 | return 0; | ||
| 37 | |||
| 38 | tcp = tcp_from_cgproto(cg_proto); | ||
| 39 | |||
| 40 | tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0]; | ||
| 41 | tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1]; | ||
| 42 | tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2]; | ||
| 43 | tcp->tcp_memory_pressure = 0; | ||
| 44 | |||
| 45 | parent_cg = tcp_prot.proto_cgroup(parent); | ||
| 46 | if (parent_cg) | ||
| 47 | res_parent = parent_cg->memory_allocated; | ||
| 48 | |||
| 49 | res_counter_init(&tcp->tcp_memory_allocated, res_parent); | ||
| 50 | percpu_counter_init(&tcp->tcp_sockets_allocated, 0); | ||
| 51 | |||
| 52 | cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure; | ||
| 53 | cg_proto->memory_pressure = &tcp->tcp_memory_pressure; | ||
| 54 | cg_proto->sysctl_mem = tcp->tcp_prot_mem; | ||
| 55 | cg_proto->memory_allocated = &tcp->tcp_memory_allocated; | ||
| 56 | cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; | ||
| 57 | cg_proto->memcg = memcg; | ||
| 58 | |||
| 59 | return 0; | ||
| 60 | } | ||
| 61 | EXPORT_SYMBOL(tcp_init_cgroup); | ||
| 62 | |||
| 63 | void tcp_destroy_cgroup(struct mem_cgroup *memcg) | ||
| 64 | { | ||
| 65 | struct cg_proto *cg_proto; | ||
| 66 | struct tcp_memcontrol *tcp; | ||
| 67 | u64 val; | ||
| 68 | |||
| 69 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
| 70 | if (!cg_proto) | ||
| 71 | return; | ||
| 72 | |||
| 73 | tcp = tcp_from_cgproto(cg_proto); | ||
| 74 | percpu_counter_destroy(&tcp->tcp_sockets_allocated); | ||
| 75 | |||
| 76 | val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); | ||
| 77 | } | ||
| 78 | EXPORT_SYMBOL(tcp_destroy_cgroup); | ||
| 79 | |||
| 80 | static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) | ||
| 81 | { | ||
| 82 | struct net *net = current->nsproxy->net_ns; | ||
| 83 | struct tcp_memcontrol *tcp; | ||
| 84 | struct cg_proto *cg_proto; | ||
| 85 | u64 old_lim; | ||
| 86 | int i; | ||
| 87 | int ret; | ||
| 88 | |||
| 89 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
| 90 | if (!cg_proto) | ||
| 91 | return -EINVAL; | ||
| 92 | |||
| 93 | if (val > RESOURCE_MAX) | ||
| 94 | val = RESOURCE_MAX; | ||
| 95 | |||
| 96 | tcp = tcp_from_cgproto(cg_proto); | ||
| 97 | |||
| 98 | old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); | ||
| 99 | ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val); | ||
| 100 | if (ret) | ||
| 101 | return ret; | ||
| 102 | |||
| 103 | for (i = 0; i < 3; i++) | ||
| 104 | tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, | ||
| 105 | net->ipv4.sysctl_tcp_mem[i]); | ||
| 106 | |||
| 107 | if (val == RESOURCE_MAX) | ||
| 108 | clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); | ||
| 109 | else if (val != RESOURCE_MAX) { | ||
| 110 | /* | ||
| 111 | * The active bit needs to be written after the static_key | ||
| 112 | * update. This is what guarantees that the socket activation | ||
| 113 | * function is the last one to run. See sock_update_memcg() for | ||
| 114 | * details, and note that we don't mark any socket as belonging | ||
| 115 | * to this memcg until that flag is up. | ||
| 116 | * | ||
| 117 | * We need to do this, because static_keys will span multiple | ||
| 118 | * sites, but we can't control their order. If we mark a socket | ||
| 119 | * as accounted, but the accounting functions are not patched in | ||
| 120 | * yet, we'll lose accounting. | ||
| 121 | * | ||
| 122 | * We never race with the readers in sock_update_memcg(), | ||
| 123 | * because when this value change, the code to process it is not | ||
| 124 | * patched in yet. | ||
| 125 | * | ||
| 126 | * The activated bit is used to guarantee that no two writers | ||
| 127 | * will do the update in the same memcg. Without that, we can't | ||
| 128 | * properly shutdown the static key. | ||
| 129 | */ | ||
| 130 | if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) | ||
| 131 | static_key_slow_inc(&memcg_socket_limit_enabled); | ||
| 132 | set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); | ||
| 133 | } | ||
| 134 | |||
| 135 | return 0; | ||
| 136 | } | ||
| 137 | |||
| 138 | static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, | ||
| 139 | const char *buffer) | ||
| 140 | { | ||
| 141 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
| 142 | unsigned long long val; | ||
| 143 | int ret = 0; | ||
| 144 | |||
| 145 | switch (cft->private) { | ||
| 146 | case RES_LIMIT: | ||
| 147 | /* see memcontrol.c */ | ||
| 148 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
| 149 | if (ret) | ||
| 150 | break; | ||
| 151 | ret = tcp_update_limit(memcg, val); | ||
| 152 | break; | ||
| 153 | default: | ||
| 154 | ret = -EINVAL; | ||
| 155 | break; | ||
| 156 | } | ||
| 157 | return ret; | ||
| 158 | } | ||
| 159 | |||
| 160 | static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) | ||
| 161 | { | ||
| 162 | struct tcp_memcontrol *tcp; | ||
| 163 | struct cg_proto *cg_proto; | ||
| 164 | |||
| 165 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
| 166 | if (!cg_proto) | ||
| 167 | return default_val; | ||
| 168 | |||
| 169 | tcp = tcp_from_cgproto(cg_proto); | ||
| 170 | return res_counter_read_u64(&tcp->tcp_memory_allocated, type); | ||
| 171 | } | ||
| 172 | |||
| 173 | static u64 tcp_read_usage(struct mem_cgroup *memcg) | ||
| 174 | { | ||
| 175 | struct tcp_memcontrol *tcp; | ||
| 176 | struct cg_proto *cg_proto; | ||
| 177 | |||
| 178 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
| 179 | if (!cg_proto) | ||
| 180 | return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; | ||
| 181 | |||
| 182 | tcp = tcp_from_cgproto(cg_proto); | ||
| 183 | return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); | ||
| 184 | } | ||
| 185 | |||
| 186 | static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) | ||
| 187 | { | ||
| 188 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
| 189 | u64 val; | ||
| 190 | |||
| 191 | switch (cft->private) { | ||
| 192 | case RES_LIMIT: | ||
| 193 | val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); | ||
| 194 | break; | ||
| 195 | case RES_USAGE: | ||
| 196 | val = tcp_read_usage(memcg); | ||
| 197 | break; | ||
| 198 | case RES_FAILCNT: | ||
| 199 | case RES_MAX_USAGE: | ||
| 200 | val = tcp_read_stat(memcg, cft->private, 0); | ||
| 201 | break; | ||
| 202 | default: | ||
| 203 | BUG(); | ||
| 204 | } | ||
| 205 | return val; | ||
| 206 | } | ||
| 207 | |||
| 208 | static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) | ||
| 209 | { | ||
| 210 | struct mem_cgroup *memcg; | ||
| 211 | struct tcp_memcontrol *tcp; | ||
| 212 | struct cg_proto *cg_proto; | ||
| 213 | |||
| 214 | memcg = mem_cgroup_from_cont(cont); | ||
| 215 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
| 216 | if (!cg_proto) | ||
| 217 | return 0; | ||
| 218 | tcp = tcp_from_cgproto(cg_proto); | ||
| 219 | |||
| 220 | switch (event) { | ||
| 221 | case RES_MAX_USAGE: | ||
| 222 | res_counter_reset_max(&tcp->tcp_memory_allocated); | ||
| 223 | break; | ||
| 224 | case RES_FAILCNT: | ||
| 225 | res_counter_reset_failcnt(&tcp->tcp_memory_allocated); | ||
| 226 | break; | ||
| 227 | } | ||
| 228 | |||
| 229 | return 0; | ||
| 230 | } | ||
| 231 | |||
| 232 | unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) | ||
| 233 | { | ||
| 234 | struct tcp_memcontrol *tcp; | ||
| 235 | struct cg_proto *cg_proto; | ||
| 236 | |||
| 237 | cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg); | ||
| 238 | if (!cg_proto) | ||
| 239 | return 0; | ||
| 240 | |||
| 241 | tcp = tcp_from_cgproto(cg_proto); | ||
| 242 | return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); | ||
| 243 | } | ||
| 244 | |||
| 245 | void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) | ||
| 246 | { | ||
| 247 | struct tcp_memcontrol *tcp; | ||
| 248 | struct cg_proto *cg_proto; | ||
| 249 | |||
| 250 | cg_proto = tcp_prot.proto_cgroup(memcg); | ||
| 251 | if (!cg_proto) | ||
| 252 | return; | ||
| 253 | |||
| 254 | tcp = tcp_from_cgproto(cg_proto); | ||
| 255 | |||
| 256 | tcp->tcp_prot_mem[idx] = val; | ||
| 257 | } | ||
| 258 | |||
| 259 | static struct cftype tcp_files[] = { | ||
| 260 | { | ||
| 261 | .name = "kmem.tcp.limit_in_bytes", | ||
| 262 | .write_string = tcp_cgroup_write, | ||
| 263 | .read_u64 = tcp_cgroup_read, | ||
| 264 | .private = RES_LIMIT, | ||
| 265 | }, | ||
| 266 | { | ||
| 267 | .name = "kmem.tcp.usage_in_bytes", | ||
| 268 | .read_u64 = tcp_cgroup_read, | ||
| 269 | .private = RES_USAGE, | ||
| 270 | }, | ||
| 271 | { | ||
| 272 | .name = "kmem.tcp.failcnt", | ||
| 273 | .private = RES_FAILCNT, | ||
| 274 | .trigger = tcp_cgroup_reset, | ||
| 275 | .read_u64 = tcp_cgroup_read, | ||
| 276 | }, | ||
| 277 | { | ||
| 278 | .name = "kmem.tcp.max_usage_in_bytes", | ||
| 279 | .private = RES_MAX_USAGE, | ||
| 280 | .trigger = tcp_cgroup_reset, | ||
| 281 | .read_u64 = tcp_cgroup_read, | ||
| 282 | }, | ||
| 283 | { } /* terminate */ | ||
| 284 | }; | ||
| 285 | |||
| 286 | static int __init tcp_memcontrol_init(void) | ||
| 287 | { | ||
| 288 | WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); | ||
| 289 | return 0; | ||
| 290 | } | ||
| 291 | __initcall(tcp_memcontrol_init); | ||
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c deleted file mode 100644 index f696d7c2e9f..00000000000 --- a/net/ipv4/tcp_metrics.c +++ /dev/null | |||
| @@ -1,1091 +0,0 @@ | |||
| 1 | #include <linux/rcupdate.h> | ||
| 2 | #include <linux/spinlock.h> | ||
| 3 | #include <linux/jiffies.h> | ||
| 4 | #include <linux/module.h> | ||
| 5 | #include <linux/cache.h> | ||
| 6 | #include <linux/slab.h> | ||
| 7 | #include <linux/init.h> | ||
| 8 | #include <linux/tcp.h> | ||
| 9 | #include <linux/hash.h> | ||
| 10 | #include <linux/tcp_metrics.h> | ||
| 11 | #include <linux/vmalloc.h> | ||
| 12 | |||
| 13 | #include <net/inet_connection_sock.h> | ||
| 14 | #include <net/net_namespace.h> | ||
| 15 | #include <net/request_sock.h> | ||
| 16 | #include <net/inetpeer.h> | ||
| 17 | #include <net/sock.h> | ||
| 18 | #include <net/ipv6.h> | ||
| 19 | #include <net/dst.h> | ||
| 20 | #include <net/tcp.h> | ||
| 21 | #include <net/genetlink.h> | ||
| 22 | |||
| 23 | int sysctl_tcp_nometrics_save __read_mostly; | ||
| 24 | |||
| 25 | struct tcp_fastopen_metrics { | ||
| 26 | u16 mss; | ||
| 27 | u16 syn_loss:10; /* Recurring Fast Open SYN losses */ | ||
| 28 | unsigned long last_syn_loss; /* Last Fast Open SYN loss */ | ||
| 29 | struct tcp_fastopen_cookie cookie; | ||
| 30 | }; | ||
| 31 | |||
| 32 | struct tcp_metrics_block { | ||
| 33 | struct tcp_metrics_block __rcu *tcpm_next; | ||
| 34 | struct inetpeer_addr tcpm_addr; | ||
| 35 | unsigned long tcpm_stamp; | ||
| 36 | u32 tcpm_ts; | ||
| 37 | u32 tcpm_ts_stamp; | ||
| 38 | u32 tcpm_lock; | ||
| 39 | u32 tcpm_vals[TCP_METRIC_MAX + 1]; | ||
| 40 | struct tcp_fastopen_metrics tcpm_fastopen; | ||
| 41 | |||
| 42 | struct rcu_head rcu_head; | ||
| 43 | }; | ||
| 44 | |||
| 45 | static bool tcp_metric_locked(struct tcp_metrics_block *tm, | ||
| 46 | enum tcp_metric_index idx) | ||
| 47 | { | ||
| 48 | return tm->tcpm_lock & (1 << idx); | ||
| 49 | } | ||
| 50 | |||
| 51 | static u32 tcp_metric_get(struct tcp_metrics_block *tm, | ||
| 52 | enum tcp_metric_index idx) | ||
| 53 | { | ||
| 54 | return tm->tcpm_vals[idx]; | ||
| 55 | } | ||
| 56 | |||
| 57 | static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, | ||
| 58 | enum tcp_metric_index idx) | ||
| 59 | { | ||
| 60 | return msecs_to_jiffies(tm->tcpm_vals[idx]); | ||
| 61 | } | ||
| 62 | |||
| 63 | static void tcp_metric_set(struct tcp_metrics_block *tm, | ||
| 64 | enum tcp_metric_index idx, | ||
| 65 | u32 val) | ||
| 66 | { | ||
| 67 | tm->tcpm_vals[idx] = val; | ||
| 68 | } | ||
| 69 | |||
| 70 | static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, | ||
| 71 | enum tcp_metric_index idx, | ||
| 72 | u32 val) | ||
| 73 | { | ||
| 74 | tm->tcpm_vals[idx] = jiffies_to_msecs(val); | ||
| 75 | } | ||
| 76 | |||
| 77 | static bool addr_same(const struct inetpeer_addr *a, | ||
| 78 | const struct inetpeer_addr *b) | ||
| 79 | { | ||
| 80 | const struct in6_addr *a6, *b6; | ||
| 81 | |||
| 82 | if (a->family != b->family) | ||
| 83 | return false; | ||
| 84 | if (a->family == AF_INET) | ||
| 85 | return a->addr.a4 == b->addr.a4; | ||
| 86 | |||
| 87 | a6 = (const struct in6_addr *) &a->addr.a6[0]; | ||
| 88 | b6 = (const struct in6_addr *) &b->addr.a6[0]; | ||
| 89 | |||
| 90 | return ipv6_addr_equal(a6, b6); | ||
| 91 | } | ||
| 92 | |||
| 93 | struct tcpm_hash_bucket { | ||
| 94 | struct tcp_metrics_block __rcu *chain; | ||
| 95 | }; | ||
| 96 | |||
| 97 | static DEFINE_SPINLOCK(tcp_metrics_lock); | ||
| 98 | |||
| 99 | static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) | ||
| 100 | { | ||
| 101 | u32 val; | ||
| 102 | |||
| 103 | tm->tcpm_stamp = jiffies; | ||
| 104 | |||
| 105 | val = 0; | ||
| 106 | if (dst_metric_locked(dst, RTAX_RTT)) | ||
| 107 | val |= 1 << TCP_METRIC_RTT; | ||
| 108 | if (dst_metric_locked(dst, RTAX_RTTVAR)) | ||
| 109 | val |= 1 << TCP_METRIC_RTTVAR; | ||
| 110 | if (dst_metric_locked(dst, RTAX_SSTHRESH)) | ||
| 111 | val |= 1 << TCP_METRIC_SSTHRESH; | ||
| 112 | if (dst_metric_locked(dst, RTAX_CWND)) | ||
| 113 | val |= 1 << TCP_METRIC_CWND; | ||
| 114 | if (dst_metric_locked(dst, RTAX_REORDERING)) | ||
| 115 | val |= 1 << TCP_METRIC_REORDERING; | ||
| 116 | tm->tcpm_lock = val; | ||
| 117 | |||
| 118 | tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); | ||
| 119 | tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); | ||
| 120 | tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); | ||
| 121 | tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); | ||
| 122 | tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); | ||
| 123 | tm->tcpm_ts = 0; | ||
| 124 | tm->tcpm_ts_stamp = 0; | ||
| 125 | tm->tcpm_fastopen.mss = 0; | ||
| 126 | tm->tcpm_fastopen.syn_loss = 0; | ||
| 127 | tm->tcpm_fastopen.cookie.len = 0; | ||
| 128 | } | ||
| 129 | |||
| 130 | static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, | ||
| 131 | struct inetpeer_addr *addr, | ||
| 132 | unsigned int hash, | ||
| 133 | bool reclaim) | ||
| 134 | { | ||
| 135 | struct tcp_metrics_block *tm; | ||
| 136 | struct net *net; | ||
| 137 | |||
| 138 | spin_lock_bh(&tcp_metrics_lock); | ||
| 139 | net = dev_net(dst->dev); | ||
| 140 | if (unlikely(reclaim)) { | ||
| 141 | struct tcp_metrics_block *oldest; | ||
| 142 | |||
| 143 | oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); | ||
| 144 | for (tm = rcu_dereference(oldest->tcpm_next); tm; | ||
| 145 | tm = rcu_dereference(tm->tcpm_next)) { | ||
| 146 | if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) | ||
| 147 | oldest = tm; | ||
| 148 | } | ||
| 149 | tm = oldest; | ||
| 150 | } else { | ||
| 151 | tm = kmalloc(sizeof(*tm), GFP_ATOMIC); | ||
| 152 | if (!tm) | ||
| 153 | goto out_unlock; | ||
| 154 | } | ||
| 155 | tm->tcpm_addr = *addr; | ||
| 156 | |||
| 157 | tcpm_suck_dst(tm, dst); | ||
| 158 | |||
| 159 | if (likely(!reclaim)) { | ||
| 160 | tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; | ||
| 161 | rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); | ||
| 162 | } | ||
| 163 | |||
| 164 | out_unlock: | ||
| 165 | spin_unlock_bh(&tcp_metrics_lock); | ||
| 166 | return tm; | ||
| 167 | } | ||
| 168 | |||
| 169 | #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) | ||
| 170 | |||
| 171 | static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) | ||
| 172 | { | ||
| 173 | if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) | ||
| 174 | tcpm_suck_dst(tm, dst); | ||
| 175 | } | ||
| 176 | |||
| 177 | #define TCP_METRICS_RECLAIM_DEPTH 5 | ||
| 178 | #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL | ||
| 179 | |||
| 180 | static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) | ||
| 181 | { | ||
| 182 | if (tm) | ||
| 183 | return tm; | ||
| 184 | if (depth > TCP_METRICS_RECLAIM_DEPTH) | ||
| 185 | return TCP_METRICS_RECLAIM_PTR; | ||
| 186 | return NULL; | ||
| 187 | } | ||
| 188 | |||
| 189 | static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, | ||
| 190 | struct net *net, unsigned int hash) | ||
| 191 | { | ||
| 192 | struct tcp_metrics_block *tm; | ||
| 193 | int depth = 0; | ||
| 194 | |||
| 195 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
| 196 | tm = rcu_dereference(tm->tcpm_next)) { | ||
| 197 | if (addr_same(&tm->tcpm_addr, addr)) | ||
| 198 | break; | ||
| 199 | depth++; | ||
| 200 | } | ||
| 201 | return tcp_get_encode(tm, depth); | ||
| 202 | } | ||
| 203 | |||
| 204 | static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, | ||
| 205 | struct dst_entry *dst) | ||
| 206 | { | ||
| 207 | struct tcp_metrics_block *tm; | ||
| 208 | struct inetpeer_addr addr; | ||
| 209 | unsigned int hash; | ||
| 210 | struct net *net; | ||
| 211 | |||
| 212 | addr.family = req->rsk_ops->family; | ||
| 213 | switch (addr.family) { | ||
| 214 | case AF_INET: | ||
| 215 | addr.addr.a4 = inet_rsk(req)->rmt_addr; | ||
| 216 | hash = (__force unsigned int) addr.addr.a4; | ||
| 217 | break; | ||
| 218 | case AF_INET6: | ||
| 219 | *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; | ||
| 220 | hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr); | ||
| 221 | break; | ||
| 222 | default: | ||
| 223 | return NULL; | ||
| 224 | } | ||
| 225 | |||
| 226 | net = dev_net(dst->dev); | ||
| 227 | hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); | ||
| 228 | |||
| 229 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
| 230 | tm = rcu_dereference(tm->tcpm_next)) { | ||
| 231 | if (addr_same(&tm->tcpm_addr, &addr)) | ||
| 232 | break; | ||
| 233 | } | ||
| 234 | tcpm_check_stamp(tm, dst); | ||
| 235 | return tm; | ||
| 236 | } | ||
| 237 | |||
| 238 | static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) | ||
| 239 | { | ||
| 240 | struct inet6_timewait_sock *tw6; | ||
| 241 | struct tcp_metrics_block *tm; | ||
| 242 | struct inetpeer_addr addr; | ||
| 243 | unsigned int hash; | ||
| 244 | struct net *net; | ||
| 245 | |||
| 246 | addr.family = tw->tw_family; | ||
| 247 | switch (addr.family) { | ||
| 248 | case AF_INET: | ||
| 249 | addr.addr.a4 = tw->tw_daddr; | ||
| 250 | hash = (__force unsigned int) addr.addr.a4; | ||
| 251 | break; | ||
| 252 | case AF_INET6: | ||
| 253 | tw6 = inet6_twsk((struct sock *)tw); | ||
| 254 | *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; | ||
| 255 | hash = ipv6_addr_hash(&tw6->tw_v6_daddr); | ||
| 256 | break; | ||
| 257 | default: | ||
| 258 | return NULL; | ||
| 259 | } | ||
| 260 | |||
| 261 | net = twsk_net(tw); | ||
| 262 | hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); | ||
| 263 | |||
| 264 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
| 265 | tm = rcu_dereference(tm->tcpm_next)) { | ||
| 266 | if (addr_same(&tm->tcpm_addr, &addr)) | ||
| 267 | break; | ||
| 268 | } | ||
| 269 | return tm; | ||
| 270 | } | ||
| 271 | |||
| 272 | static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, | ||
| 273 | struct dst_entry *dst, | ||
| 274 | bool create) | ||
| 275 | { | ||
| 276 | struct tcp_metrics_block *tm; | ||
| 277 | struct inetpeer_addr addr; | ||
| 278 | unsigned int hash; | ||
| 279 | struct net *net; | ||
| 280 | bool reclaim; | ||
| 281 | |||
| 282 | addr.family = sk->sk_family; | ||
| 283 | switch (addr.family) { | ||
| 284 | case AF_INET: | ||
| 285 | addr.addr.a4 = inet_sk(sk)->inet_daddr; | ||
| 286 | hash = (__force unsigned int) addr.addr.a4; | ||
| 287 | break; | ||
| 288 | case AF_INET6: | ||
| 289 | *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; | ||
| 290 | hash = ipv6_addr_hash(&inet6_sk(sk)->daddr); | ||
| 291 | break; | ||
| 292 | default: | ||
| 293 | return NULL; | ||
| 294 | } | ||
| 295 | |||
| 296 | net = dev_net(dst->dev); | ||
| 297 | hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); | ||
| 298 | |||
| 299 | tm = __tcp_get_metrics(&addr, net, hash); | ||
| 300 | reclaim = false; | ||
| 301 | if (tm == TCP_METRICS_RECLAIM_PTR) { | ||
| 302 | reclaim = true; | ||
| 303 | tm = NULL; | ||
| 304 | } | ||
| 305 | if (!tm && create) | ||
| 306 | tm = tcpm_new(dst, &addr, hash, reclaim); | ||
| 307 | else | ||
| 308 | tcpm_check_stamp(tm, dst); | ||
| 309 | |||
| 310 | return tm; | ||
| 311 | } | ||
| 312 | |||
| 313 | /* Save metrics learned by this TCP session. This function is called | ||
| 314 | * only, when TCP finishes successfully i.e. when it enters TIME-WAIT | ||
| 315 | * or goes from LAST-ACK to CLOSE. | ||
| 316 | */ | ||
| 317 | void tcp_update_metrics(struct sock *sk) | ||
| 318 | { | ||
| 319 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 320 | struct dst_entry *dst = __sk_dst_get(sk); | ||
| 321 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 322 | struct tcp_metrics_block *tm; | ||
| 323 | unsigned long rtt; | ||
| 324 | u32 val; | ||
| 325 | int m; | ||
| 326 | |||
| 327 | if (sysctl_tcp_nometrics_save || !dst) | ||
| 328 | return; | ||
| 329 | |||
| 330 | if (dst->flags & DST_HOST) | ||
| 331 | dst_confirm(dst); | ||
| 332 | |||
| 333 | rcu_read_lock(); | ||
| 334 | if (icsk->icsk_backoff || !tp->srtt) { | ||
| 335 | /* This session failed to estimate rtt. Why? | ||
| 336 | * Probably, no packets returned in time. Reset our | ||
| 337 | * results. | ||
| 338 | */ | ||
| 339 | tm = tcp_get_metrics(sk, dst, false); | ||
| 340 | if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) | ||
| 341 | tcp_metric_set(tm, TCP_METRIC_RTT, 0); | ||
| 342 | goto out_unlock; | ||
| 343 | } else | ||
| 344 | tm = tcp_get_metrics(sk, dst, true); | ||
| 345 | |||
| 346 | if (!tm) | ||
| 347 | goto out_unlock; | ||
| 348 | |||
| 349 | rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); | ||
| 350 | m = rtt - tp->srtt; | ||
| 351 | |||
| 352 | /* If newly calculated rtt larger than stored one, store new | ||
| 353 | * one. Otherwise, use EWMA. Remember, rtt overestimation is | ||
| 354 | * always better than underestimation. | ||
| 355 | */ | ||
| 356 | if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { | ||
| 357 | if (m <= 0) | ||
| 358 | rtt = tp->srtt; | ||
| 359 | else | ||
| 360 | rtt -= (m >> 3); | ||
| 361 | tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); | ||
| 362 | } | ||
| 363 | |||
| 364 | if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { | ||
| 365 | unsigned long var; | ||
| 366 | |||
| 367 | if (m < 0) | ||
| 368 | m = -m; | ||
| 369 | |||
| 370 | /* Scale deviation to rttvar fixed point */ | ||
| 371 | m >>= 1; | ||
| 372 | if (m < tp->mdev) | ||
| 373 | m = tp->mdev; | ||
| 374 | |||
| 375 | var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | ||
| 376 | if (m >= var) | ||
| 377 | var = m; | ||
| 378 | else | ||
| 379 | var -= (var - m) >> 2; | ||
| 380 | |||
| 381 | tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); | ||
| 382 | } | ||
| 383 | |||
| 384 | if (tcp_in_initial_slowstart(tp)) { | ||
| 385 | /* Slow start still did not finish. */ | ||
| 386 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { | ||
| 387 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
| 388 | if (val && (tp->snd_cwnd >> 1) > val) | ||
| 389 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
| 390 | tp->snd_cwnd >> 1); | ||
| 391 | } | ||
| 392 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
| 393 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
| 394 | if (tp->snd_cwnd > val) | ||
| 395 | tcp_metric_set(tm, TCP_METRIC_CWND, | ||
| 396 | tp->snd_cwnd); | ||
| 397 | } | ||
| 398 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | ||
| 399 | icsk->icsk_ca_state == TCP_CA_Open) { | ||
| 400 | /* Cong. avoidance phase, cwnd is reliable. */ | ||
| 401 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) | ||
| 402 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
| 403 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); | ||
| 404 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
| 405 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
| 406 | tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1); | ||
| 407 | } | ||
| 408 | } else { | ||
| 409 | /* Else slow start did not finish, cwnd is non-sense, | ||
| 410 | * ssthresh may be also invalid. | ||
| 411 | */ | ||
| 412 | if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { | ||
| 413 | val = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
| 414 | tcp_metric_set(tm, TCP_METRIC_CWND, | ||
| 415 | (val + tp->snd_ssthresh) >> 1); | ||
| 416 | } | ||
| 417 | if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { | ||
| 418 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
| 419 | if (val && tp->snd_ssthresh > val) | ||
| 420 | tcp_metric_set(tm, TCP_METRIC_SSTHRESH, | ||
| 421 | tp->snd_ssthresh); | ||
| 422 | } | ||
| 423 | if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { | ||
| 424 | val = tcp_metric_get(tm, TCP_METRIC_REORDERING); | ||
| 425 | if (val < tp->reordering && | ||
| 426 | tp->reordering != sysctl_tcp_reordering) | ||
| 427 | tcp_metric_set(tm, TCP_METRIC_REORDERING, | ||
| 428 | tp->reordering); | ||
| 429 | } | ||
| 430 | } | ||
| 431 | tm->tcpm_stamp = jiffies; | ||
| 432 | out_unlock: | ||
| 433 | rcu_read_unlock(); | ||
| 434 | } | ||
| 435 | |||
| 436 | /* Initialize metrics on socket. */ | ||
| 437 | |||
| 438 | void tcp_init_metrics(struct sock *sk) | ||
| 439 | { | ||
| 440 | struct dst_entry *dst = __sk_dst_get(sk); | ||
| 441 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 442 | struct tcp_metrics_block *tm; | ||
| 443 | u32 val; | ||
| 444 | |||
| 445 | if (dst == NULL) | ||
| 446 | goto reset; | ||
| 447 | |||
| 448 | dst_confirm(dst); | ||
| 449 | |||
| 450 | rcu_read_lock(); | ||
| 451 | tm = tcp_get_metrics(sk, dst, true); | ||
| 452 | if (!tm) { | ||
| 453 | rcu_read_unlock(); | ||
| 454 | goto reset; | ||
| 455 | } | ||
| 456 | |||
| 457 | if (tcp_metric_locked(tm, TCP_METRIC_CWND)) | ||
| 458 | tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); | ||
| 459 | |||
| 460 | val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); | ||
| 461 | if (val) { | ||
| 462 | tp->snd_ssthresh = val; | ||
| 463 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) | ||
| 464 | tp->snd_ssthresh = tp->snd_cwnd_clamp; | ||
| 465 | } else { | ||
| 466 | /* ssthresh may have been reduced unnecessarily during. | ||
| 467 | * 3WHS. Restore it back to its initial default. | ||
| 468 | */ | ||
| 469 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | ||
| 470 | } | ||
| 471 | val = tcp_metric_get(tm, TCP_METRIC_REORDERING); | ||
| 472 | if (val && tp->reordering != val) { | ||
| 473 | tcp_disable_fack(tp); | ||
| 474 | tcp_disable_early_retrans(tp); | ||
| 475 | tp->reordering = val; | ||
| 476 | } | ||
| 477 | |||
| 478 | val = tcp_metric_get(tm, TCP_METRIC_RTT); | ||
| 479 | if (val == 0 || tp->srtt == 0) { | ||
| 480 | rcu_read_unlock(); | ||
| 481 | goto reset; | ||
| 482 | } | ||
| 483 | /* Initial rtt is determined from SYN,SYN-ACK. | ||
| 484 | * The segment is small and rtt may appear much | ||
| 485 | * less than real one. Use per-dst memory | ||
| 486 | * to make it more realistic. | ||
| 487 | * | ||
| 488 | * A bit of theory. RTT is time passed after "normal" sized packet | ||
| 489 | * is sent until it is ACKed. In normal circumstances sending small | ||
| 490 | * packets force peer to delay ACKs and calculation is correct too. | ||
| 491 | * The algorithm is adaptive and, provided we follow specs, it | ||
| 492 | * NEVER underestimate RTT. BUT! If peer tries to make some clever | ||
| 493 | * tricks sort of "quick acks" for time long enough to decrease RTT | ||
| 494 | * to low value, and then abruptly stops to do it and starts to delay | ||
| 495 | * ACKs, wait for troubles. | ||
| 496 | */ | ||
| 497 | val = msecs_to_jiffies(val); | ||
| 498 | if (val > tp->srtt) { | ||
| 499 | tp->srtt = val; | ||
| 500 | tp->rtt_seq = tp->snd_nxt; | ||
| 501 | } | ||
| 502 | val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); | ||
| 503 | if (val > tp->mdev) { | ||
| 504 | tp->mdev = val; | ||
| 505 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | ||
| 506 | } | ||
| 507 | rcu_read_unlock(); | ||
| 508 | |||
| 509 | tcp_set_rto(sk); | ||
| 510 | reset: | ||
| 511 | if (tp->srtt == 0) { | ||
| 512 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | ||
| 513 | * 3WHS. This is most likely due to retransmission, | ||
| 514 | * including spurious one. Reset the RTO back to 3secs | ||
| 515 | * from the more aggressive 1sec to avoid more spurious | ||
| 516 | * retransmission. | ||
| 517 | */ | ||
| 518 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | ||
| 519 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | ||
| 520 | } | ||
| 521 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | ||
| 522 | * retransmitted. In light of RFC6298 more aggressive 1sec | ||
| 523 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK | ||
| 524 | * retransmission has occurred. | ||
| 525 | */ | ||
| 526 | if (tp->total_retrans > 1) | ||
| 527 | tp->snd_cwnd = 1; | ||
| 528 | else | ||
| 529 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | ||
| 530 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 531 | } | ||
| 532 | |||
| 533 | bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) | ||
| 534 | { | ||
| 535 | struct tcp_metrics_block *tm; | ||
| 536 | bool ret; | ||
| 537 | |||
| 538 | if (!dst) | ||
| 539 | return false; | ||
| 540 | |||
| 541 | rcu_read_lock(); | ||
| 542 | tm = __tcp_get_metrics_req(req, dst); | ||
| 543 | if (paws_check) { | ||
| 544 | if (tm && | ||
| 545 | (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && | ||
| 546 | (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) | ||
| 547 | ret = false; | ||
| 548 | else | ||
| 549 | ret = true; | ||
| 550 | } else { | ||
| 551 | if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) | ||
| 552 | ret = true; | ||
| 553 | else | ||
| 554 | ret = false; | ||
| 555 | } | ||
| 556 | rcu_read_unlock(); | ||
| 557 | |||
| 558 | return ret; | ||
| 559 | } | ||
| 560 | EXPORT_SYMBOL_GPL(tcp_peer_is_proven); | ||
| 561 | |||
| 562 | void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) | ||
| 563 | { | ||
| 564 | struct tcp_metrics_block *tm; | ||
| 565 | |||
| 566 | rcu_read_lock(); | ||
| 567 | tm = tcp_get_metrics(sk, dst, true); | ||
| 568 | if (tm) { | ||
| 569 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 570 | |||
| 571 | if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { | ||
| 572 | tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; | ||
| 573 | tp->rx_opt.ts_recent = tm->tcpm_ts; | ||
| 574 | } | ||
| 575 | } | ||
| 576 | rcu_read_unlock(); | ||
| 577 | } | ||
| 578 | EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); | ||
| 579 | |||
| 580 | /* VJ's idea. Save last timestamp seen from this destination and hold | ||
| 581 | * it at least for normal timewait interval to use for duplicate | ||
| 582 | * segment detection in subsequent connections, before they enter | ||
| 583 | * synchronized state. | ||
| 584 | */ | ||
| 585 | bool tcp_remember_stamp(struct sock *sk) | ||
| 586 | { | ||
| 587 | struct dst_entry *dst = __sk_dst_get(sk); | ||
| 588 | bool ret = false; | ||
| 589 | |||
| 590 | if (dst) { | ||
| 591 | struct tcp_metrics_block *tm; | ||
| 592 | |||
| 593 | rcu_read_lock(); | ||
| 594 | tm = tcp_get_metrics(sk, dst, true); | ||
| 595 | if (tm) { | ||
| 596 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 597 | |||
| 598 | if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || | ||
| 599 | ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && | ||
| 600 | tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { | ||
| 601 | tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; | ||
| 602 | tm->tcpm_ts = tp->rx_opt.ts_recent; | ||
| 603 | } | ||
| 604 | ret = true; | ||
| 605 | } | ||
| 606 | rcu_read_unlock(); | ||
| 607 | } | ||
| 608 | return ret; | ||
| 609 | } | ||
| 610 | |||
| 611 | bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) | ||
| 612 | { | ||
| 613 | struct tcp_metrics_block *tm; | ||
| 614 | bool ret = false; | ||
| 615 | |||
| 616 | rcu_read_lock(); | ||
| 617 | tm = __tcp_get_metrics_tw(tw); | ||
| 618 | if (tm) { | ||
| 619 | const struct tcp_timewait_sock *tcptw; | ||
| 620 | struct sock *sk = (struct sock *) tw; | ||
| 621 | |||
| 622 | tcptw = tcp_twsk(sk); | ||
| 623 | if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || | ||
| 624 | ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && | ||
| 625 | tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { | ||
| 626 | tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; | ||
| 627 | tm->tcpm_ts = tcptw->tw_ts_recent; | ||
| 628 | } | ||
| 629 | ret = true; | ||
| 630 | } | ||
| 631 | rcu_read_unlock(); | ||
| 632 | |||
| 633 | return ret; | ||
| 634 | } | ||
| 635 | |||
| 636 | static DEFINE_SEQLOCK(fastopen_seqlock); | ||
| 637 | |||
| 638 | void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, | ||
| 639 | struct tcp_fastopen_cookie *cookie, | ||
| 640 | int *syn_loss, unsigned long *last_syn_loss) | ||
| 641 | { | ||
| 642 | struct tcp_metrics_block *tm; | ||
| 643 | |||
| 644 | rcu_read_lock(); | ||
| 645 | tm = tcp_get_metrics(sk, __sk_dst_get(sk), false); | ||
| 646 | if (tm) { | ||
| 647 | struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; | ||
| 648 | unsigned int seq; | ||
| 649 | |||
| 650 | do { | ||
| 651 | seq = read_seqbegin(&fastopen_seqlock); | ||
| 652 | if (tfom->mss) | ||
| 653 | *mss = tfom->mss; | ||
| 654 | *cookie = tfom->cookie; | ||
| 655 | *syn_loss = tfom->syn_loss; | ||
| 656 | *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; | ||
| 657 | } while (read_seqretry(&fastopen_seqlock, seq)); | ||
| 658 | } | ||
| 659 | rcu_read_unlock(); | ||
| 660 | } | ||
| 661 | |||
| 662 | void tcp_fastopen_cache_set(struct sock *sk, u16 mss, | ||
| 663 | struct tcp_fastopen_cookie *cookie, bool syn_lost) | ||
| 664 | { | ||
| 665 | struct tcp_metrics_block *tm; | ||
| 666 | |||
| 667 | rcu_read_lock(); | ||
| 668 | tm = tcp_get_metrics(sk, __sk_dst_get(sk), true); | ||
| 669 | if (tm) { | ||
| 670 | struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; | ||
| 671 | |||
| 672 | write_seqlock_bh(&fastopen_seqlock); | ||
| 673 | tfom->mss = mss; | ||
| 674 | if (cookie->len > 0) | ||
| 675 | tfom->cookie = *cookie; | ||
| 676 | if (syn_lost) { | ||
| 677 | ++tfom->syn_loss; | ||
| 678 | tfom->last_syn_loss = jiffies; | ||
| 679 | } else | ||
| 680 | tfom->syn_loss = 0; | ||
| 681 | write_sequnlock_bh(&fastopen_seqlock); | ||
| 682 | } | ||
| 683 | rcu_read_unlock(); | ||
| 684 | } | ||
| 685 | |||
| 686 | static struct genl_family tcp_metrics_nl_family = { | ||
| 687 | .id = GENL_ID_GENERATE, | ||
| 688 | .hdrsize = 0, | ||
| 689 | .name = TCP_METRICS_GENL_NAME, | ||
| 690 | .version = TCP_METRICS_GENL_VERSION, | ||
| 691 | .maxattr = TCP_METRICS_ATTR_MAX, | ||
| 692 | .netnsok = true, | ||
| 693 | }; | ||
| 694 | |||
| 695 | static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { | ||
| 696 | [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, | ||
| 697 | [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, | ||
| 698 | .len = sizeof(struct in6_addr), }, | ||
| 699 | /* Following attributes are not received for GET/DEL, | ||
| 700 | * we keep them for reference | ||
| 701 | */ | ||
| 702 | #if 0 | ||
| 703 | [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, }, | ||
| 704 | [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, }, | ||
| 705 | [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, }, | ||
| 706 | [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, }, | ||
| 707 | [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, }, | ||
| 708 | [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, }, | ||
| 709 | [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, }, | ||
| 710 | [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY, | ||
| 711 | .len = TCP_FASTOPEN_COOKIE_MAX, }, | ||
| 712 | #endif | ||
| 713 | }; | ||
| 714 | |||
| 715 | /* Add attributes, caller cancels its header on failure */ | ||
| 716 | static int tcp_metrics_fill_info(struct sk_buff *msg, | ||
| 717 | struct tcp_metrics_block *tm) | ||
| 718 | { | ||
| 719 | struct nlattr *nest; | ||
| 720 | int i; | ||
| 721 | |||
| 722 | switch (tm->tcpm_addr.family) { | ||
| 723 | case AF_INET: | ||
| 724 | if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, | ||
| 725 | tm->tcpm_addr.addr.a4) < 0) | ||
| 726 | goto nla_put_failure; | ||
| 727 | break; | ||
| 728 | case AF_INET6: | ||
| 729 | if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, | ||
| 730 | tm->tcpm_addr.addr.a6) < 0) | ||
| 731 | goto nla_put_failure; | ||
| 732 | break; | ||
| 733 | default: | ||
| 734 | return -EAFNOSUPPORT; | ||
| 735 | } | ||
| 736 | |||
| 737 | if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, | ||
| 738 | jiffies - tm->tcpm_stamp) < 0) | ||
| 739 | goto nla_put_failure; | ||
| 740 | if (tm->tcpm_ts_stamp) { | ||
| 741 | if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP, | ||
| 742 | (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0) | ||
| 743 | goto nla_put_failure; | ||
| 744 | if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL, | ||
| 745 | tm->tcpm_ts) < 0) | ||
| 746 | goto nla_put_failure; | ||
| 747 | } | ||
| 748 | |||
| 749 | { | ||
| 750 | int n = 0; | ||
| 751 | |||
| 752 | nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); | ||
| 753 | if (!nest) | ||
| 754 | goto nla_put_failure; | ||
| 755 | for (i = 0; i < TCP_METRIC_MAX + 1; i++) { | ||
| 756 | if (!tm->tcpm_vals[i]) | ||
| 757 | continue; | ||
| 758 | if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) | ||
| 759 | goto nla_put_failure; | ||
| 760 | n++; | ||
| 761 | } | ||
| 762 | if (n) | ||
| 763 | nla_nest_end(msg, nest); | ||
| 764 | else | ||
| 765 | nla_nest_cancel(msg, nest); | ||
| 766 | } | ||
| 767 | |||
| 768 | { | ||
| 769 | struct tcp_fastopen_metrics tfom_copy[1], *tfom; | ||
| 770 | unsigned int seq; | ||
| 771 | |||
| 772 | do { | ||
| 773 | seq = read_seqbegin(&fastopen_seqlock); | ||
| 774 | tfom_copy[0] = tm->tcpm_fastopen; | ||
| 775 | } while (read_seqretry(&fastopen_seqlock, seq)); | ||
| 776 | |||
| 777 | tfom = tfom_copy; | ||
| 778 | if (tfom->mss && | ||
| 779 | nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS, | ||
| 780 | tfom->mss) < 0) | ||
| 781 | goto nla_put_failure; | ||
| 782 | if (tfom->syn_loss && | ||
| 783 | (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS, | ||
| 784 | tfom->syn_loss) < 0 || | ||
| 785 | nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, | ||
| 786 | jiffies - tfom->last_syn_loss) < 0)) | ||
| 787 | goto nla_put_failure; | ||
| 788 | if (tfom->cookie.len > 0 && | ||
| 789 | nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE, | ||
| 790 | tfom->cookie.len, tfom->cookie.val) < 0) | ||
| 791 | goto nla_put_failure; | ||
| 792 | } | ||
| 793 | |||
| 794 | return 0; | ||
| 795 | |||
| 796 | nla_put_failure: | ||
| 797 | return -EMSGSIZE; | ||
| 798 | } | ||
| 799 | |||
| 800 | static int tcp_metrics_dump_info(struct sk_buff *skb, | ||
| 801 | struct netlink_callback *cb, | ||
| 802 | struct tcp_metrics_block *tm) | ||
| 803 | { | ||
| 804 | void *hdr; | ||
| 805 | |||
| 806 | hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, | ||
| 807 | &tcp_metrics_nl_family, NLM_F_MULTI, | ||
| 808 | TCP_METRICS_CMD_GET); | ||
| 809 | if (!hdr) | ||
| 810 | return -EMSGSIZE; | ||
| 811 | |||
| 812 | if (tcp_metrics_fill_info(skb, tm) < 0) | ||
| 813 | goto nla_put_failure; | ||
| 814 | |||
| 815 | return genlmsg_end(skb, hdr); | ||
| 816 | |||
| 817 | nla_put_failure: | ||
| 818 | genlmsg_cancel(skb, hdr); | ||
| 819 | return -EMSGSIZE; | ||
| 820 | } | ||
| 821 | |||
| 822 | static int tcp_metrics_nl_dump(struct sk_buff *skb, | ||
| 823 | struct netlink_callback *cb) | ||
| 824 | { | ||
| 825 | struct net *net = sock_net(skb->sk); | ||
| 826 | unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; | ||
| 827 | unsigned int row, s_row = cb->args[0]; | ||
| 828 | int s_col = cb->args[1], col = s_col; | ||
| 829 | |||
| 830 | for (row = s_row; row < max_rows; row++, s_col = 0) { | ||
| 831 | struct tcp_metrics_block *tm; | ||
| 832 | struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row; | ||
| 833 | |||
| 834 | rcu_read_lock(); | ||
| 835 | for (col = 0, tm = rcu_dereference(hb->chain); tm; | ||
| 836 | tm = rcu_dereference(tm->tcpm_next), col++) { | ||
| 837 | if (col < s_col) | ||
| 838 | continue; | ||
| 839 | if (tcp_metrics_dump_info(skb, cb, tm) < 0) { | ||
| 840 | rcu_read_unlock(); | ||
| 841 | goto done; | ||
| 842 | } | ||
| 843 | } | ||
| 844 | rcu_read_unlock(); | ||
| 845 | } | ||
| 846 | |||
| 847 | done: | ||
| 848 | cb->args[0] = row; | ||
| 849 | cb->args[1] = col; | ||
| 850 | return skb->len; | ||
| 851 | } | ||
| 852 | |||
| 853 | static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, | ||
| 854 | unsigned int *hash, int optional) | ||
| 855 | { | ||
| 856 | struct nlattr *a; | ||
| 857 | |||
| 858 | a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4]; | ||
| 859 | if (a) { | ||
| 860 | addr->family = AF_INET; | ||
| 861 | addr->addr.a4 = nla_get_be32(a); | ||
| 862 | *hash = (__force unsigned int) addr->addr.a4; | ||
| 863 | return 0; | ||
| 864 | } | ||
| 865 | a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6]; | ||
| 866 | if (a) { | ||
| 867 | if (nla_len(a) != sizeof(struct in6_addr)) | ||
| 868 | return -EINVAL; | ||
| 869 | addr->family = AF_INET6; | ||
| 870 | memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); | ||
| 871 | *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); | ||
| 872 | return 0; | ||
| 873 | } | ||
| 874 | return optional ? 1 : -EAFNOSUPPORT; | ||
| 875 | } | ||
| 876 | |||
| 877 | static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) | ||
| 878 | { | ||
| 879 | struct tcp_metrics_block *tm; | ||
| 880 | struct inetpeer_addr addr; | ||
| 881 | unsigned int hash; | ||
| 882 | struct sk_buff *msg; | ||
| 883 | struct net *net = genl_info_net(info); | ||
| 884 | void *reply; | ||
| 885 | int ret; | ||
| 886 | |||
| 887 | ret = parse_nl_addr(info, &addr, &hash, 0); | ||
| 888 | if (ret < 0) | ||
| 889 | return ret; | ||
| 890 | |||
| 891 | msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); | ||
| 892 | if (!msg) | ||
| 893 | return -ENOMEM; | ||
| 894 | |||
| 895 | reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0, | ||
| 896 | info->genlhdr->cmd); | ||
| 897 | if (!reply) | ||
| 898 | goto nla_put_failure; | ||
| 899 | |||
| 900 | hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); | ||
| 901 | ret = -ESRCH; | ||
| 902 | rcu_read_lock(); | ||
| 903 | for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; | ||
| 904 | tm = rcu_dereference(tm->tcpm_next)) { | ||
| 905 | if (addr_same(&tm->tcpm_addr, &addr)) { | ||
| 906 | ret = tcp_metrics_fill_info(msg, tm); | ||
| 907 | break; | ||
| 908 | } | ||
| 909 | } | ||
| 910 | rcu_read_unlock(); | ||
| 911 | if (ret < 0) | ||
| 912 | goto out_free; | ||
| 913 | |||
| 914 | genlmsg_end(msg, reply); | ||
| 915 | return genlmsg_reply(msg, info); | ||
| 916 | |||
| 917 | nla_put_failure: | ||
| 918 | ret = -EMSGSIZE; | ||
| 919 | |||
| 920 | out_free: | ||
| 921 | nlmsg_free(msg); | ||
| 922 | return ret; | ||
| 923 | } | ||
| 924 | |||
| 925 | #define deref_locked_genl(p) \ | ||
| 926 | rcu_dereference_protected(p, lockdep_genl_is_held() && \ | ||
| 927 | lockdep_is_held(&tcp_metrics_lock)) | ||
| 928 | |||
| 929 | #define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held()) | ||
| 930 | |||
| 931 | static int tcp_metrics_flush_all(struct net *net) | ||
| 932 | { | ||
| 933 | unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; | ||
| 934 | struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; | ||
| 935 | struct tcp_metrics_block *tm; | ||
| 936 | unsigned int row; | ||
| 937 | |||
| 938 | for (row = 0; row < max_rows; row++, hb++) { | ||
| 939 | spin_lock_bh(&tcp_metrics_lock); | ||
| 940 | tm = deref_locked_genl(hb->chain); | ||
| 941 | if (tm) | ||
| 942 | hb->chain = NULL; | ||
| 943 | spin_unlock_bh(&tcp_metrics_lock); | ||
| 944 | while (tm) { | ||
| 945 | struct tcp_metrics_block *next; | ||
| 946 | |||
| 947 | next = deref_genl(tm->tcpm_next); | ||
| 948 | kfree_rcu(tm, rcu_head); | ||
| 949 | tm = next; | ||
| 950 | } | ||
| 951 | } | ||
| 952 | return 0; | ||
| 953 | } | ||
| 954 | |||
| 955 | static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) | ||
| 956 | { | ||
| 957 | struct tcpm_hash_bucket *hb; | ||
| 958 | struct tcp_metrics_block *tm; | ||
| 959 | struct tcp_metrics_block __rcu **pp; | ||
| 960 | struct inetpeer_addr addr; | ||
| 961 | unsigned int hash; | ||
| 962 | struct net *net = genl_info_net(info); | ||
| 963 | int ret; | ||
| 964 | |||
| 965 | ret = parse_nl_addr(info, &addr, &hash, 1); | ||
| 966 | if (ret < 0) | ||
| 967 | return ret; | ||
| 968 | if (ret > 0) | ||
| 969 | return tcp_metrics_flush_all(net); | ||
| 970 | |||
| 971 | hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); | ||
| 972 | hb = net->ipv4.tcp_metrics_hash + hash; | ||
| 973 | pp = &hb->chain; | ||
| 974 | spin_lock_bh(&tcp_metrics_lock); | ||
| 975 | for (tm = deref_locked_genl(*pp); tm; | ||
| 976 | pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) { | ||
| 977 | if (addr_same(&tm->tcpm_addr, &addr)) { | ||
| 978 | *pp = tm->tcpm_next; | ||
| 979 | break; | ||
| 980 | } | ||
| 981 | } | ||
| 982 | spin_unlock_bh(&tcp_metrics_lock); | ||
| 983 | if (!tm) | ||
| 984 | return -ESRCH; | ||
| 985 | kfree_rcu(tm, rcu_head); | ||
| 986 | return 0; | ||
| 987 | } | ||
| 988 | |||
| 989 | static struct genl_ops tcp_metrics_nl_ops[] = { | ||
| 990 | { | ||
| 991 | .cmd = TCP_METRICS_CMD_GET, | ||
| 992 | .doit = tcp_metrics_nl_cmd_get, | ||
| 993 | .dumpit = tcp_metrics_nl_dump, | ||
| 994 | .policy = tcp_metrics_nl_policy, | ||
| 995 | .flags = GENL_ADMIN_PERM, | ||
| 996 | }, | ||
| 997 | { | ||
| 998 | .cmd = TCP_METRICS_CMD_DEL, | ||
| 999 | .doit = tcp_metrics_nl_cmd_del, | ||
| 1000 | .policy = tcp_metrics_nl_policy, | ||
| 1001 | .flags = GENL_ADMIN_PERM, | ||
| 1002 | }, | ||
| 1003 | }; | ||
| 1004 | |||
| 1005 | static unsigned int tcpmhash_entries; | ||
| 1006 | static int __init set_tcpmhash_entries(char *str) | ||
| 1007 | { | ||
| 1008 | ssize_t ret; | ||
| 1009 | |||
| 1010 | if (!str) | ||
| 1011 | return 0; | ||
| 1012 | |||
| 1013 | ret = kstrtouint(str, 0, &tcpmhash_entries); | ||
| 1014 | if (ret) | ||
| 1015 | return 0; | ||
| 1016 | |||
| 1017 | return 1; | ||
| 1018 | } | ||
| 1019 | __setup("tcpmhash_entries=", set_tcpmhash_entries); | ||
| 1020 | |||
| 1021 | static int __net_init tcp_net_metrics_init(struct net *net) | ||
| 1022 | { | ||
| 1023 | size_t size; | ||
| 1024 | unsigned int slots; | ||
| 1025 | |||
| 1026 | slots = tcpmhash_entries; | ||
| 1027 | if (!slots) { | ||
| 1028 | if (totalram_pages >= 128 * 1024) | ||
| 1029 | slots = 16 * 1024; | ||
| 1030 | else | ||
| 1031 | slots = 8 * 1024; | ||
| 1032 | } | ||
| 1033 | |||
| 1034 | net->ipv4.tcp_metrics_hash_log = order_base_2(slots); | ||
| 1035 | size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; | ||
| 1036 | |||
| 1037 | net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | ||
| 1038 | if (!net->ipv4.tcp_metrics_hash) | ||
| 1039 | net->ipv4.tcp_metrics_hash = vzalloc(size); | ||
| 1040 | |||
| 1041 | if (!net->ipv4.tcp_metrics_hash) | ||
| 1042 | return -ENOMEM; | ||
| 1043 | |||
| 1044 | return 0; | ||
| 1045 | } | ||
| 1046 | |||
| 1047 | static void __net_exit tcp_net_metrics_exit(struct net *net) | ||
| 1048 | { | ||
| 1049 | unsigned int i; | ||
| 1050 | |||
| 1051 | for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) { | ||
| 1052 | struct tcp_metrics_block *tm, *next; | ||
| 1053 | |||
| 1054 | tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1); | ||
| 1055 | while (tm) { | ||
| 1056 | next = rcu_dereference_protected(tm->tcpm_next, 1); | ||
| 1057 | kfree(tm); | ||
| 1058 | tm = next; | ||
| 1059 | } | ||
| 1060 | } | ||
| 1061 | if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) | ||
| 1062 | vfree(net->ipv4.tcp_metrics_hash); | ||
| 1063 | else | ||
| 1064 | kfree(net->ipv4.tcp_metrics_hash); | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | static __net_initdata struct pernet_operations tcp_net_metrics_ops = { | ||
| 1068 | .init = tcp_net_metrics_init, | ||
| 1069 | .exit = tcp_net_metrics_exit, | ||
| 1070 | }; | ||
| 1071 | |||
| 1072 | void __init tcp_metrics_init(void) | ||
| 1073 | { | ||
| 1074 | int ret; | ||
| 1075 | |||
| 1076 | ret = register_pernet_subsys(&tcp_net_metrics_ops); | ||
| 1077 | if (ret < 0) | ||
| 1078 | goto cleanup; | ||
| 1079 | ret = genl_register_family_with_ops(&tcp_metrics_nl_family, | ||
| 1080 | tcp_metrics_nl_ops, | ||
| 1081 | ARRAY_SIZE(tcp_metrics_nl_ops)); | ||
| 1082 | if (ret < 0) | ||
| 1083 | goto cleanup_subsys; | ||
| 1084 | return; | ||
| 1085 | |||
| 1086 | cleanup_subsys: | ||
| 1087 | unregister_pernet_subsys(&tcp_net_metrics_ops); | ||
| 1088 | |||
| 1089 | cleanup: | ||
| 1090 | return; | ||
| 1091 | } | ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f35f2dfb640..0ce3d06dce6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
| @@ -49,12 +49,62 @@ struct inet_timewait_death_row tcp_death_row = { | |||
| 49 | }; | 49 | }; |
| 50 | EXPORT_SYMBOL_GPL(tcp_death_row); | 50 | EXPORT_SYMBOL_GPL(tcp_death_row); |
| 51 | 51 | ||
| 52 | static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 52 | /* VJ's idea. Save last timestamp seen from this destination |
| 53 | * and hold it at least for normal timewait interval to use for duplicate | ||
| 54 | * segment detection in subsequent connections, before they enter synchronized | ||
| 55 | * state. | ||
| 56 | */ | ||
| 57 | |||
| 58 | static int tcp_remember_stamp(struct sock *sk) | ||
| 59 | { | ||
| 60 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 61 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 62 | struct inet_peer *peer; | ||
| 63 | bool release_it; | ||
| 64 | |||
| 65 | peer = icsk->icsk_af_ops->get_peer(sk, &release_it); | ||
| 66 | if (peer) { | ||
| 67 | if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || | ||
| 68 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
| 69 | peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { | ||
| 70 | peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; | ||
| 71 | peer->tcp_ts = tp->rx_opt.ts_recent; | ||
| 72 | } | ||
| 73 | if (release_it) | ||
| 74 | inet_putpeer(peer); | ||
| 75 | return 1; | ||
| 76 | } | ||
| 77 | |||
| 78 | return 0; | ||
| 79 | } | ||
| 80 | |||
| 81 | static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw) | ||
| 82 | { | ||
| 83 | struct sock *sk = (struct sock *) tw; | ||
| 84 | struct inet_peer *peer; | ||
| 85 | |||
| 86 | peer = twsk_getpeer(sk); | ||
| 87 | if (peer) { | ||
| 88 | const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); | ||
| 89 | |||
| 90 | if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || | ||
| 91 | ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && | ||
| 92 | peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { | ||
| 93 | peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; | ||
| 94 | peer->tcp_ts = tcptw->tw_ts_recent; | ||
| 95 | } | ||
| 96 | inet_putpeer(peer); | ||
| 97 | return 1; | ||
| 98 | } | ||
| 99 | return 0; | ||
| 100 | } | ||
| 101 | |||
| 102 | static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | ||
| 53 | { | 103 | { |
| 54 | if (seq == s_win) | 104 | if (seq == s_win) |
| 55 | return true; | 105 | return 1; |
| 56 | if (after(end_seq, s_win) && before(seq, e_win)) | 106 | if (after(end_seq, s_win) && before(seq, e_win)) |
| 57 | return true; | 107 | return 1; |
| 58 | return seq == e_win && seq == end_seq; | 108 | return seq == e_win && seq == end_seq; |
| 59 | } | 109 | } |
| 60 | 110 | ||
| @@ -85,21 +135,19 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | |||
| 85 | * spinlock it. I do not want! Well, probability of misbehaviour | 135 | * spinlock it. I do not want! Well, probability of misbehaviour |
| 86 | * is ridiculously low and, seems, we could use some mb() tricks | 136 | * is ridiculously low and, seems, we could use some mb() tricks |
| 87 | * to avoid misread sequence numbers, states etc. --ANK | 137 | * to avoid misread sequence numbers, states etc. --ANK |
| 88 | * | ||
| 89 | * We don't need to initialize tmp_out.sack_ok as we don't use the results | ||
| 90 | */ | 138 | */ |
| 91 | enum tcp_tw_status | 139 | enum tcp_tw_status |
| 92 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, | 140 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, |
| 93 | const struct tcphdr *th) | 141 | const struct tcphdr *th) |
| 94 | { | 142 | { |
| 95 | struct tcp_options_received tmp_opt; | 143 | struct tcp_options_received tmp_opt; |
| 96 | const u8 *hash_location; | 144 | u8 *hash_location; |
| 97 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 145 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
| 98 | bool paws_reject = false; | 146 | int paws_reject = 0; |
| 99 | 147 | ||
| 100 | tmp_opt.saw_tstamp = 0; | 148 | tmp_opt.saw_tstamp = 0; |
| 101 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { | 149 | if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { |
| 102 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); | 150 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0); |
| 103 | 151 | ||
| 104 | if (tmp_opt.saw_tstamp) { | 152 | if (tmp_opt.saw_tstamp) { |
| 105 | tmp_opt.ts_recent = tcptw->tw_ts_recent; | 153 | tmp_opt.ts_recent = tcptw->tw_ts_recent; |
| @@ -268,7 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
| 268 | struct inet_timewait_sock *tw = NULL; | 316 | struct inet_timewait_sock *tw = NULL; |
| 269 | const struct inet_connection_sock *icsk = inet_csk(sk); | 317 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 270 | const struct tcp_sock *tp = tcp_sk(sk); | 318 | const struct tcp_sock *tp = tcp_sk(sk); |
| 271 | bool recycle_ok = false; | 319 | int recycle_ok = 0; |
| 272 | 320 | ||
| 273 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) | 321 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) |
| 274 | recycle_ok = tcp_remember_stamp(sk); | 322 | recycle_ok = tcp_remember_stamp(sk); |
| @@ -279,9 +327,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
| 279 | if (tw != NULL) { | 327 | if (tw != NULL) { |
| 280 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 328 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
| 281 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); | 329 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); |
| 282 | struct inet_sock *inet = inet_sk(sk); | ||
| 283 | 330 | ||
| 284 | tw->tw_transparent = inet->transparent; | 331 | tw->tw_transparent = inet_sk(sk)->transparent; |
| 285 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; | 332 | tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; |
| 286 | tcptw->tw_rcv_nxt = tp->rcv_nxt; | 333 | tcptw->tw_rcv_nxt = tp->rcv_nxt; |
| 287 | tcptw->tw_snd_nxt = tp->snd_nxt; | 334 | tcptw->tw_snd_nxt = tp->snd_nxt; |
| @@ -289,16 +336,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
| 289 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; | 336 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; |
| 290 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | 337 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; |
| 291 | 338 | ||
| 292 | #if IS_ENABLED(CONFIG_IPV6) | 339 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 293 | if (tw->tw_family == PF_INET6) { | 340 | if (tw->tw_family == PF_INET6) { |
| 294 | struct ipv6_pinfo *np = inet6_sk(sk); | 341 | struct ipv6_pinfo *np = inet6_sk(sk); |
| 295 | struct inet6_timewait_sock *tw6; | 342 | struct inet6_timewait_sock *tw6; |
| 296 | 343 | ||
| 297 | tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); | 344 | tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); |
| 298 | tw6 = inet6_twsk((struct sock *)tw); | 345 | tw6 = inet6_twsk((struct sock *)tw); |
| 299 | tw6->tw_v6_daddr = np->daddr; | 346 | ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); |
| 300 | tw6->tw_v6_rcv_saddr = np->rcv_saddr; | 347 | ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); |
| 301 | tw->tw_tclass = np->tclass; | ||
| 302 | tw->tw_ipv6only = np->ipv6only; | 348 | tw->tw_ipv6only = np->ipv6only; |
| 303 | } | 349 | } |
| 304 | #endif | 350 | #endif |
| @@ -312,11 +358,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
| 312 | */ | 358 | */ |
| 313 | do { | 359 | do { |
| 314 | struct tcp_md5sig_key *key; | 360 | struct tcp_md5sig_key *key; |
| 315 | tcptw->tw_md5_key = NULL; | 361 | memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key)); |
| 362 | tcptw->tw_md5_keylen = 0; | ||
| 316 | key = tp->af_specific->md5_lookup(sk, sk); | 363 | key = tp->af_specific->md5_lookup(sk, sk); |
| 317 | if (key != NULL) { | 364 | if (key != NULL) { |
| 318 | tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); | 365 | memcpy(&tcptw->tw_md5_key, key->key, key->keylen); |
| 319 | if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL) | 366 | tcptw->tw_md5_keylen = key->keylen; |
| 367 | if (tcp_alloc_md5sig_pool(sk) == NULL) | ||
| 320 | BUG(); | 368 | BUG(); |
| 321 | } | 369 | } |
| 322 | } while (0); | 370 | } while (0); |
| @@ -356,11 +404,8 @@ void tcp_twsk_destructor(struct sock *sk) | |||
| 356 | { | 404 | { |
| 357 | #ifdef CONFIG_TCP_MD5SIG | 405 | #ifdef CONFIG_TCP_MD5SIG |
| 358 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); | 406 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); |
| 359 | 407 | if (twsk->tw_md5_keylen) | |
| 360 | if (twsk->tw_md5_key) { | ||
| 361 | tcp_free_md5sig_pool(); | 408 | tcp_free_md5sig_pool(); |
| 362 | kfree_rcu(twsk->tw_md5_key, rcu); | ||
| 363 | } | ||
| 364 | #endif | 409 | #endif |
| 365 | } | 410 | } |
| 366 | EXPORT_SYMBOL_GPL(tcp_twsk_destructor); | 411 | EXPORT_SYMBOL_GPL(tcp_twsk_destructor); |
| @@ -379,7 +424,7 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, | |||
| 379 | */ | 424 | */ |
| 380 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) | 425 | struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) |
| 381 | { | 426 | { |
| 382 | struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); | 427 | struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); |
| 383 | 428 | ||
| 384 | if (newsk != NULL) { | 429 | if (newsk != NULL) { |
| 385 | const struct inet_request_sock *ireq = inet_rsk(req); | 430 | const struct inet_request_sock *ireq = inet_rsk(req); |
| @@ -424,7 +469,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 424 | treq->snt_isn + 1 + tcp_s_data_size(oldtp); | 469 | treq->snt_isn + 1 + tcp_s_data_size(oldtp); |
| 425 | 470 | ||
| 426 | tcp_prequeue_init(newtp); | 471 | tcp_prequeue_init(newtp); |
| 427 | INIT_LIST_HEAD(&newtp->tsq_node); | ||
| 428 | 472 | ||
| 429 | tcp_init_wl(newtp, treq->rcv_isn); | 473 | tcp_init_wl(newtp, treq->rcv_isn); |
| 430 | 474 | ||
| @@ -437,7 +481,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 437 | newtp->sacked_out = 0; | 481 | newtp->sacked_out = 0; |
| 438 | newtp->fackets_out = 0; | 482 | newtp->fackets_out = 0; |
| 439 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | 483 | newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
| 440 | tcp_enable_early_retrans(newtp); | ||
| 441 | 484 | ||
| 442 | /* So many TCP implementations out there (incorrectly) count the | 485 | /* So many TCP implementations out there (incorrectly) count the |
| 443 | * initial SYN frame in their delayed-ACK and congestion control | 486 | * initial SYN frame in their delayed-ACK and congestion control |
| @@ -451,9 +494,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 451 | newtp->frto_counter = 0; | 494 | newtp->frto_counter = 0; |
| 452 | newtp->frto_highmark = 0; | 495 | newtp->frto_highmark = 0; |
| 453 | 496 | ||
| 454 | if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && | 497 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; |
| 455 | !try_module_get(newicsk->icsk_ca_ops->owner)) | ||
| 456 | newicsk->icsk_ca_ops = &tcp_init_congestion_ops; | ||
| 457 | 498 | ||
| 458 | tcp_set_ca_state(newsk, TCP_CA_Open); | 499 | tcp_set_ca_state(newsk, TCP_CA_Open); |
| 459 | tcp_init_xmit_timers(newsk); | 500 | tcp_init_xmit_timers(newsk); |
| @@ -509,8 +550,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 509 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; | 550 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
| 510 | newtp->rx_opt.mss_clamp = req->mss; | 551 | newtp->rx_opt.mss_clamp = req->mss; |
| 511 | TCP_ECN_openreq_child(newtp, req); | 552 | TCP_ECN_openreq_child(newtp, req); |
| 512 | newtp->fastopen_rsk = NULL; | ||
| 513 | newtp->syn_data_acked = 0; | ||
| 514 | 553 | ||
| 515 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); | 554 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); |
| 516 | } | 555 | } |
| @@ -519,33 +558,24 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 519 | EXPORT_SYMBOL(tcp_create_openreq_child); | 558 | EXPORT_SYMBOL(tcp_create_openreq_child); |
| 520 | 559 | ||
| 521 | /* | 560 | /* |
| 522 | * Process an incoming packet for SYN_RECV sockets represented as a | 561 | * Process an incoming packet for SYN_RECV sockets represented |
| 523 | * request_sock. Normally sk is the listener socket but for TFO it | 562 | * as a request_sock. |
| 524 | * points to the child socket. | ||
| 525 | * | ||
| 526 | * XXX (TFO) - The current impl contains a special check for ack | ||
| 527 | * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? | ||
| 528 | * | ||
| 529 | * We don't need to initialize tmp_opt.sack_ok as we don't use the results | ||
| 530 | */ | 563 | */ |
| 531 | 564 | ||
| 532 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | 565 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, |
| 533 | struct request_sock *req, | 566 | struct request_sock *req, |
| 534 | struct request_sock **prev, | 567 | struct request_sock **prev) |
| 535 | bool fastopen) | ||
| 536 | { | 568 | { |
| 537 | struct tcp_options_received tmp_opt; | 569 | struct tcp_options_received tmp_opt; |
| 538 | const u8 *hash_location; | 570 | u8 *hash_location; |
| 539 | struct sock *child; | 571 | struct sock *child; |
| 540 | const struct tcphdr *th = tcp_hdr(skb); | 572 | const struct tcphdr *th = tcp_hdr(skb); |
| 541 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 573 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); |
| 542 | bool paws_reject = false; | 574 | int paws_reject = 0; |
| 543 | |||
| 544 | BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); | ||
| 545 | 575 | ||
| 546 | tmp_opt.saw_tstamp = 0; | 576 | tmp_opt.saw_tstamp = 0; |
| 547 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 577 | if (th->doff > (sizeof(struct tcphdr)>>2)) { |
| 548 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); | 578 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0); |
| 549 | 579 | ||
| 550 | if (tmp_opt.saw_tstamp) { | 580 | if (tmp_opt.saw_tstamp) { |
| 551 | tmp_opt.ts_recent = req->ts_recent; | 581 | tmp_opt.ts_recent = req->ts_recent; |
| @@ -553,7 +583,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
| 553 | * it can be estimated (approximately) | 583 | * it can be estimated (approximately) |
| 554 | * from another data. | 584 | * from another data. |
| 555 | */ | 585 | */ |
| 556 | tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); | 586 | tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); |
| 557 | paws_reject = tcp_paws_reject(&tmp_opt, th->rst); | 587 | paws_reject = tcp_paws_reject(&tmp_opt, th->rst); |
| 558 | } | 588 | } |
| 559 | } | 589 | } |
| @@ -578,11 +608,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
| 578 | * | 608 | * |
| 579 | * Enforce "SYN-ACK" according to figure 8, figure 6 | 609 | * Enforce "SYN-ACK" according to figure 8, figure 6 |
| 580 | * of RFC793, fixed by RFC1122. | 610 | * of RFC793, fixed by RFC1122. |
| 581 | * | ||
| 582 | * Note that even if there is new data in the SYN packet | ||
| 583 | * they will be thrown away too. | ||
| 584 | */ | 611 | */ |
| 585 | inet_rtx_syn_ack(sk, req); | 612 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); |
| 586 | return NULL; | 613 | return NULL; |
| 587 | } | 614 | } |
| 588 | 615 | ||
| @@ -638,12 +665,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
| 638 | * sent (the segment carries an unacceptable ACK) ... | 665 | * sent (the segment carries an unacceptable ACK) ... |
| 639 | * a reset is sent." | 666 | * a reset is sent." |
| 640 | * | 667 | * |
| 641 | * Invalid ACK: reset will be sent by listening socket. | 668 | * Invalid ACK: reset will be sent by listening socket |
| 642 | * Note that the ACK validity check for a Fast Open socket is done | ||
| 643 | * elsewhere and is checked directly against the child socket rather | ||
| 644 | * than req because user data may have been sent out. | ||
| 645 | */ | 669 | */ |
| 646 | if ((flg & TCP_FLAG_ACK) && !fastopen && | 670 | if ((flg & TCP_FLAG_ACK) && |
| 647 | (TCP_SKB_CB(skb)->ack_seq != | 671 | (TCP_SKB_CB(skb)->ack_seq != |
| 648 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) | 672 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) |
| 649 | return sk; | 673 | return sk; |
| @@ -656,7 +680,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
| 656 | /* RFC793: "first check sequence number". */ | 680 | /* RFC793: "first check sequence number". */ |
| 657 | 681 | ||
| 658 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 682 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
| 659 | tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { | 683 | tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { |
| 660 | /* Out of window: send ACK and drop. */ | 684 | /* Out of window: send ACK and drop. */ |
| 661 | if (!(flg & TCP_FLAG_RST)) | 685 | if (!(flg & TCP_FLAG_RST)) |
| 662 | req->rsk_ops->send_ack(sk, skb, req); | 686 | req->rsk_ops->send_ack(sk, skb, req); |
| @@ -667,7 +691,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
| 667 | 691 | ||
| 668 | /* In sequence, PAWS is OK. */ | 692 | /* In sequence, PAWS is OK. */ |
| 669 | 693 | ||
| 670 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) | 694 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) |
| 671 | req->ts_recent = tmp_opt.rcv_tsval; | 695 | req->ts_recent = tmp_opt.rcv_tsval; |
| 672 | 696 | ||
| 673 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { | 697 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { |
| @@ -686,32 +710,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
| 686 | 710 | ||
| 687 | /* ACK sequence verified above, just make sure ACK is | 711 | /* ACK sequence verified above, just make sure ACK is |
| 688 | * set. If ACK not set, just silently drop the packet. | 712 | * set. If ACK not set, just silently drop the packet. |
| 689 | * | ||
| 690 | * XXX (TFO) - if we ever allow "data after SYN", the | ||
| 691 | * following check needs to be removed. | ||
| 692 | */ | 713 | */ |
| 693 | if (!(flg & TCP_FLAG_ACK)) | 714 | if (!(flg & TCP_FLAG_ACK)) |
| 694 | return NULL; | 715 | return NULL; |
| 695 | 716 | ||
| 696 | /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */ | ||
| 697 | if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) | ||
| 698 | tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; | ||
| 699 | else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */ | ||
| 700 | tcp_rsk(req)->snt_synack = 0; | ||
| 701 | |||
| 702 | /* For Fast Open no more processing is needed (sk is the | ||
| 703 | * child socket). | ||
| 704 | */ | ||
| 705 | if (fastopen) | ||
| 706 | return sk; | ||
| 707 | |||
| 708 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ | 717 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ |
| 709 | if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | 718 | if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
| 710 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 719 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { |
| 711 | inet_rsk(req)->acked = 1; | 720 | inet_rsk(req)->acked = 1; |
| 712 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); | 721 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); |
| 713 | return NULL; | 722 | return NULL; |
| 714 | } | 723 | } |
| 724 | if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) | ||
| 725 | tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; | ||
| 726 | else if (req->retrans) /* don't take RTT sample if retrans && ~TS */ | ||
| 727 | tcp_rsk(req)->snt_synack = 0; | ||
| 715 | 728 | ||
| 716 | /* OK, ACK is valid, create big socket and | 729 | /* OK, ACK is valid, create big socket and |
| 717 | * feed this segment to it. It will repeat all | 730 | * feed this segment to it. It will repeat all |
| @@ -736,21 +749,11 @@ listen_overflow: | |||
| 736 | } | 749 | } |
| 737 | 750 | ||
| 738 | embryonic_reset: | 751 | embryonic_reset: |
| 739 | if (!(flg & TCP_FLAG_RST)) { | 752 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); |
| 740 | /* Received a bad SYN pkt - for TFO We try not to reset | 753 | if (!(flg & TCP_FLAG_RST)) |
| 741 | * the local connection unless it's really necessary to | ||
| 742 | * avoid becoming vulnerable to outside attack aiming at | ||
| 743 | * resetting legit local connections. | ||
| 744 | */ | ||
| 745 | req->rsk_ops->send_reset(sk, skb); | 754 | req->rsk_ops->send_reset(sk, skb); |
| 746 | } else if (fastopen) { /* received a valid RST pkt */ | 755 | |
| 747 | reqsk_fastopen_remove(sk, req, true); | 756 | inet_csk_reqsk_queue_drop(sk, req, prev); |
| 748 | tcp_reset(sk); | ||
| 749 | } | ||
| 750 | if (!fastopen) { | ||
| 751 | inet_csk_reqsk_queue_drop(sk, req, prev); | ||
| 752 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | ||
| 753 | } | ||
| 754 | return NULL; | 757 | return NULL; |
| 755 | } | 758 | } |
| 756 | EXPORT_SYMBOL(tcp_check_req); | 759 | EXPORT_SYMBOL(tcp_check_req); |
| @@ -759,12 +762,6 @@ EXPORT_SYMBOL(tcp_check_req); | |||
| 759 | * Queue segment on the new socket if the new socket is active, | 762 | * Queue segment on the new socket if the new socket is active, |
| 760 | * otherwise we just shortcircuit this and continue with | 763 | * otherwise we just shortcircuit this and continue with |
| 761 | * the new socket. | 764 | * the new socket. |
| 762 | * | ||
| 763 | * For the vast majority of cases child->sk_state will be TCP_SYN_RECV | ||
| 764 | * when entering. But other states are possible due to a race condition | ||
| 765 | * where after __inet_lookup_established() fails but before the listener | ||
| 766 | * lock is obtained, other packets cause the same connection to | ||
| 767 | * be created. | ||
| 768 | */ | 765 | */ |
| 769 | 766 | ||
| 770 | int tcp_child_process(struct sock *parent, struct sock *child, | 767 | int tcp_child_process(struct sock *parent, struct sock *child, |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5d451593ef1..faf257b9415 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
| @@ -34,8 +34,6 @@ | |||
| 34 | * | 34 | * |
| 35 | */ | 35 | */ |
| 36 | 36 | ||
| 37 | #define pr_fmt(fmt) "TCP: " fmt | ||
| 38 | |||
| 39 | #include <net/tcp.h> | 37 | #include <net/tcp.h> |
| 40 | 38 | ||
| 41 | #include <linux/compiler.h> | 39 | #include <linux/compiler.h> |
| @@ -50,9 +48,6 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; | |||
| 50 | */ | 48 | */ |
| 51 | int sysctl_tcp_workaround_signed_windows __read_mostly = 0; | 49 | int sysctl_tcp_workaround_signed_windows __read_mostly = 0; |
| 52 | 50 | ||
| 53 | /* Default TSQ limit of two TSO segments */ | ||
| 54 | int sysctl_tcp_limit_output_bytes __read_mostly = 131072; | ||
| 55 | |||
| 56 | /* This limits the percentage of the congestion window which we | 51 | /* This limits the percentage of the congestion window which we |
| 57 | * will allow a single TSO frame to consume. Building TSO frames | 52 | * will allow a single TSO frame to consume. Building TSO frames |
| 58 | * which are too large can cause TCP streams to be bursty. | 53 | * which are too large can cause TCP streams to be bursty. |
| @@ -68,11 +63,9 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | |||
| 68 | int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ | 63 | int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ |
| 69 | EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); | 64 | EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); |
| 70 | 65 | ||
| 71 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | ||
| 72 | int push_one, gfp_t gfp); | ||
| 73 | 66 | ||
| 74 | /* Account for new data that has been sent to the network. */ | 67 | /* Account for new data that has been sent to the network. */ |
| 75 | static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | 68 | static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) |
| 76 | { | 69 | { |
| 77 | struct tcp_sock *tp = tcp_sk(sk); | 70 | struct tcp_sock *tp = tcp_sk(sk); |
| 78 | unsigned int prior_packets = tp->packets_out; | 71 | unsigned int prior_packets = tp->packets_out; |
| @@ -85,8 +78,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | |||
| 85 | tp->frto_counter = 3; | 78 | tp->frto_counter = 3; |
| 86 | 79 | ||
| 87 | tp->packets_out += tcp_skb_pcount(skb); | 80 | tp->packets_out += tcp_skb_pcount(skb); |
| 88 | if (!prior_packets || tp->early_retrans_delayed) | 81 | if (!prior_packets) |
| 89 | tcp_rearm_rto(sk); | 82 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 83 | inet_csk(sk)->icsk_rto, TCP_RTO_MAX); | ||
| 90 | } | 84 | } |
| 91 | 85 | ||
| 92 | /* SND.NXT, if window was not shrunk. | 86 | /* SND.NXT, if window was not shrunk. |
| @@ -95,9 +89,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) | |||
| 95 | * Anything in between SND.UNA...SND.UNA+SND.WND also can be already | 89 | * Anything in between SND.UNA...SND.UNA+SND.WND also can be already |
| 96 | * invalid. OK, let's make this for now: | 90 | * invalid. OK, let's make this for now: |
| 97 | */ | 91 | */ |
| 98 | static inline __u32 tcp_acceptable_seq(const struct sock *sk) | 92 | static inline __u32 tcp_acceptable_seq(struct sock *sk) |
| 99 | { | 93 | { |
| 100 | const struct tcp_sock *tp = tcp_sk(sk); | 94 | struct tcp_sock *tp = tcp_sk(sk); |
| 101 | 95 | ||
| 102 | if (!before(tcp_wnd_end(tp), tp->snd_nxt)) | 96 | if (!before(tcp_wnd_end(tp), tp->snd_nxt)) |
| 103 | return tp->snd_nxt; | 97 | return tp->snd_nxt; |
| @@ -122,7 +116,7 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk) | |||
| 122 | static __u16 tcp_advertise_mss(struct sock *sk) | 116 | static __u16 tcp_advertise_mss(struct sock *sk) |
| 123 | { | 117 | { |
| 124 | struct tcp_sock *tp = tcp_sk(sk); | 118 | struct tcp_sock *tp = tcp_sk(sk); |
| 125 | const struct dst_entry *dst = __sk_dst_get(sk); | 119 | struct dst_entry *dst = __sk_dst_get(sk); |
| 126 | int mss = tp->advmss; | 120 | int mss = tp->advmss; |
| 127 | 121 | ||
| 128 | if (dst) { | 122 | if (dst) { |
| @@ -139,7 +133,7 @@ static __u16 tcp_advertise_mss(struct sock *sk) | |||
| 139 | 133 | ||
| 140 | /* RFC2861. Reset CWND after idle period longer than RTO to "restart window". | 134 | /* RFC2861. Reset CWND after idle period longer than RTO to "restart window". |
| 141 | * This is the first part of cwnd validation mechanism. */ | 135 | * This is the first part of cwnd validation mechanism. */ |
| 142 | static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) | 136 | static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) |
| 143 | { | 137 | { |
| 144 | struct tcp_sock *tp = tcp_sk(sk); | 138 | struct tcp_sock *tp = tcp_sk(sk); |
| 145 | s32 delta = tcp_time_stamp - tp->lsndtime; | 139 | s32 delta = tcp_time_stamp - tp->lsndtime; |
| @@ -160,7 +154,7 @@ static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) | |||
| 160 | 154 | ||
| 161 | /* Congestion state accounting after a packet has been sent. */ | 155 | /* Congestion state accounting after a packet has been sent. */ |
| 162 | static void tcp_event_data_sent(struct tcp_sock *tp, | 156 | static void tcp_event_data_sent(struct tcp_sock *tp, |
| 163 | struct sock *sk) | 157 | struct sk_buff *skb, struct sock *sk) |
| 164 | { | 158 | { |
| 165 | struct inet_connection_sock *icsk = inet_csk(sk); | 159 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 166 | const u32 now = tcp_time_stamp; | 160 | const u32 now = tcp_time_stamp; |
| @@ -301,11 +295,11 @@ static u16 tcp_select_window(struct sock *sk) | |||
| 301 | } | 295 | } |
| 302 | 296 | ||
| 303 | /* Packet ECN state for a SYN-ACK */ | 297 | /* Packet ECN state for a SYN-ACK */ |
| 304 | static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) | 298 | static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) |
| 305 | { | 299 | { |
| 306 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; | 300 | TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR; |
| 307 | if (!(tp->ecn_flags & TCP_ECN_OK)) | 301 | if (!(tp->ecn_flags & TCP_ECN_OK)) |
| 308 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; | 302 | TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE; |
| 309 | } | 303 | } |
| 310 | 304 | ||
| 311 | /* Packet ECN state for a SYN. */ | 305 | /* Packet ECN state for a SYN. */ |
| @@ -315,13 +309,13 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) | |||
| 315 | 309 | ||
| 316 | tp->ecn_flags = 0; | 310 | tp->ecn_flags = 0; |
| 317 | if (sysctl_tcp_ecn == 1) { | 311 | if (sysctl_tcp_ecn == 1) { |
| 318 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; | 312 | TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR; |
| 319 | tp->ecn_flags = TCP_ECN_OK; | 313 | tp->ecn_flags = TCP_ECN_OK; |
| 320 | } | 314 | } |
| 321 | } | 315 | } |
| 322 | 316 | ||
| 323 | static __inline__ void | 317 | static __inline__ void |
| 324 | TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) | 318 | TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) |
| 325 | { | 319 | { |
| 326 | if (inet_rsk(req)->ecn_ok) | 320 | if (inet_rsk(req)->ecn_ok) |
| 327 | th->ece = 1; | 321 | th->ece = 1; |
| @@ -362,7 +356,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) | |||
| 362 | skb->ip_summed = CHECKSUM_PARTIAL; | 356 | skb->ip_summed = CHECKSUM_PARTIAL; |
| 363 | skb->csum = 0; | 357 | skb->csum = 0; |
| 364 | 358 | ||
| 365 | TCP_SKB_CB(skb)->tcp_flags = flags; | 359 | TCP_SKB_CB(skb)->flags = flags; |
| 366 | TCP_SKB_CB(skb)->sacked = 0; | 360 | TCP_SKB_CB(skb)->sacked = 0; |
| 367 | 361 | ||
| 368 | skb_shinfo(skb)->gso_segs = 1; | 362 | skb_shinfo(skb)->gso_segs = 1; |
| @@ -375,7 +369,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) | |||
| 375 | TCP_SKB_CB(skb)->end_seq = seq; | 369 | TCP_SKB_CB(skb)->end_seq = seq; |
| 376 | } | 370 | } |
| 377 | 371 | ||
| 378 | static inline bool tcp_urg_mode(const struct tcp_sock *tp) | 372 | static inline int tcp_urg_mode(const struct tcp_sock *tp) |
| 379 | { | 373 | { |
| 380 | return tp->snd_una != tp->snd_up; | 374 | return tp->snd_una != tp->snd_up; |
| 381 | } | 375 | } |
| @@ -385,17 +379,15 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) | |||
| 385 | #define OPTION_MD5 (1 << 2) | 379 | #define OPTION_MD5 (1 << 2) |
| 386 | #define OPTION_WSCALE (1 << 3) | 380 | #define OPTION_WSCALE (1 << 3) |
| 387 | #define OPTION_COOKIE_EXTENSION (1 << 4) | 381 | #define OPTION_COOKIE_EXTENSION (1 << 4) |
| 388 | #define OPTION_FAST_OPEN_COOKIE (1 << 8) | ||
| 389 | 382 | ||
| 390 | struct tcp_out_options { | 383 | struct tcp_out_options { |
| 391 | u16 options; /* bit field of OPTION_* */ | 384 | u8 options; /* bit field of OPTION_* */ |
| 392 | u16 mss; /* 0 to disable */ | ||
| 393 | u8 ws; /* window scale, 0 to disable */ | 385 | u8 ws; /* window scale, 0 to disable */ |
| 394 | u8 num_sack_blocks; /* number of SACK blocks to include */ | 386 | u8 num_sack_blocks; /* number of SACK blocks to include */ |
| 395 | u8 hash_size; /* bytes in hash_location */ | 387 | u8 hash_size; /* bytes in hash_location */ |
| 396 | __u8 *hash_location; /* temporary pointer, overloaded */ | 388 | u16 mss; /* 0 to disable */ |
| 397 | __u32 tsval, tsecr; /* need to include OPTION_TS */ | 389 | __u32 tsval, tsecr; /* need to include OPTION_TS */ |
| 398 | struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ | 390 | __u8 *hash_location; /* temporary pointer, overloaded */ |
| 399 | }; | 391 | }; |
| 400 | 392 | ||
| 401 | /* The sysctl int routines are generic, so check consistency here. | 393 | /* The sysctl int routines are generic, so check consistency here. |
| @@ -444,7 +436,7 @@ static u8 tcp_cookie_size_check(u8 desired) | |||
| 444 | static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | 436 | static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, |
| 445 | struct tcp_out_options *opts) | 437 | struct tcp_out_options *opts) |
| 446 | { | 438 | { |
| 447 | u16 options = opts->options; /* mungable copy */ | 439 | u8 options = opts->options; /* mungable copy */ |
| 448 | 440 | ||
| 449 | /* Having both authentication and cookies for security is redundant, | 441 | /* Having both authentication and cookies for security is redundant, |
| 450 | * and there's certainly not enough room. Instead, the cookie-less | 442 | * and there's certainly not enough room. Instead, the cookie-less |
| @@ -566,37 +558,20 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | |||
| 566 | 558 | ||
| 567 | tp->rx_opt.dsack = 0; | 559 | tp->rx_opt.dsack = 0; |
| 568 | } | 560 | } |
| 569 | |||
| 570 | if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { | ||
| 571 | struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; | ||
| 572 | |||
| 573 | *ptr++ = htonl((TCPOPT_EXP << 24) | | ||
| 574 | ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | | ||
| 575 | TCPOPT_FASTOPEN_MAGIC); | ||
| 576 | |||
| 577 | memcpy(ptr, foc->val, foc->len); | ||
| 578 | if ((foc->len & 3) == 2) { | ||
| 579 | u8 *align = ((u8 *)ptr) + foc->len; | ||
| 580 | align[0] = align[1] = TCPOPT_NOP; | ||
| 581 | } | ||
| 582 | ptr += (foc->len + 3) >> 2; | ||
| 583 | } | ||
| 584 | } | 561 | } |
| 585 | 562 | ||
| 586 | /* Compute TCP options for SYN packets. This is not the final | 563 | /* Compute TCP options for SYN packets. This is not the final |
| 587 | * network wire format yet. | 564 | * network wire format yet. |
| 588 | */ | 565 | */ |
| 589 | static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | 566 | static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, |
| 590 | struct tcp_out_options *opts, | 567 | struct tcp_out_options *opts, |
| 591 | struct tcp_md5sig_key **md5) | 568 | struct tcp_md5sig_key **md5) { |
| 592 | { | ||
| 593 | struct tcp_sock *tp = tcp_sk(sk); | 569 | struct tcp_sock *tp = tcp_sk(sk); |
| 594 | struct tcp_cookie_values *cvp = tp->cookie_values; | 570 | struct tcp_cookie_values *cvp = tp->cookie_values; |
| 595 | unsigned int remaining = MAX_TCP_OPTION_SPACE; | 571 | unsigned remaining = MAX_TCP_OPTION_SPACE; |
| 596 | u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? | 572 | u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? |
| 597 | tcp_cookie_size_check(cvp->cookie_desired) : | 573 | tcp_cookie_size_check(cvp->cookie_desired) : |
| 598 | 0; | 574 | 0; |
| 599 | struct tcp_fastopen_request *fastopen = tp->fastopen_req; | ||
| 600 | 575 | ||
| 601 | #ifdef CONFIG_TCP_MD5SIG | 576 | #ifdef CONFIG_TCP_MD5SIG |
| 602 | *md5 = tp->af_specific->md5_lookup(sk, sk); | 577 | *md5 = tp->af_specific->md5_lookup(sk, sk); |
| @@ -637,16 +612,6 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
| 637 | remaining -= TCPOLEN_SACKPERM_ALIGNED; | 612 | remaining -= TCPOLEN_SACKPERM_ALIGNED; |
| 638 | } | 613 | } |
| 639 | 614 | ||
| 640 | if (fastopen && fastopen->cookie.len >= 0) { | ||
| 641 | u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; | ||
| 642 | need = (need + 3) & ~3U; /* Align to 32 bits */ | ||
| 643 | if (remaining >= need) { | ||
| 644 | opts->options |= OPTION_FAST_OPEN_COOKIE; | ||
| 645 | opts->fastopen_cookie = &fastopen->cookie; | ||
| 646 | remaining -= need; | ||
| 647 | tp->syn_fastopen = 1; | ||
| 648 | } | ||
| 649 | } | ||
| 650 | /* Note that timestamps are required by the specification. | 615 | /* Note that timestamps are required by the specification. |
| 651 | * | 616 | * |
| 652 | * Odd numbers of bytes are prohibited by the specification, ensuring | 617 | * Odd numbers of bytes are prohibited by the specification, ensuring |
| @@ -697,16 +662,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |||
| 697 | } | 662 | } |
| 698 | 663 | ||
| 699 | /* Set up TCP options for SYN-ACKs. */ | 664 | /* Set up TCP options for SYN-ACKs. */ |
| 700 | static unsigned int tcp_synack_options(struct sock *sk, | 665 | static unsigned tcp_synack_options(struct sock *sk, |
| 701 | struct request_sock *req, | 666 | struct request_sock *req, |
| 702 | unsigned int mss, struct sk_buff *skb, | 667 | unsigned mss, struct sk_buff *skb, |
| 703 | struct tcp_out_options *opts, | 668 | struct tcp_out_options *opts, |
| 704 | struct tcp_md5sig_key **md5, | 669 | struct tcp_md5sig_key **md5, |
| 705 | struct tcp_extend_values *xvp, | 670 | struct tcp_extend_values *xvp) |
| 706 | struct tcp_fastopen_cookie *foc) | ||
| 707 | { | 671 | { |
| 708 | struct inet_request_sock *ireq = inet_rsk(req); | 672 | struct inet_request_sock *ireq = inet_rsk(req); |
| 709 | unsigned int remaining = MAX_TCP_OPTION_SPACE; | 673 | unsigned remaining = MAX_TCP_OPTION_SPACE; |
| 710 | u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? | 674 | u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? |
| 711 | xvp->cookie_plus : | 675 | xvp->cookie_plus : |
| 712 | 0; | 676 | 0; |
| @@ -748,15 +712,7 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
| 748 | if (unlikely(!ireq->tstamp_ok)) | 712 | if (unlikely(!ireq->tstamp_ok)) |
| 749 | remaining -= TCPOLEN_SACKPERM_ALIGNED; | 713 | remaining -= TCPOLEN_SACKPERM_ALIGNED; |
| 750 | } | 714 | } |
| 751 | if (foc != NULL) { | 715 | |
| 752 | u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; | ||
| 753 | need = (need + 3) & ~3U; /* Align to 32 bits */ | ||
| 754 | if (remaining >= need) { | ||
| 755 | opts->options |= OPTION_FAST_OPEN_COOKIE; | ||
| 756 | opts->fastopen_cookie = foc; | ||
| 757 | remaining -= need; | ||
| 758 | } | ||
| 759 | } | ||
| 760 | /* Similar rationale to tcp_syn_options() applies here, too. | 716 | /* Similar rationale to tcp_syn_options() applies here, too. |
| 761 | * If the <SYN> options fit, the same options should fit now! | 717 | * If the <SYN> options fit, the same options should fit now! |
| 762 | */ | 718 | */ |
| @@ -785,13 +741,12 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
| 785 | /* Compute TCP options for ESTABLISHED sockets. This is not the | 741 | /* Compute TCP options for ESTABLISHED sockets. This is not the |
| 786 | * final wire format yet. | 742 | * final wire format yet. |
| 787 | */ | 743 | */ |
| 788 | static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, | 744 | static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, |
| 789 | struct tcp_out_options *opts, | 745 | struct tcp_out_options *opts, |
| 790 | struct tcp_md5sig_key **md5) | 746 | struct tcp_md5sig_key **md5) { |
| 791 | { | ||
| 792 | struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; | 747 | struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; |
| 793 | struct tcp_sock *tp = tcp_sk(sk); | 748 | struct tcp_sock *tp = tcp_sk(sk); |
| 794 | unsigned int size = 0; | 749 | unsigned size = 0; |
| 795 | unsigned int eff_sacks; | 750 | unsigned int eff_sacks; |
| 796 | 751 | ||
| 797 | #ifdef CONFIG_TCP_MD5SIG | 752 | #ifdef CONFIG_TCP_MD5SIG |
| @@ -813,9 +768,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb | |||
| 813 | 768 | ||
| 814 | eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; | 769 | eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; |
| 815 | if (unlikely(eff_sacks)) { | 770 | if (unlikely(eff_sacks)) { |
| 816 | const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; | 771 | const unsigned remaining = MAX_TCP_OPTION_SPACE - size; |
| 817 | opts->num_sack_blocks = | 772 | opts->num_sack_blocks = |
| 818 | min_t(unsigned int, eff_sacks, | 773 | min_t(unsigned, eff_sacks, |
| 819 | (remaining - TCPOLEN_SACK_BASE_ALIGNED) / | 774 | (remaining - TCPOLEN_SACK_BASE_ALIGNED) / |
| 820 | TCPOLEN_SACK_PERBLOCK); | 775 | TCPOLEN_SACK_PERBLOCK); |
| 821 | size += TCPOLEN_SACK_BASE_ALIGNED + | 776 | size += TCPOLEN_SACK_BASE_ALIGNED + |
| @@ -825,160 +780,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb | |||
| 825 | return size; | 780 | return size; |
| 826 | } | 781 | } |
| 827 | 782 | ||
| 828 | |||
| 829 | /* TCP SMALL QUEUES (TSQ) | ||
| 830 | * | ||
| 831 | * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) | ||
| 832 | * to reduce RTT and bufferbloat. | ||
| 833 | * We do this using a special skb destructor (tcp_wfree). | ||
| 834 | * | ||
| 835 | * It's important that tcp_wfree() can be replaced by sock_wfree() in the event skb | ||
| 836 | * needs to be reallocated in a driver. | ||
| 837 | * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc | ||
| 838 | * | ||
| 839 | * Since transmit from skb destructor is forbidden, we use a tasklet | ||
| 840 | * to process all sockets that eventually need to send more skbs. | ||
| 841 | * We use one tasklet per cpu, with its own queue of sockets. | ||
| 842 | */ | ||
| 843 | struct tsq_tasklet { | ||
| 844 | struct tasklet_struct tasklet; | ||
| 845 | struct list_head head; /* queue of tcp sockets */ | ||
| 846 | }; | ||
| 847 | static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); | ||
| 848 | |||
| 849 | static void tcp_tsq_handler(struct sock *sk) | ||
| 850 | { | ||
| 851 | if ((1 << sk->sk_state) & | ||
| 852 | (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | | ||
| 853 | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) | ||
| 854 | tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); | ||
| 855 | } | ||
| 856 | /* | ||
| 857 | * One tasklet per cpu tries to send more skbs. | ||
| 858 | * We run in tasklet context but need to disable irqs when | ||
| 859 | * transferring tsq->head because tcp_wfree() might | ||
| 860 | * interrupt us (non NAPI drivers) | ||
| 861 | */ | ||
| 862 | static void tcp_tasklet_func(unsigned long data) | ||
| 863 | { | ||
| 864 | struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; | ||
| 865 | LIST_HEAD(list); | ||
| 866 | unsigned long flags; | ||
| 867 | struct list_head *q, *n; | ||
| 868 | struct tcp_sock *tp; | ||
| 869 | struct sock *sk; | ||
| 870 | |||
| 871 | local_irq_save(flags); | ||
| 872 | list_splice_init(&tsq->head, &list); | ||
| 873 | local_irq_restore(flags); | ||
| 874 | |||
| 875 | list_for_each_safe(q, n, &list) { | ||
| 876 | tp = list_entry(q, struct tcp_sock, tsq_node); | ||
| 877 | list_del(&tp->tsq_node); | ||
| 878 | |||
| 879 | sk = (struct sock *)tp; | ||
| 880 | bh_lock_sock(sk); | ||
| 881 | |||
| 882 | if (!sock_owned_by_user(sk)) { | ||
| 883 | tcp_tsq_handler(sk); | ||
| 884 | } else { | ||
| 885 | /* defer the work to tcp_release_cb() */ | ||
| 886 | set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); | ||
| 887 | } | ||
| 888 | bh_unlock_sock(sk); | ||
| 889 | |||
| 890 | clear_bit(TSQ_QUEUED, &tp->tsq_flags); | ||
| 891 | sk_free(sk); | ||
| 892 | } | ||
| 893 | } | ||
| 894 | |||
| 895 | #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ | ||
| 896 | (1UL << TCP_WRITE_TIMER_DEFERRED) | \ | ||
| 897 | (1UL << TCP_DELACK_TIMER_DEFERRED) | \ | ||
| 898 | (1UL << TCP_MTU_REDUCED_DEFERRED)) | ||
| 899 | /** | ||
| 900 | * tcp_release_cb - tcp release_sock() callback | ||
| 901 | * @sk: socket | ||
| 902 | * | ||
| 903 | * called from release_sock() to perform protocol dependent | ||
| 904 | * actions before socket release. | ||
| 905 | */ | ||
| 906 | void tcp_release_cb(struct sock *sk) | ||
| 907 | { | ||
| 908 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 909 | unsigned long flags, nflags; | ||
| 910 | |||
| 911 | /* perform an atomic operation only if at least one flag is set */ | ||
| 912 | do { | ||
| 913 | flags = tp->tsq_flags; | ||
| 914 | if (!(flags & TCP_DEFERRED_ALL)) | ||
| 915 | return; | ||
| 916 | nflags = flags & ~TCP_DEFERRED_ALL; | ||
| 917 | } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); | ||
| 918 | |||
| 919 | if (flags & (1UL << TCP_TSQ_DEFERRED)) | ||
| 920 | tcp_tsq_handler(sk); | ||
| 921 | |||
| 922 | if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { | ||
| 923 | tcp_write_timer_handler(sk); | ||
| 924 | __sock_put(sk); | ||
| 925 | } | ||
| 926 | if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { | ||
| 927 | tcp_delack_timer_handler(sk); | ||
| 928 | __sock_put(sk); | ||
| 929 | } | ||
| 930 | if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { | ||
| 931 | sk->sk_prot->mtu_reduced(sk); | ||
| 932 | __sock_put(sk); | ||
| 933 | } | ||
| 934 | } | ||
| 935 | EXPORT_SYMBOL(tcp_release_cb); | ||
| 936 | |||
| 937 | void __init tcp_tasklet_init(void) | ||
| 938 | { | ||
| 939 | int i; | ||
| 940 | |||
| 941 | for_each_possible_cpu(i) { | ||
| 942 | struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); | ||
| 943 | |||
| 944 | INIT_LIST_HEAD(&tsq->head); | ||
| 945 | tasklet_init(&tsq->tasklet, | ||
| 946 | tcp_tasklet_func, | ||
| 947 | (unsigned long)tsq); | ||
| 948 | } | ||
| 949 | } | ||
| 950 | |||
| 951 | /* | ||
| 952 | * Write buffer destructor automatically called from kfree_skb. | ||
| 953 | * We can't xmit new skbs from this context, as we might already | ||
| 954 | * hold qdisc lock. | ||
| 955 | */ | ||
| 956 | static void tcp_wfree(struct sk_buff *skb) | ||
| 957 | { | ||
| 958 | struct sock *sk = skb->sk; | ||
| 959 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 960 | |||
| 961 | if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && | ||
| 962 | !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { | ||
| 963 | unsigned long flags; | ||
| 964 | struct tsq_tasklet *tsq; | ||
| 965 | |||
| 966 | /* Keep a ref on socket. | ||
| 967 | * This last ref will be released in tcp_tasklet_func() | ||
| 968 | */ | ||
| 969 | atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); | ||
| 970 | |||
| 971 | /* queue this socket to tasklet queue */ | ||
| 972 | local_irq_save(flags); | ||
| 973 | tsq = &__get_cpu_var(tsq_tasklet); | ||
| 974 | list_add(&tp->tsq_node, &tsq->head); | ||
| 975 | tasklet_schedule(&tsq->tasklet); | ||
| 976 | local_irq_restore(flags); | ||
| 977 | } else { | ||
| 978 | sock_wfree(skb); | ||
| 979 | } | ||
| 980 | } | ||
| 981 | |||
| 982 | /* This routine actually transmits TCP packets queued in by | 783 | /* This routine actually transmits TCP packets queued in by |
| 983 | * tcp_do_sendmsg(). This is used by both the initial | 784 | * tcp_do_sendmsg(). This is used by both the initial |
| 984 | * transmission and possible later retransmissions. | 785 | * transmission and possible later retransmissions. |
| @@ -998,7 +799,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 998 | struct tcp_sock *tp; | 799 | struct tcp_sock *tp; |
| 999 | struct tcp_skb_cb *tcb; | 800 | struct tcp_skb_cb *tcb; |
| 1000 | struct tcp_out_options opts; | 801 | struct tcp_out_options opts; |
| 1001 | unsigned int tcp_options_size, tcp_header_size; | 802 | unsigned tcp_options_size, tcp_header_size; |
| 1002 | struct tcp_md5sig_key *md5; | 803 | struct tcp_md5sig_key *md5; |
| 1003 | struct tcphdr *th; | 804 | struct tcphdr *th; |
| 1004 | int err; | 805 | int err; |
| @@ -1025,7 +826,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 1025 | tcb = TCP_SKB_CB(skb); | 826 | tcb = TCP_SKB_CB(skb); |
| 1026 | memset(&opts, 0, sizeof(opts)); | 827 | memset(&opts, 0, sizeof(opts)); |
| 1027 | 828 | ||
| 1028 | if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) | 829 | if (unlikely(tcb->flags & TCPHDR_SYN)) |
| 1029 | tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); | 830 | tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); |
| 1030 | else | 831 | else |
| 1031 | tcp_options_size = tcp_established_options(sk, skb, &opts, | 832 | tcp_options_size = tcp_established_options(sk, skb, &opts, |
| @@ -1040,12 +841,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 1040 | 841 | ||
| 1041 | skb_push(skb, tcp_header_size); | 842 | skb_push(skb, tcp_header_size); |
| 1042 | skb_reset_transport_header(skb); | 843 | skb_reset_transport_header(skb); |
| 1043 | 844 | skb_set_owner_w(skb, sk); | |
| 1044 | skb_orphan(skb); | ||
| 1045 | skb->sk = sk; | ||
| 1046 | skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? | ||
| 1047 | tcp_wfree : sock_wfree; | ||
| 1048 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); | ||
| 1049 | 845 | ||
| 1050 | /* Build TCP header and checksum it. */ | 846 | /* Build TCP header and checksum it. */ |
| 1051 | th = tcp_hdr(skb); | 847 | th = tcp_hdr(skb); |
| @@ -1054,9 +850,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 1054 | th->seq = htonl(tcb->seq); | 850 | th->seq = htonl(tcb->seq); |
| 1055 | th->ack_seq = htonl(tp->rcv_nxt); | 851 | th->ack_seq = htonl(tp->rcv_nxt); |
| 1056 | *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | | 852 | *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | |
| 1057 | tcb->tcp_flags); | 853 | tcb->flags); |
| 1058 | 854 | ||
| 1059 | if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { | 855 | if (unlikely(tcb->flags & TCPHDR_SYN)) { |
| 1060 | /* RFC1323: The window in SYN & SYN/ACK segments | 856 | /* RFC1323: The window in SYN & SYN/ACK segments |
| 1061 | * is never scaled. | 857 | * is never scaled. |
| 1062 | */ | 858 | */ |
| @@ -1079,7 +875,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 1079 | } | 875 | } |
| 1080 | 876 | ||
| 1081 | tcp_options_write((__be32 *)(th + 1), tp, &opts); | 877 | tcp_options_write((__be32 *)(th + 1), tp, &opts); |
| 1082 | if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) | 878 | if (likely((tcb->flags & TCPHDR_SYN) == 0)) |
| 1083 | TCP_ECN_send(sk, skb, tcp_header_size); | 879 | TCP_ECN_send(sk, skb, tcp_header_size); |
| 1084 | 880 | ||
| 1085 | #ifdef CONFIG_TCP_MD5SIG | 881 | #ifdef CONFIG_TCP_MD5SIG |
| @@ -1093,11 +889,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 1093 | 889 | ||
| 1094 | icsk->icsk_af_ops->send_check(sk, skb); | 890 | icsk->icsk_af_ops->send_check(sk, skb); |
| 1095 | 891 | ||
| 1096 | if (likely(tcb->tcp_flags & TCPHDR_ACK)) | 892 | if (likely(tcb->flags & TCPHDR_ACK)) |
| 1097 | tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); | 893 | tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); |
| 1098 | 894 | ||
| 1099 | if (skb->len != tcp_header_size) | 895 | if (skb->len != tcp_header_size) |
| 1100 | tcp_event_data_sent(tp, sk); | 896 | tcp_event_data_sent(tp, skb, sk); |
| 1101 | 897 | ||
| 1102 | if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) | 898 | if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) |
| 1103 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, | 899 | TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, |
| @@ -1130,7 +926,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1130 | } | 926 | } |
| 1131 | 927 | ||
| 1132 | /* Initialize TSO segments for a packet. */ | 928 | /* Initialize TSO segments for a packet. */ |
| 1133 | static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, | 929 | static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, |
| 1134 | unsigned int mss_now) | 930 | unsigned int mss_now) |
| 1135 | { | 931 | { |
| 1136 | if (skb->len <= mss_now || !sk_can_gso(sk) || | 932 | if (skb->len <= mss_now || !sk_can_gso(sk) || |
| @@ -1151,7 +947,7 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, | |||
| 1151 | /* When a modification to fackets out becomes necessary, we need to check | 947 | /* When a modification to fackets out becomes necessary, we need to check |
| 1152 | * skb is counted to fackets_out or not. | 948 | * skb is counted to fackets_out or not. |
| 1153 | */ | 949 | */ |
| 1154 | static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, | 950 | static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, |
| 1155 | int decr) | 951 | int decr) |
| 1156 | { | 952 | { |
| 1157 | struct tcp_sock *tp = tcp_sk(sk); | 953 | struct tcp_sock *tp = tcp_sk(sk); |
| @@ -1166,7 +962,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, | |||
| 1166 | /* Pcount in the middle of the write queue got changed, we need to do various | 962 | /* Pcount in the middle of the write queue got changed, we need to do various |
| 1167 | * tweaks to fix counters | 963 | * tweaks to fix counters |
| 1168 | */ | 964 | */ |
| 1169 | static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) | 965 | static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr) |
| 1170 | { | 966 | { |
| 1171 | struct tcp_sock *tp = tcp_sk(sk); | 967 | struct tcp_sock *tp = tcp_sk(sk); |
| 1172 | 968 | ||
| @@ -1236,9 +1032,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, | |||
| 1236 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; | 1032 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; |
| 1237 | 1033 | ||
| 1238 | /* PSH and FIN should only be set in the second packet. */ | 1034 | /* PSH and FIN should only be set in the second packet. */ |
| 1239 | flags = TCP_SKB_CB(skb)->tcp_flags; | 1035 | flags = TCP_SKB_CB(skb)->flags; |
| 1240 | TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); | 1036 | TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); |
| 1241 | TCP_SKB_CB(buff)->tcp_flags = flags; | 1037 | TCP_SKB_CB(buff)->flags = flags; |
| 1242 | TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; | 1038 | TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; |
| 1243 | 1039 | ||
| 1244 | if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { | 1040 | if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { |
| @@ -1295,27 +1091,17 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) | |||
| 1295 | { | 1091 | { |
| 1296 | int i, k, eat; | 1092 | int i, k, eat; |
| 1297 | 1093 | ||
| 1298 | eat = min_t(int, len, skb_headlen(skb)); | ||
| 1299 | if (eat) { | ||
| 1300 | __skb_pull(skb, eat); | ||
| 1301 | skb->avail_size -= eat; | ||
| 1302 | len -= eat; | ||
| 1303 | if (!len) | ||
| 1304 | return; | ||
| 1305 | } | ||
| 1306 | eat = len; | 1094 | eat = len; |
| 1307 | k = 0; | 1095 | k = 0; |
| 1308 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { | 1096 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
| 1309 | int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); | 1097 | if (skb_shinfo(skb)->frags[i].size <= eat) { |
| 1310 | 1098 | put_page(skb_shinfo(skb)->frags[i].page); | |
| 1311 | if (size <= eat) { | 1099 | eat -= skb_shinfo(skb)->frags[i].size; |
| 1312 | skb_frag_unref(skb, i); | ||
| 1313 | eat -= size; | ||
| 1314 | } else { | 1100 | } else { |
| 1315 | skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; | 1101 | skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; |
| 1316 | if (eat) { | 1102 | if (eat) { |
| 1317 | skb_shinfo(skb)->frags[k].page_offset += eat; | 1103 | skb_shinfo(skb)->frags[k].page_offset += eat; |
| 1318 | skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); | 1104 | skb_shinfo(skb)->frags[k].size -= eat; |
| 1319 | eat = 0; | 1105 | eat = 0; |
| 1320 | } | 1106 | } |
| 1321 | k++; | 1107 | k++; |
| @@ -1334,7 +1120,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
| 1334 | if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) | 1120 | if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) |
| 1335 | return -ENOMEM; | 1121 | return -ENOMEM; |
| 1336 | 1122 | ||
| 1337 | __pskb_trim_head(skb, len); | 1123 | /* If len == headlen, we avoid __skb_pull to preserve alignment. */ |
| 1124 | if (unlikely(len < skb_headlen(skb))) | ||
| 1125 | __skb_pull(skb, len); | ||
| 1126 | else | ||
| 1127 | __pskb_trim_head(skb, len - skb_headlen(skb)); | ||
| 1338 | 1128 | ||
| 1339 | TCP_SKB_CB(skb)->seq += len; | 1129 | TCP_SKB_CB(skb)->seq += len; |
| 1340 | skb->ip_summed = CHECKSUM_PARTIAL; | 1130 | skb->ip_summed = CHECKSUM_PARTIAL; |
| @@ -1354,8 +1144,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
| 1354 | /* Calculate MSS. Not accounting for SACKs here. */ | 1144 | /* Calculate MSS. Not accounting for SACKs here. */ |
| 1355 | int tcp_mtu_to_mss(struct sock *sk, int pmtu) | 1145 | int tcp_mtu_to_mss(struct sock *sk, int pmtu) |
| 1356 | { | 1146 | { |
| 1357 | const struct tcp_sock *tp = tcp_sk(sk); | 1147 | struct tcp_sock *tp = tcp_sk(sk); |
| 1358 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1148 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 1359 | int mss_now; | 1149 | int mss_now; |
| 1360 | 1150 | ||
| 1361 | /* Calculate base mss without TCP options: | 1151 | /* Calculate base mss without TCP options: |
| @@ -1363,14 +1153,6 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) | |||
| 1363 | */ | 1153 | */ |
| 1364 | mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); | 1154 | mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); |
| 1365 | 1155 | ||
| 1366 | /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ | ||
| 1367 | if (icsk->icsk_af_ops->net_frag_header_len) { | ||
| 1368 | const struct dst_entry *dst = __sk_dst_get(sk); | ||
| 1369 | |||
| 1370 | if (dst && dst_allfrag(dst)) | ||
| 1371 | mss_now -= icsk->icsk_af_ops->net_frag_header_len; | ||
| 1372 | } | ||
| 1373 | |||
| 1374 | /* Clamp it (mss_clamp does not include tcp options) */ | 1156 | /* Clamp it (mss_clamp does not include tcp options) */ |
| 1375 | if (mss_now > tp->rx_opt.mss_clamp) | 1157 | if (mss_now > tp->rx_opt.mss_clamp) |
| 1376 | mss_now = tp->rx_opt.mss_clamp; | 1158 | mss_now = tp->rx_opt.mss_clamp; |
| @@ -1391,8 +1173,8 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) | |||
| 1391 | /* Inverse of above */ | 1173 | /* Inverse of above */ |
| 1392 | int tcp_mss_to_mtu(struct sock *sk, int mss) | 1174 | int tcp_mss_to_mtu(struct sock *sk, int mss) |
| 1393 | { | 1175 | { |
| 1394 | const struct tcp_sock *tp = tcp_sk(sk); | 1176 | struct tcp_sock *tp = tcp_sk(sk); |
| 1395 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1177 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 1396 | int mtu; | 1178 | int mtu; |
| 1397 | 1179 | ||
| 1398 | mtu = mss + | 1180 | mtu = mss + |
| @@ -1400,13 +1182,6 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) | |||
| 1400 | icsk->icsk_ext_hdr_len + | 1182 | icsk->icsk_ext_hdr_len + |
| 1401 | icsk->icsk_af_ops->net_header_len; | 1183 | icsk->icsk_af_ops->net_header_len; |
| 1402 | 1184 | ||
| 1403 | /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ | ||
| 1404 | if (icsk->icsk_af_ops->net_frag_header_len) { | ||
| 1405 | const struct dst_entry *dst = __sk_dst_get(sk); | ||
| 1406 | |||
| 1407 | if (dst && dst_allfrag(dst)) | ||
| 1408 | mtu += icsk->icsk_af_ops->net_frag_header_len; | ||
| 1409 | } | ||
| 1410 | return mtu; | 1185 | return mtu; |
| 1411 | } | 1186 | } |
| 1412 | 1187 | ||
| @@ -1473,10 +1248,10 @@ EXPORT_SYMBOL(tcp_sync_mss); | |||
| 1473 | */ | 1248 | */ |
| 1474 | unsigned int tcp_current_mss(struct sock *sk) | 1249 | unsigned int tcp_current_mss(struct sock *sk) |
| 1475 | { | 1250 | { |
| 1476 | const struct tcp_sock *tp = tcp_sk(sk); | 1251 | struct tcp_sock *tp = tcp_sk(sk); |
| 1477 | const struct dst_entry *dst = __sk_dst_get(sk); | 1252 | struct dst_entry *dst = __sk_dst_get(sk); |
| 1478 | u32 mss_now; | 1253 | u32 mss_now; |
| 1479 | unsigned int header_len; | 1254 | unsigned header_len; |
| 1480 | struct tcp_out_options opts; | 1255 | struct tcp_out_options opts; |
| 1481 | struct tcp_md5sig_key *md5; | 1256 | struct tcp_md5sig_key *md5; |
| 1482 | 1257 | ||
| @@ -1534,22 +1309,22 @@ static void tcp_cwnd_validate(struct sock *sk) | |||
| 1534 | * modulo only when the receiver window alone is the limiting factor or | 1309 | * modulo only when the receiver window alone is the limiting factor or |
| 1535 | * when we would be allowed to send the split-due-to-Nagle skb fully. | 1310 | * when we would be allowed to send the split-due-to-Nagle skb fully. |
| 1536 | */ | 1311 | */ |
| 1537 | static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, | 1312 | static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, |
| 1538 | unsigned int mss_now, unsigned int max_segs) | 1313 | unsigned int mss_now, unsigned int cwnd) |
| 1539 | { | 1314 | { |
| 1540 | const struct tcp_sock *tp = tcp_sk(sk); | 1315 | struct tcp_sock *tp = tcp_sk(sk); |
| 1541 | u32 needed, window, max_len; | 1316 | u32 needed, window, cwnd_len; |
| 1542 | 1317 | ||
| 1543 | window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; | 1318 | window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; |
| 1544 | max_len = mss_now * max_segs; | 1319 | cwnd_len = mss_now * cwnd; |
| 1545 | 1320 | ||
| 1546 | if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) | 1321 | if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) |
| 1547 | return max_len; | 1322 | return cwnd_len; |
| 1548 | 1323 | ||
| 1549 | needed = min(skb->len, window); | 1324 | needed = min(skb->len, window); |
| 1550 | 1325 | ||
| 1551 | if (max_len <= needed) | 1326 | if (cwnd_len <= needed) |
| 1552 | return max_len; | 1327 | return cwnd_len; |
| 1553 | 1328 | ||
| 1554 | return needed - needed % mss_now; | 1329 | return needed - needed % mss_now; |
| 1555 | } | 1330 | } |
| @@ -1557,14 +1332,13 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b | |||
| 1557 | /* Can at least one segment of SKB be sent right now, according to the | 1332 | /* Can at least one segment of SKB be sent right now, according to the |
| 1558 | * congestion window rules? If so, return how many segments are allowed. | 1333 | * congestion window rules? If so, return how many segments are allowed. |
| 1559 | */ | 1334 | */ |
| 1560 | static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, | 1335 | static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, |
| 1561 | const struct sk_buff *skb) | 1336 | struct sk_buff *skb) |
| 1562 | { | 1337 | { |
| 1563 | u32 in_flight, cwnd; | 1338 | u32 in_flight, cwnd; |
| 1564 | 1339 | ||
| 1565 | /* Don't be strict about the congestion window for the final FIN. */ | 1340 | /* Don't be strict about the congestion window for the final FIN. */ |
| 1566 | if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && | 1341 | if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) |
| 1567 | tcp_skb_pcount(skb) == 1) | ||
| 1568 | return 1; | 1342 | return 1; |
| 1569 | 1343 | ||
| 1570 | in_flight = tcp_packets_in_flight(tp); | 1344 | in_flight = tcp_packets_in_flight(tp); |
| @@ -1579,7 +1353,7 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, | |||
| 1579 | * This must be invoked the first time we consider transmitting | 1353 | * This must be invoked the first time we consider transmitting |
| 1580 | * SKB onto the wire. | 1354 | * SKB onto the wire. |
| 1581 | */ | 1355 | */ |
| 1582 | static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, | 1356 | static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, |
| 1583 | unsigned int mss_now) | 1357 | unsigned int mss_now) |
| 1584 | { | 1358 | { |
| 1585 | int tso_segs = tcp_skb_pcount(skb); | 1359 | int tso_segs = tcp_skb_pcount(skb); |
| @@ -1592,33 +1366,33 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, | |||
| 1592 | } | 1366 | } |
| 1593 | 1367 | ||
| 1594 | /* Minshall's variant of the Nagle send check. */ | 1368 | /* Minshall's variant of the Nagle send check. */ |
| 1595 | static inline bool tcp_minshall_check(const struct tcp_sock *tp) | 1369 | static inline int tcp_minshall_check(const struct tcp_sock *tp) |
| 1596 | { | 1370 | { |
| 1597 | return after(tp->snd_sml, tp->snd_una) && | 1371 | return after(tp->snd_sml, tp->snd_una) && |
| 1598 | !after(tp->snd_sml, tp->snd_nxt); | 1372 | !after(tp->snd_sml, tp->snd_nxt); |
| 1599 | } | 1373 | } |
| 1600 | 1374 | ||
| 1601 | /* Return false, if packet can be sent now without violation Nagle's rules: | 1375 | /* Return 0, if packet can be sent now without violation Nagle's rules: |
| 1602 | * 1. It is full sized. | 1376 | * 1. It is full sized. |
| 1603 | * 2. Or it contains FIN. (already checked by caller) | 1377 | * 2. Or it contains FIN. (already checked by caller) |
| 1604 | * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. | 1378 | * 3. Or TCP_NODELAY was set. |
| 1605 | * 4. Or TCP_CORK is not set, and all sent packets are ACKed. | 1379 | * 4. Or TCP_CORK is not set, and all sent packets are ACKed. |
| 1606 | * With Minshall's modification: all sent small packets are ACKed. | 1380 | * With Minshall's modification: all sent small packets are ACKed. |
| 1607 | */ | 1381 | */ |
| 1608 | static inline bool tcp_nagle_check(const struct tcp_sock *tp, | 1382 | static inline int tcp_nagle_check(const struct tcp_sock *tp, |
| 1609 | const struct sk_buff *skb, | 1383 | const struct sk_buff *skb, |
| 1610 | unsigned int mss_now, int nonagle) | 1384 | unsigned mss_now, int nonagle) |
| 1611 | { | 1385 | { |
| 1612 | return skb->len < mss_now && | 1386 | return skb->len < mss_now && |
| 1613 | ((nonagle & TCP_NAGLE_CORK) || | 1387 | ((nonagle & TCP_NAGLE_CORK) || |
| 1614 | (!nonagle && tp->packets_out && tcp_minshall_check(tp))); | 1388 | (!nonagle && tp->packets_out && tcp_minshall_check(tp))); |
| 1615 | } | 1389 | } |
| 1616 | 1390 | ||
| 1617 | /* Return true if the Nagle test allows this packet to be | 1391 | /* Return non-zero if the Nagle test allows this packet to be |
| 1618 | * sent now. | 1392 | * sent now. |
| 1619 | */ | 1393 | */ |
| 1620 | static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, | 1394 | static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, |
| 1621 | unsigned int cur_mss, int nonagle) | 1395 | unsigned int cur_mss, int nonagle) |
| 1622 | { | 1396 | { |
| 1623 | /* Nagle rule does not apply to frames, which sit in the middle of the | 1397 | /* Nagle rule does not apply to frames, which sit in the middle of the |
| 1624 | * write_queue (they have no chances to get new data). | 1398 | * write_queue (they have no chances to get new data). |
| @@ -1627,25 +1401,24 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf | |||
| 1627 | * argument based upon the location of SKB in the send queue. | 1401 | * argument based upon the location of SKB in the send queue. |
| 1628 | */ | 1402 | */ |
| 1629 | if (nonagle & TCP_NAGLE_PUSH) | 1403 | if (nonagle & TCP_NAGLE_PUSH) |
| 1630 | return true; | 1404 | return 1; |
| 1631 | 1405 | ||
| 1632 | /* Don't use the nagle rule for urgent data (or for the final FIN). | 1406 | /* Don't use the nagle rule for urgent data (or for the final FIN). |
| 1633 | * Nagle can be ignored during F-RTO too (see RFC4138). | 1407 | * Nagle can be ignored during F-RTO too (see RFC4138). |
| 1634 | */ | 1408 | */ |
| 1635 | if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || | 1409 | if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || |
| 1636 | (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) | 1410 | (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)) |
| 1637 | return true; | 1411 | return 1; |
| 1638 | 1412 | ||
| 1639 | if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) | 1413 | if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) |
| 1640 | return true; | 1414 | return 1; |
| 1641 | 1415 | ||
| 1642 | return false; | 1416 | return 0; |
| 1643 | } | 1417 | } |
| 1644 | 1418 | ||
| 1645 | /* Does at least the first segment of SKB fit into the send window? */ | 1419 | /* Does at least the first segment of SKB fit into the send window? */ |
| 1646 | static bool tcp_snd_wnd_test(const struct tcp_sock *tp, | 1420 | static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, |
| 1647 | const struct sk_buff *skb, | 1421 | unsigned int cur_mss) |
| 1648 | unsigned int cur_mss) | ||
| 1649 | { | 1422 | { |
| 1650 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; | 1423 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
| 1651 | 1424 | ||
| @@ -1659,10 +1432,10 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, | |||
| 1659 | * should be put on the wire right now. If so, it returns the number of | 1432 | * should be put on the wire right now. If so, it returns the number of |
| 1660 | * packets allowed by the congestion window. | 1433 | * packets allowed by the congestion window. |
| 1661 | */ | 1434 | */ |
| 1662 | static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, | 1435 | static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, |
| 1663 | unsigned int cur_mss, int nonagle) | 1436 | unsigned int cur_mss, int nonagle) |
| 1664 | { | 1437 | { |
| 1665 | const struct tcp_sock *tp = tcp_sk(sk); | 1438 | struct tcp_sock *tp = tcp_sk(sk); |
| 1666 | unsigned int cwnd_quota; | 1439 | unsigned int cwnd_quota; |
| 1667 | 1440 | ||
| 1668 | tcp_init_tso_segs(sk, skb, cur_mss); | 1441 | tcp_init_tso_segs(sk, skb, cur_mss); |
| @@ -1678,9 +1451,9 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, | |||
| 1678 | } | 1451 | } |
| 1679 | 1452 | ||
| 1680 | /* Test if sending is allowed right now. */ | 1453 | /* Test if sending is allowed right now. */ |
| 1681 | bool tcp_may_send_now(struct sock *sk) | 1454 | int tcp_may_send_now(struct sock *sk) |
| 1682 | { | 1455 | { |
| 1683 | const struct tcp_sock *tp = tcp_sk(sk); | 1456 | struct tcp_sock *tp = tcp_sk(sk); |
| 1684 | struct sk_buff *skb = tcp_send_head(sk); | 1457 | struct sk_buff *skb = tcp_send_head(sk); |
| 1685 | 1458 | ||
| 1686 | return skb && | 1459 | return skb && |
| @@ -1722,9 +1495,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
| 1722 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; | 1495 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; |
| 1723 | 1496 | ||
| 1724 | /* PSH and FIN should only be set in the second packet. */ | 1497 | /* PSH and FIN should only be set in the second packet. */ |
| 1725 | flags = TCP_SKB_CB(skb)->tcp_flags; | 1498 | flags = TCP_SKB_CB(skb)->flags; |
| 1726 | TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); | 1499 | TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); |
| 1727 | TCP_SKB_CB(buff)->tcp_flags = flags; | 1500 | TCP_SKB_CB(buff)->flags = flags; |
| 1728 | 1501 | ||
| 1729 | /* This packet was never sent out yet, so no SACK bits. */ | 1502 | /* This packet was never sent out yet, so no SACK bits. */ |
| 1730 | TCP_SKB_CB(buff)->sacked = 0; | 1503 | TCP_SKB_CB(buff)->sacked = 0; |
| @@ -1748,14 +1521,14 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
| 1748 | * | 1521 | * |
| 1749 | * This algorithm is from John Heffner. | 1522 | * This algorithm is from John Heffner. |
| 1750 | */ | 1523 | */ |
| 1751 | static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | 1524 | static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) |
| 1752 | { | 1525 | { |
| 1753 | struct tcp_sock *tp = tcp_sk(sk); | 1526 | struct tcp_sock *tp = tcp_sk(sk); |
| 1754 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1527 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 1755 | u32 send_win, cong_win, limit, in_flight; | 1528 | u32 send_win, cong_win, limit, in_flight; |
| 1756 | int win_divisor; | 1529 | int win_divisor; |
| 1757 | 1530 | ||
| 1758 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) | 1531 | if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) |
| 1759 | goto send_now; | 1532 | goto send_now; |
| 1760 | 1533 | ||
| 1761 | if (icsk->icsk_ca_state != TCP_CA_Open) | 1534 | if (icsk->icsk_ca_state != TCP_CA_Open) |
| @@ -1778,8 +1551,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
| 1778 | limit = min(send_win, cong_win); | 1551 | limit = min(send_win, cong_win); |
| 1779 | 1552 | ||
| 1780 | /* If a full-sized TSO skb can be sent, do it. */ | 1553 | /* If a full-sized TSO skb can be sent, do it. */ |
| 1781 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, | 1554 | if (limit >= sk->sk_gso_max_size) |
| 1782 | sk->sk_gso_max_segs * tp->mss_cache)) | ||
| 1783 | goto send_now; | 1555 | goto send_now; |
| 1784 | 1556 | ||
| 1785 | /* Middle in queue won't get any more data, full sendable already? */ | 1557 | /* Middle in queue won't get any more data, full sendable already? */ |
| @@ -1802,18 +1574,18 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
| 1802 | * frame, so if we have space for more than 3 frames | 1574 | * frame, so if we have space for more than 3 frames |
| 1803 | * then send now. | 1575 | * then send now. |
| 1804 | */ | 1576 | */ |
| 1805 | if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) | 1577 | if (limit > tcp_max_burst(tp) * tp->mss_cache) |
| 1806 | goto send_now; | 1578 | goto send_now; |
| 1807 | } | 1579 | } |
| 1808 | 1580 | ||
| 1809 | /* Ok, it looks like it is advisable to defer. */ | 1581 | /* Ok, it looks like it is advisable to defer. */ |
| 1810 | tp->tso_deferred = 1 | (jiffies << 1); | 1582 | tp->tso_deferred = 1 | (jiffies << 1); |
| 1811 | 1583 | ||
| 1812 | return true; | 1584 | return 1; |
| 1813 | 1585 | ||
| 1814 | send_now: | 1586 | send_now: |
| 1815 | tp->tso_deferred = 0; | 1587 | tp->tso_deferred = 0; |
| 1816 | return false; | 1588 | return 0; |
| 1817 | } | 1589 | } |
| 1818 | 1590 | ||
| 1819 | /* Create a new MTU probe if we are ready. | 1591 | /* Create a new MTU probe if we are ready. |
| @@ -1883,7 +1655,7 @@ static int tcp_mtu_probe(struct sock *sk) | |||
| 1883 | 1655 | ||
| 1884 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; | 1656 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; |
| 1885 | TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; | 1657 | TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; |
| 1886 | TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; | 1658 | TCP_SKB_CB(nskb)->flags = TCPHDR_ACK; |
| 1887 | TCP_SKB_CB(nskb)->sacked = 0; | 1659 | TCP_SKB_CB(nskb)->sacked = 0; |
| 1888 | nskb->csum = 0; | 1660 | nskb->csum = 0; |
| 1889 | nskb->ip_summed = skb->ip_summed; | 1661 | nskb->ip_summed = skb->ip_summed; |
| @@ -1903,11 +1675,11 @@ static int tcp_mtu_probe(struct sock *sk) | |||
| 1903 | if (skb->len <= copy) { | 1675 | if (skb->len <= copy) { |
| 1904 | /* We've eaten all the data from this skb. | 1676 | /* We've eaten all the data from this skb. |
| 1905 | * Throw it away. */ | 1677 | * Throw it away. */ |
| 1906 | TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; | 1678 | TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; |
| 1907 | tcp_unlink_write_queue(skb, sk); | 1679 | tcp_unlink_write_queue(skb, sk); |
| 1908 | sk_wmem_free_skb(sk, skb); | 1680 | sk_wmem_free_skb(sk, skb); |
| 1909 | } else { | 1681 | } else { |
| 1910 | TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & | 1682 | TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & |
| 1911 | ~(TCPHDR_FIN|TCPHDR_PSH); | 1683 | ~(TCPHDR_FIN|TCPHDR_PSH); |
| 1912 | if (!skb_shinfo(skb)->nr_frags) { | 1684 | if (!skb_shinfo(skb)->nr_frags) { |
| 1913 | skb_pull(skb, copy); | 1685 | skb_pull(skb, copy); |
| @@ -1955,11 +1727,11 @@ static int tcp_mtu_probe(struct sock *sk) | |||
| 1955 | * snd_up-64k-mss .. snd_up cannot be large. However, taking into | 1727 | * snd_up-64k-mss .. snd_up cannot be large. However, taking into |
| 1956 | * account rare use of URG, this is not a big flaw. | 1728 | * account rare use of URG, this is not a big flaw. |
| 1957 | * | 1729 | * |
| 1958 | * Returns true, if no segments are in flight and we have queued segments, | 1730 | * Returns 1, if no segments are in flight and we have queued segments, but |
| 1959 | * but cannot send anything now because of SWS or another problem. | 1731 | * cannot send anything now because of SWS or another problem. |
| 1960 | */ | 1732 | */ |
| 1961 | static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | 1733 | static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, |
| 1962 | int push_one, gfp_t gfp) | 1734 | int push_one, gfp_t gfp) |
| 1963 | { | 1735 | { |
| 1964 | struct tcp_sock *tp = tcp_sk(sk); | 1736 | struct tcp_sock *tp = tcp_sk(sk); |
| 1965 | struct sk_buff *skb; | 1737 | struct sk_buff *skb; |
| @@ -1973,7 +1745,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
| 1973 | /* Do MTU probing. */ | 1745 | /* Do MTU probing. */ |
| 1974 | result = tcp_mtu_probe(sk); | 1746 | result = tcp_mtu_probe(sk); |
| 1975 | if (!result) { | 1747 | if (!result) { |
| 1976 | return false; | 1748 | return 0; |
| 1977 | } else if (result > 0) { | 1749 | } else if (result > 0) { |
| 1978 | sent_pkts = 1; | 1750 | sent_pkts = 1; |
| 1979 | } | 1751 | } |
| @@ -1982,13 +1754,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
| 1982 | while ((skb = tcp_send_head(sk))) { | 1754 | while ((skb = tcp_send_head(sk))) { |
| 1983 | unsigned int limit; | 1755 | unsigned int limit; |
| 1984 | 1756 | ||
| 1985 | |||
| 1986 | tso_segs = tcp_init_tso_segs(sk, skb, mss_now); | 1757 | tso_segs = tcp_init_tso_segs(sk, skb, mss_now); |
| 1987 | BUG_ON(!tso_segs); | 1758 | BUG_ON(!tso_segs); |
| 1988 | 1759 | ||
| 1989 | if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) | ||
| 1990 | goto repair; /* Skip network transmission */ | ||
| 1991 | |||
| 1992 | cwnd_quota = tcp_cwnd_test(tp, skb); | 1760 | cwnd_quota = tcp_cwnd_test(tp, skb); |
| 1993 | if (!cwnd_quota) | 1761 | if (!cwnd_quota) |
| 1994 | break; | 1762 | break; |
| @@ -2006,19 +1774,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
| 2006 | break; | 1774 | break; |
| 2007 | } | 1775 | } |
| 2008 | 1776 | ||
| 2009 | /* TSQ : sk_wmem_alloc accounts skb truesize, | ||
| 2010 | * including skb overhead. But thats OK. | ||
| 2011 | */ | ||
| 2012 | if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { | ||
| 2013 | set_bit(TSQ_THROTTLED, &tp->tsq_flags); | ||
| 2014 | break; | ||
| 2015 | } | ||
| 2016 | limit = mss_now; | 1777 | limit = mss_now; |
| 2017 | if (tso_segs > 1 && !tcp_urg_mode(tp)) | 1778 | if (tso_segs > 1 && !tcp_urg_mode(tp)) |
| 2018 | limit = tcp_mss_split_point(sk, skb, mss_now, | 1779 | limit = tcp_mss_split_point(sk, skb, mss_now, |
| 2019 | min_t(unsigned int, | 1780 | cwnd_quota); |
| 2020 | cwnd_quota, | ||
| 2021 | sk->sk_gso_max_segs)); | ||
| 2022 | 1781 | ||
| 2023 | if (skb->len > limit && | 1782 | if (skb->len > limit && |
| 2024 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) | 1783 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) |
| @@ -2029,24 +1788,21 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
| 2029 | if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) | 1788 | if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) |
| 2030 | break; | 1789 | break; |
| 2031 | 1790 | ||
| 2032 | repair: | ||
| 2033 | /* Advance the send_head. This one is sent out. | 1791 | /* Advance the send_head. This one is sent out. |
| 2034 | * This call will increment packets_out. | 1792 | * This call will increment packets_out. |
| 2035 | */ | 1793 | */ |
| 2036 | tcp_event_new_data_sent(sk, skb); | 1794 | tcp_event_new_data_sent(sk, skb); |
| 2037 | 1795 | ||
| 2038 | tcp_minshall_update(tp, mss_now, skb); | 1796 | tcp_minshall_update(tp, mss_now, skb); |
| 2039 | sent_pkts += tcp_skb_pcount(skb); | 1797 | sent_pkts++; |
| 2040 | 1798 | ||
| 2041 | if (push_one) | 1799 | if (push_one) |
| 2042 | break; | 1800 | break; |
| 2043 | } | 1801 | } |
| 2044 | 1802 | ||
| 2045 | if (likely(sent_pkts)) { | 1803 | if (likely(sent_pkts)) { |
| 2046 | if (tcp_in_cwnd_reduction(sk)) | ||
| 2047 | tp->prr_out += sent_pkts; | ||
| 2048 | tcp_cwnd_validate(sk); | 1804 | tcp_cwnd_validate(sk); |
| 2049 | return false; | 1805 | return 0; |
| 2050 | } | 1806 | } |
| 2051 | return !tp->packets_out && tcp_send_head(sk); | 1807 | return !tp->packets_out && tcp_send_head(sk); |
| 2052 | } | 1808 | } |
| @@ -2065,8 +1821,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, | |||
| 2065 | if (unlikely(sk->sk_state == TCP_CLOSE)) | 1821 | if (unlikely(sk->sk_state == TCP_CLOSE)) |
| 2066 | return; | 1822 | return; |
| 2067 | 1823 | ||
| 2068 | if (tcp_write_xmit(sk, cur_mss, nonagle, 0, | 1824 | if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) |
| 2069 | sk_gfp_atomic(sk, GFP_ATOMIC))) | ||
| 2070 | tcp_check_probe_timer(sk); | 1825 | tcp_check_probe_timer(sk); |
| 2071 | } | 1826 | } |
| 2072 | 1827 | ||
| @@ -2155,7 +1910,7 @@ u32 __tcp_select_window(struct sock *sk) | |||
| 2155 | if (free_space < (full_space >> 1)) { | 1910 | if (free_space < (full_space >> 1)) { |
| 2156 | icsk->icsk_ack.quick = 0; | 1911 | icsk->icsk_ack.quick = 0; |
| 2157 | 1912 | ||
| 2158 | if (sk_under_memory_pressure(sk)) | 1913 | if (tcp_memory_pressure) |
| 2159 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, | 1914 | tp->rcv_ssthresh = min(tp->rcv_ssthresh, |
| 2160 | 4U * tp->advmss); | 1915 | 4U * tp->advmss); |
| 2161 | 1916 | ||
| @@ -2228,7 +1983,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) | |||
| 2228 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; | 1983 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; |
| 2229 | 1984 | ||
| 2230 | /* Merge over control information. This moves PSH/FIN etc. over */ | 1985 | /* Merge over control information. This moves PSH/FIN etc. over */ |
| 2231 | TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; | 1986 | TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; |
| 2232 | 1987 | ||
| 2233 | /* All done, get rid of second SKB and account for it so | 1988 | /* All done, get rid of second SKB and account for it so |
| 2234 | * packet counting does not break. | 1989 | * packet counting does not break. |
| @@ -2246,22 +2001,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) | |||
| 2246 | } | 2001 | } |
| 2247 | 2002 | ||
| 2248 | /* Check if coalescing SKBs is legal. */ | 2003 | /* Check if coalescing SKBs is legal. */ |
| 2249 | static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) | 2004 | static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) |
| 2250 | { | 2005 | { |
| 2251 | if (tcp_skb_pcount(skb) > 1) | 2006 | if (tcp_skb_pcount(skb) > 1) |
| 2252 | return false; | 2007 | return 0; |
| 2253 | /* TODO: SACK collapsing could be used to remove this condition */ | 2008 | /* TODO: SACK collapsing could be used to remove this condition */ |
| 2254 | if (skb_shinfo(skb)->nr_frags != 0) | 2009 | if (skb_shinfo(skb)->nr_frags != 0) |
| 2255 | return false; | 2010 | return 0; |
| 2256 | if (skb_cloned(skb)) | 2011 | if (skb_cloned(skb)) |
| 2257 | return false; | 2012 | return 0; |
| 2258 | if (skb == tcp_send_head(sk)) | 2013 | if (skb == tcp_send_head(sk)) |
| 2259 | return false; | 2014 | return 0; |
| 2260 | /* Some heurestics for collapsing over SACK'd could be invented */ | 2015 | /* Some heurestics for collapsing over SACK'd could be invented */ |
| 2261 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) | 2016 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
| 2262 | return false; | 2017 | return 0; |
| 2263 | 2018 | ||
| 2264 | return true; | 2019 | return 1; |
| 2265 | } | 2020 | } |
| 2266 | 2021 | ||
| 2267 | /* Collapse packets in the retransmit queue to make to create | 2022 | /* Collapse packets in the retransmit queue to make to create |
| @@ -2272,11 +2027,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, | |||
| 2272 | { | 2027 | { |
| 2273 | struct tcp_sock *tp = tcp_sk(sk); | 2028 | struct tcp_sock *tp = tcp_sk(sk); |
| 2274 | struct sk_buff *skb = to, *tmp; | 2029 | struct sk_buff *skb = to, *tmp; |
| 2275 | bool first = true; | 2030 | int first = 1; |
| 2276 | 2031 | ||
| 2277 | if (!sysctl_tcp_retrans_collapse) | 2032 | if (!sysctl_tcp_retrans_collapse) |
| 2278 | return; | 2033 | return; |
| 2279 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) | 2034 | if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN) |
| 2280 | return; | 2035 | return; |
| 2281 | 2036 | ||
| 2282 | tcp_for_write_queue_from_safe(skb, tmp, sk) { | 2037 | tcp_for_write_queue_from_safe(skb, tmp, sk) { |
| @@ -2286,7 +2041,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, | |||
| 2286 | space -= skb->len; | 2041 | space -= skb->len; |
| 2287 | 2042 | ||
| 2288 | if (first) { | 2043 | if (first) { |
| 2289 | first = false; | 2044 | first = 0; |
| 2290 | continue; | 2045 | continue; |
| 2291 | } | 2046 | } |
| 2292 | 2047 | ||
| @@ -2295,7 +2050,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, | |||
| 2295 | /* Punt if not enough space exists in the first SKB for | 2050 | /* Punt if not enough space exists in the first SKB for |
| 2296 | * the data in the second | 2051 | * the data in the second |
| 2297 | */ | 2052 | */ |
| 2298 | if (skb->len > skb_availroom(to)) | 2053 | if (skb->len > skb_tailroom(to)) |
| 2299 | break; | 2054 | break; |
| 2300 | 2055 | ||
| 2301 | if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) | 2056 | if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) |
| @@ -2309,11 +2064,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, | |||
| 2309 | * state updates are done by the caller. Returns non-zero if an | 2064 | * state updates are done by the caller. Returns non-zero if an |
| 2310 | * error occurred which prevented the send. | 2065 | * error occurred which prevented the send. |
| 2311 | */ | 2066 | */ |
| 2312 | int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | 2067 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) |
| 2313 | { | 2068 | { |
| 2314 | struct tcp_sock *tp = tcp_sk(sk); | 2069 | struct tcp_sock *tp = tcp_sk(sk); |
| 2315 | struct inet_connection_sock *icsk = inet_csk(sk); | 2070 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 2316 | unsigned int cur_mss; | 2071 | unsigned int cur_mss; |
| 2072 | int err; | ||
| 2317 | 2073 | ||
| 2318 | /* Inconslusive MTU probe */ | 2074 | /* Inconslusive MTU probe */ |
| 2319 | if (icsk->icsk_mtup.probe_size) { | 2075 | if (icsk->icsk_mtup.probe_size) { |
| @@ -2367,12 +2123,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 2367 | * since it is cheap to do so and saves bytes on the network. | 2123 | * since it is cheap to do so and saves bytes on the network. |
| 2368 | */ | 2124 | */ |
| 2369 | if (skb->len > 0 && | 2125 | if (skb->len > 0 && |
| 2370 | (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && | 2126 | (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && |
| 2371 | tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { | 2127 | tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { |
| 2372 | if (!pskb_trim(skb, 0)) { | 2128 | if (!pskb_trim(skb, 0)) { |
| 2373 | /* Reuse, even though it does some unnecessary work */ | 2129 | /* Reuse, even though it does some unnecessary work */ |
| 2374 | tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, | 2130 | tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, |
| 2375 | TCP_SKB_CB(skb)->tcp_flags); | 2131 | TCP_SKB_CB(skb)->flags); |
| 2376 | skb->ip_summed = CHECKSUM_NONE; | 2132 | skb->ip_summed = CHECKSUM_NONE; |
| 2377 | } | 2133 | } |
| 2378 | } | 2134 | } |
| @@ -2382,21 +2138,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 2382 | */ | 2138 | */ |
| 2383 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2139 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
| 2384 | 2140 | ||
| 2385 | /* make sure skb->data is aligned on arches that require it */ | 2141 | err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
| 2386 | if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) { | ||
| 2387 | struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, | ||
| 2388 | GFP_ATOMIC); | ||
| 2389 | return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : | ||
| 2390 | -ENOBUFS; | ||
| 2391 | } else { | ||
| 2392 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | ||
| 2393 | } | ||
| 2394 | } | ||
| 2395 | |||
| 2396 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | ||
| 2397 | { | ||
| 2398 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2399 | int err = __tcp_retransmit_skb(sk, skb); | ||
| 2400 | 2142 | ||
| 2401 | if (err == 0) { | 2143 | if (err == 0) { |
| 2402 | /* Update global TCP statistics. */ | 2144 | /* Update global TCP statistics. */ |
| @@ -2406,7 +2148,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 2406 | 2148 | ||
| 2407 | #if FASTRETRANS_DEBUG > 0 | 2149 | #if FASTRETRANS_DEBUG > 0 |
| 2408 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { | 2150 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { |
| 2409 | net_dbg_ratelimited("retrans_out leaked\n"); | 2151 | if (net_ratelimit()) |
| 2152 | printk(KERN_DEBUG "retrans_out leaked.\n"); | ||
| 2410 | } | 2153 | } |
| 2411 | #endif | 2154 | #endif |
| 2412 | if (!tp->retrans_out) | 2155 | if (!tp->retrans_out) |
| @@ -2431,18 +2174,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 2431 | /* Check if we forward retransmits are possible in the current | 2174 | /* Check if we forward retransmits are possible in the current |
| 2432 | * window/congestion state. | 2175 | * window/congestion state. |
| 2433 | */ | 2176 | */ |
| 2434 | static bool tcp_can_forward_retransmit(struct sock *sk) | 2177 | static int tcp_can_forward_retransmit(struct sock *sk) |
| 2435 | { | 2178 | { |
| 2436 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2179 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 2437 | const struct tcp_sock *tp = tcp_sk(sk); | 2180 | struct tcp_sock *tp = tcp_sk(sk); |
| 2438 | 2181 | ||
| 2439 | /* Forward retransmissions are possible only during Recovery. */ | 2182 | /* Forward retransmissions are possible only during Recovery. */ |
| 2440 | if (icsk->icsk_ca_state != TCP_CA_Recovery) | 2183 | if (icsk->icsk_ca_state != TCP_CA_Recovery) |
| 2441 | return false; | 2184 | return 0; |
| 2442 | 2185 | ||
| 2443 | /* No forward retransmissions in Reno are possible. */ | 2186 | /* No forward retransmissions in Reno are possible. */ |
| 2444 | if (tcp_is_reno(tp)) | 2187 | if (tcp_is_reno(tp)) |
| 2445 | return false; | 2188 | return 0; |
| 2446 | 2189 | ||
| 2447 | /* Yeah, we have to make difficult choice between forward transmission | 2190 | /* Yeah, we have to make difficult choice between forward transmission |
| 2448 | * and retransmission... Both ways have their merits... | 2191 | * and retransmission... Both ways have their merits... |
| @@ -2453,9 +2196,9 @@ static bool tcp_can_forward_retransmit(struct sock *sk) | |||
| 2453 | */ | 2196 | */ |
| 2454 | 2197 | ||
| 2455 | if (tcp_may_send_now(sk)) | 2198 | if (tcp_may_send_now(sk)) |
| 2456 | return false; | 2199 | return 0; |
| 2457 | 2200 | ||
| 2458 | return true; | 2201 | return 1; |
| 2459 | } | 2202 | } |
| 2460 | 2203 | ||
| 2461 | /* This gets called after a retransmit timeout, and the initially | 2204 | /* This gets called after a retransmit timeout, and the initially |
| @@ -2545,15 +2288,10 @@ begin_fwd: | |||
| 2545 | if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) | 2288 | if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) |
| 2546 | continue; | 2289 | continue; |
| 2547 | 2290 | ||
| 2548 | if (tcp_retransmit_skb(sk, skb)) { | 2291 | if (tcp_retransmit_skb(sk, skb)) |
| 2549 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); | ||
| 2550 | return; | 2292 | return; |
| 2551 | } | ||
| 2552 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | 2293 | NET_INC_STATS_BH(sock_net(sk), mib_idx); |
| 2553 | 2294 | ||
| 2554 | if (tcp_in_cwnd_reduction(sk)) | ||
| 2555 | tp->prr_out += tcp_skb_pcount(skb); | ||
| 2556 | |||
| 2557 | if (skb == tcp_write_queue_head(sk)) | 2295 | if (skb == tcp_write_queue_head(sk)) |
| 2558 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 2296 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 2559 | inet_csk(sk)->icsk_rto, | 2297 | inet_csk(sk)->icsk_rto, |
| @@ -2577,7 +2315,7 @@ void tcp_send_fin(struct sock *sk) | |||
| 2577 | mss_now = tcp_current_mss(sk); | 2315 | mss_now = tcp_current_mss(sk); |
| 2578 | 2316 | ||
| 2579 | if (tcp_send_head(sk) != NULL) { | 2317 | if (tcp_send_head(sk) != NULL) { |
| 2580 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; | 2318 | TCP_SKB_CB(skb)->flags |= TCPHDR_FIN; |
| 2581 | TCP_SKB_CB(skb)->end_seq++; | 2319 | TCP_SKB_CB(skb)->end_seq++; |
| 2582 | tp->write_seq++; | 2320 | tp->write_seq++; |
| 2583 | } else { | 2321 | } else { |
| @@ -2639,11 +2377,11 @@ int tcp_send_synack(struct sock *sk) | |||
| 2639 | struct sk_buff *skb; | 2377 | struct sk_buff *skb; |
| 2640 | 2378 | ||
| 2641 | skb = tcp_write_queue_head(sk); | 2379 | skb = tcp_write_queue_head(sk); |
| 2642 | if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { | 2380 | if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) { |
| 2643 | pr_debug("%s: wrong queue state\n", __func__); | 2381 | printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); |
| 2644 | return -EFAULT; | 2382 | return -EFAULT; |
| 2645 | } | 2383 | } |
| 2646 | if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { | 2384 | if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) { |
| 2647 | if (skb_cloned(skb)) { | 2385 | if (skb_cloned(skb)) { |
| 2648 | struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); | 2386 | struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); |
| 2649 | if (nskb == NULL) | 2387 | if (nskb == NULL) |
| @@ -2657,27 +2395,17 @@ int tcp_send_synack(struct sock *sk) | |||
| 2657 | skb = nskb; | 2395 | skb = nskb; |
| 2658 | } | 2396 | } |
| 2659 | 2397 | ||
| 2660 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; | 2398 | TCP_SKB_CB(skb)->flags |= TCPHDR_ACK; |
| 2661 | TCP_ECN_send_synack(tcp_sk(sk), skb); | 2399 | TCP_ECN_send_synack(tcp_sk(sk), skb); |
| 2662 | } | 2400 | } |
| 2663 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2401 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
| 2664 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 2402 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
| 2665 | } | 2403 | } |
| 2666 | 2404 | ||
| 2667 | /** | 2405 | /* Prepare a SYN-ACK. */ |
| 2668 | * tcp_make_synack - Prepare a SYN-ACK. | ||
| 2669 | * sk: listener socket | ||
| 2670 | * dst: dst entry attached to the SYNACK | ||
| 2671 | * req: request_sock pointer | ||
| 2672 | * rvp: request_values pointer | ||
| 2673 | * | ||
| 2674 | * Allocate one skb and build a SYNACK packet. | ||
| 2675 | * @dst is consumed : Caller should not use it again. | ||
| 2676 | */ | ||
| 2677 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | 2406 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, |
| 2678 | struct request_sock *req, | 2407 | struct request_sock *req, |
| 2679 | struct request_values *rvp, | 2408 | struct request_values *rvp) |
| 2680 | struct tcp_fastopen_cookie *foc) | ||
| 2681 | { | 2409 | { |
| 2682 | struct tcp_out_options opts; | 2410 | struct tcp_out_options opts; |
| 2683 | struct tcp_extend_values *xvp = tcp_xv(rvp); | 2411 | struct tcp_extend_values *xvp = tcp_xv(rvp); |
| @@ -2693,16 +2421,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2693 | 2421 | ||
| 2694 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) | 2422 | if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) |
| 2695 | s_data_desired = cvp->s_data_desired; | 2423 | s_data_desired = cvp->s_data_desired; |
| 2696 | skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, | 2424 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); |
| 2697 | sk_gfp_atomic(sk, GFP_ATOMIC)); | 2425 | if (skb == NULL) |
| 2698 | if (unlikely(!skb)) { | ||
| 2699 | dst_release(dst); | ||
| 2700 | return NULL; | 2426 | return NULL; |
| 2701 | } | 2427 | |
| 2702 | /* Reserve space for headers. */ | 2428 | /* Reserve space for headers. */ |
| 2703 | skb_reserve(skb, MAX_TCP_HEADER); | 2429 | skb_reserve(skb, MAX_TCP_HEADER); |
| 2704 | 2430 | ||
| 2705 | skb_dst_set(skb, dst); | 2431 | skb_dst_set(skb, dst_clone(dst)); |
| 2706 | 2432 | ||
| 2707 | mss = dst_metric_advmss(dst); | 2433 | mss = dst_metric_advmss(dst); |
| 2708 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) | 2434 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) |
| @@ -2737,7 +2463,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2737 | #endif | 2463 | #endif |
| 2738 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2464 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
| 2739 | tcp_header_size = tcp_synack_options(sk, req, mss, | 2465 | tcp_header_size = tcp_synack_options(sk, req, mss, |
| 2740 | skb, &opts, &md5, xvp, foc) | 2466 | skb, &opts, &md5, xvp) |
| 2741 | + sizeof(*th); | 2467 | + sizeof(*th); |
| 2742 | 2468 | ||
| 2743 | skb_push(skb, tcp_header_size); | 2469 | skb_push(skb, tcp_header_size); |
| @@ -2791,8 +2517,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2791 | } | 2517 | } |
| 2792 | 2518 | ||
| 2793 | th->seq = htonl(TCP_SKB_CB(skb)->seq); | 2519 | th->seq = htonl(TCP_SKB_CB(skb)->seq); |
| 2794 | /* XXX data is queued and acked as is. No buffer/window check */ | 2520 | th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); |
| 2795 | th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); | ||
| 2796 | 2521 | ||
| 2797 | /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ | 2522 | /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ |
| 2798 | th->window = htons(min(req->rcv_wnd, 65535U)); | 2523 | th->window = htons(min(req->rcv_wnd, 65535U)); |
| @@ -2813,9 +2538,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2813 | EXPORT_SYMBOL(tcp_make_synack); | 2538 | EXPORT_SYMBOL(tcp_make_synack); |
| 2814 | 2539 | ||
| 2815 | /* Do all connect socket setups that can be done AF independent. */ | 2540 | /* Do all connect socket setups that can be done AF independent. */ |
| 2816 | void tcp_connect_init(struct sock *sk) | 2541 | static void tcp_connect_init(struct sock *sk) |
| 2817 | { | 2542 | { |
| 2818 | const struct dst_entry *dst = __sk_dst_get(sk); | 2543 | struct dst_entry *dst = __sk_dst_get(sk); |
| 2819 | struct tcp_sock *tp = tcp_sk(sk); | 2544 | struct tcp_sock *tp = tcp_sk(sk); |
| 2820 | __u8 rcv_wscale; | 2545 | __u8 rcv_wscale; |
| 2821 | 2546 | ||
| @@ -2868,121 +2593,15 @@ void tcp_connect_init(struct sock *sk) | |||
| 2868 | tp->snd_una = tp->write_seq; | 2593 | tp->snd_una = tp->write_seq; |
| 2869 | tp->snd_sml = tp->write_seq; | 2594 | tp->snd_sml = tp->write_seq; |
| 2870 | tp->snd_up = tp->write_seq; | 2595 | tp->snd_up = tp->write_seq; |
| 2871 | tp->snd_nxt = tp->write_seq; | 2596 | tp->rcv_nxt = 0; |
| 2872 | 2597 | tp->rcv_wup = 0; | |
| 2873 | if (likely(!tp->repair)) | 2598 | tp->copied_seq = 0; |
| 2874 | tp->rcv_nxt = 0; | ||
| 2875 | tp->rcv_wup = tp->rcv_nxt; | ||
| 2876 | tp->copied_seq = tp->rcv_nxt; | ||
| 2877 | 2599 | ||
| 2878 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; | 2600 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; |
| 2879 | inet_csk(sk)->icsk_retransmits = 0; | 2601 | inet_csk(sk)->icsk_retransmits = 0; |
| 2880 | tcp_clear_retrans(tp); | 2602 | tcp_clear_retrans(tp); |
| 2881 | } | 2603 | } |
| 2882 | 2604 | ||
| 2883 | static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) | ||
| 2884 | { | ||
| 2885 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2886 | struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); | ||
| 2887 | |||
| 2888 | tcb->end_seq += skb->len; | ||
| 2889 | skb_header_release(skb); | ||
| 2890 | __tcp_add_write_queue_tail(sk, skb); | ||
| 2891 | sk->sk_wmem_queued += skb->truesize; | ||
| 2892 | sk_mem_charge(sk, skb->truesize); | ||
| 2893 | tp->write_seq = tcb->end_seq; | ||
| 2894 | tp->packets_out += tcp_skb_pcount(skb); | ||
| 2895 | } | ||
| 2896 | |||
| 2897 | /* Build and send a SYN with data and (cached) Fast Open cookie. However, | ||
| 2898 | * queue a data-only packet after the regular SYN, such that regular SYNs | ||
| 2899 | * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges | ||
| 2900 | * only the SYN sequence, the data are retransmitted in the first ACK. | ||
| 2901 | * If cookie is not cached or other error occurs, falls back to send a | ||
| 2902 | * regular SYN with Fast Open cookie request option. | ||
| 2903 | */ | ||
| 2904 | static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | ||
| 2905 | { | ||
| 2906 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2907 | struct tcp_fastopen_request *fo = tp->fastopen_req; | ||
| 2908 | int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; | ||
| 2909 | struct sk_buff *syn_data = NULL, *data; | ||
| 2910 | unsigned long last_syn_loss = 0; | ||
| 2911 | |||
| 2912 | tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ | ||
| 2913 | tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, | ||
| 2914 | &syn_loss, &last_syn_loss); | ||
| 2915 | /* Recurring FO SYN losses: revert to regular handshake temporarily */ | ||
| 2916 | if (syn_loss > 1 && | ||
| 2917 | time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { | ||
| 2918 | fo->cookie.len = -1; | ||
| 2919 | goto fallback; | ||
| 2920 | } | ||
| 2921 | |||
| 2922 | if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) | ||
| 2923 | fo->cookie.len = -1; | ||
| 2924 | else if (fo->cookie.len <= 0) | ||
| 2925 | goto fallback; | ||
| 2926 | |||
| 2927 | /* MSS for SYN-data is based on cached MSS and bounded by PMTU and | ||
| 2928 | * user-MSS. Reserve maximum option space for middleboxes that add | ||
| 2929 | * private TCP options. The cost is reduced data space in SYN :( | ||
| 2930 | */ | ||
| 2931 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) | ||
| 2932 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; | ||
| 2933 | space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - | ||
| 2934 | MAX_TCP_OPTION_SPACE; | ||
| 2935 | |||
| 2936 | syn_data = skb_copy_expand(syn, skb_headroom(syn), space, | ||
| 2937 | sk->sk_allocation); | ||
| 2938 | if (syn_data == NULL) | ||
| 2939 | goto fallback; | ||
| 2940 | |||
| 2941 | for (i = 0; i < iovlen && syn_data->len < space; ++i) { | ||
| 2942 | struct iovec *iov = &fo->data->msg_iov[i]; | ||
| 2943 | unsigned char __user *from = iov->iov_base; | ||
| 2944 | int len = iov->iov_len; | ||
| 2945 | |||
| 2946 | if (syn_data->len + len > space) | ||
| 2947 | len = space - syn_data->len; | ||
| 2948 | else if (i + 1 == iovlen) | ||
| 2949 | /* No more data pending in inet_wait_for_connect() */ | ||
| 2950 | fo->data = NULL; | ||
| 2951 | |||
| 2952 | if (skb_add_data(syn_data, from, len)) | ||
| 2953 | goto fallback; | ||
| 2954 | } | ||
| 2955 | |||
| 2956 | /* Queue a data-only packet after the regular SYN for retransmission */ | ||
| 2957 | data = pskb_copy(syn_data, sk->sk_allocation); | ||
| 2958 | if (data == NULL) | ||
| 2959 | goto fallback; | ||
| 2960 | TCP_SKB_CB(data)->seq++; | ||
| 2961 | TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; | ||
| 2962 | TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); | ||
| 2963 | tcp_connect_queue_skb(sk, data); | ||
| 2964 | fo->copied = data->len; | ||
| 2965 | |||
| 2966 | if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { | ||
| 2967 | tp->syn_data = (fo->copied > 0); | ||
| 2968 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); | ||
| 2969 | goto done; | ||
| 2970 | } | ||
| 2971 | syn_data = NULL; | ||
| 2972 | |||
| 2973 | fallback: | ||
| 2974 | /* Send a regular SYN with Fast Open cookie request option */ | ||
| 2975 | if (fo->cookie.len > 0) | ||
| 2976 | fo->cookie.len = 0; | ||
| 2977 | err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); | ||
| 2978 | if (err) | ||
| 2979 | tp->syn_fastopen = 0; | ||
| 2980 | kfree_skb(syn_data); | ||
| 2981 | done: | ||
| 2982 | fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ | ||
| 2983 | return err; | ||
| 2984 | } | ||
| 2985 | |||
| 2986 | /* Build a SYN and send it off. */ | 2605 | /* Build a SYN and send it off. */ |
| 2987 | int tcp_connect(struct sock *sk) | 2606 | int tcp_connect(struct sock *sk) |
| 2988 | { | 2607 | { |
| @@ -2992,11 +2611,6 @@ int tcp_connect(struct sock *sk) | |||
| 2992 | 2611 | ||
| 2993 | tcp_connect_init(sk); | 2612 | tcp_connect_init(sk); |
| 2994 | 2613 | ||
| 2995 | if (unlikely(tp->repair)) { | ||
| 2996 | tcp_finish_connect(sk, NULL); | ||
| 2997 | return 0; | ||
| 2998 | } | ||
| 2999 | |||
| 3000 | buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); | 2614 | buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); |
| 3001 | if (unlikely(buff == NULL)) | 2615 | if (unlikely(buff == NULL)) |
| 3002 | return -ENOBUFS; | 2616 | return -ENOBUFS; |
| @@ -3004,14 +2618,19 @@ int tcp_connect(struct sock *sk) | |||
| 3004 | /* Reserve space for headers. */ | 2618 | /* Reserve space for headers. */ |
| 3005 | skb_reserve(buff, MAX_TCP_HEADER); | 2619 | skb_reserve(buff, MAX_TCP_HEADER); |
| 3006 | 2620 | ||
| 2621 | tp->snd_nxt = tp->write_seq; | ||
| 3007 | tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); | 2622 | tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); |
| 3008 | tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; | ||
| 3009 | tcp_connect_queue_skb(sk, buff); | ||
| 3010 | TCP_ECN_send_syn(sk, buff); | 2623 | TCP_ECN_send_syn(sk, buff); |
| 3011 | 2624 | ||
| 3012 | /* Send off SYN; include data in Fast Open. */ | 2625 | /* Send it off. */ |
| 3013 | err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : | 2626 | TCP_SKB_CB(buff)->when = tcp_time_stamp; |
| 3014 | tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); | 2627 | tp->retrans_stamp = TCP_SKB_CB(buff)->when; |
| 2628 | skb_header_release(buff); | ||
| 2629 | __tcp_add_write_queue_tail(sk, buff); | ||
| 2630 | sk->sk_wmem_queued += buff->truesize; | ||
| 2631 | sk_mem_charge(sk, buff->truesize); | ||
| 2632 | tp->packets_out += tcp_skb_pcount(buff); | ||
| 2633 | err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); | ||
| 3015 | if (err == -ECONNREFUSED) | 2634 | if (err == -ECONNREFUSED) |
| 3016 | return err; | 2635 | return err; |
| 3017 | 2636 | ||
| @@ -3098,7 +2717,7 @@ void tcp_send_ack(struct sock *sk) | |||
| 3098 | * tcp_transmit_skb() will set the ownership to this | 2717 | * tcp_transmit_skb() will set the ownership to this |
| 3099 | * sock. | 2718 | * sock. |
| 3100 | */ | 2719 | */ |
| 3101 | buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); | 2720 | buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); |
| 3102 | if (buff == NULL) { | 2721 | if (buff == NULL) { |
| 3103 | inet_csk_schedule_ack(sk); | 2722 | inet_csk_schedule_ack(sk); |
| 3104 | inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; | 2723 | inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; |
| @@ -3113,7 +2732,7 @@ void tcp_send_ack(struct sock *sk) | |||
| 3113 | 2732 | ||
| 3114 | /* Send it off, this clears delayed acks for us. */ | 2733 | /* Send it off, this clears delayed acks for us. */ |
| 3115 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 2734 | TCP_SKB_CB(buff)->when = tcp_time_stamp; |
| 3116 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); | 2735 | tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); |
| 3117 | } | 2736 | } |
| 3118 | 2737 | ||
| 3119 | /* This routine sends a packet with an out of date sequence | 2738 | /* This routine sends a packet with an out of date sequence |
| @@ -3133,7 +2752,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) | |||
| 3133 | struct sk_buff *skb; | 2752 | struct sk_buff *skb; |
| 3134 | 2753 | ||
| 3135 | /* We don't queue it, tcp_transmit_skb() sets ownership. */ | 2754 | /* We don't queue it, tcp_transmit_skb() sets ownership. */ |
| 3136 | skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); | 2755 | skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); |
| 3137 | if (skb == NULL) | 2756 | if (skb == NULL) |
| 3138 | return -1; | 2757 | return -1; |
| 3139 | 2758 | ||
| @@ -3148,15 +2767,6 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) | |||
| 3148 | return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); | 2767 | return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); |
| 3149 | } | 2768 | } |
| 3150 | 2769 | ||
| 3151 | void tcp_send_window_probe(struct sock *sk) | ||
| 3152 | { | ||
| 3153 | if (sk->sk_state == TCP_ESTABLISHED) { | ||
| 3154 | tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; | ||
| 3155 | tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq; | ||
| 3156 | tcp_xmit_probe_skb(sk, 0); | ||
| 3157 | } | ||
| 3158 | } | ||
| 3159 | |||
| 3160 | /* Initiate keepalive or window probe from timer. */ | 2770 | /* Initiate keepalive or window probe from timer. */ |
| 3161 | int tcp_write_wakeup(struct sock *sk) | 2771 | int tcp_write_wakeup(struct sock *sk) |
| 3162 | { | 2772 | { |
| @@ -3182,13 +2792,13 @@ int tcp_write_wakeup(struct sock *sk) | |||
| 3182 | if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || | 2792 | if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || |
| 3183 | skb->len > mss) { | 2793 | skb->len > mss) { |
| 3184 | seg_size = min(seg_size, mss); | 2794 | seg_size = min(seg_size, mss); |
| 3185 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; | 2795 | TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; |
| 3186 | if (tcp_fragment(sk, skb, seg_size, mss)) | 2796 | if (tcp_fragment(sk, skb, seg_size, mss)) |
| 3187 | return -1; | 2797 | return -1; |
| 3188 | } else if (!tcp_skb_pcount(skb)) | 2798 | } else if (!tcp_skb_pcount(skb)) |
| 3189 | tcp_set_skb_tso_segs(sk, skb, mss); | 2799 | tcp_set_skb_tso_segs(sk, skb, mss); |
| 3190 | 2800 | ||
| 3191 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; | 2801 | TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; |
| 3192 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2802 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
| 3193 | err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | 2803 | err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
| 3194 | if (!err) | 2804 | if (!err) |
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 4526fe68e60..85ee7eb7e38 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c | |||
| @@ -18,8 +18,6 @@ | |||
| 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 19 | */ | 19 | */ |
| 20 | 20 | ||
| 21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 22 | |||
| 23 | #include <linux/kernel.h> | 21 | #include <linux/kernel.h> |
| 24 | #include <linux/kprobes.h> | 22 | #include <linux/kprobes.h> |
| 25 | #include <linux/socket.h> | 23 | #include <linux/socket.h> |
| @@ -91,7 +89,7 @@ static inline int tcp_probe_avail(void) | |||
| 91 | * Note: arguments must match tcp_rcv_established()! | 89 | * Note: arguments must match tcp_rcv_established()! |
| 92 | */ | 90 | */ |
| 93 | static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, | 91 | static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, |
| 94 | struct tcphdr *th, unsigned int len) | 92 | struct tcphdr *th, unsigned len) |
| 95 | { | 93 | { |
| 96 | const struct tcp_sock *tp = tcp_sk(sk); | 94 | const struct tcp_sock *tp = tcp_sk(sk); |
| 97 | const struct inet_sock *inet = inet_sk(sk); | 95 | const struct inet_sock *inet = inet_sk(sk); |
| @@ -138,7 +136,7 @@ static struct jprobe tcp_jprobe = { | |||
| 138 | .entry = jtcp_rcv_established, | 136 | .entry = jtcp_rcv_established, |
| 139 | }; | 137 | }; |
| 140 | 138 | ||
| 141 | static int tcpprobe_open(struct inode *inode, struct file *file) | 139 | static int tcpprobe_open(struct inode * inode, struct file * file) |
| 142 | { | 140 | { |
| 143 | /* Reset (empty) log */ | 141 | /* Reset (empty) log */ |
| 144 | spin_lock_bh(&tcp_probe.lock); | 142 | spin_lock_bh(&tcp_probe.lock); |
| @@ -241,7 +239,7 @@ static __init int tcpprobe_init(void) | |||
| 241 | if (ret) | 239 | if (ret) |
| 242 | goto err1; | 240 | goto err1; |
| 243 | 241 | ||
| 244 | pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize); | 242 | pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize); |
| 245 | return 0; | 243 | return 0; |
| 246 | err1: | 244 | err1: |
| 247 | proc_net_remove(&init_net, procname); | 245 | proc_net_remove(&init_net, procname); |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b78aac30c49..ecd44b0c45f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
| @@ -32,6 +32,17 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; | |||
| 32 | int sysctl_tcp_orphan_retries __read_mostly; | 32 | int sysctl_tcp_orphan_retries __read_mostly; |
| 33 | int sysctl_tcp_thin_linear_timeouts __read_mostly; | 33 | int sysctl_tcp_thin_linear_timeouts __read_mostly; |
| 34 | 34 | ||
| 35 | static void tcp_write_timer(unsigned long); | ||
| 36 | static void tcp_delack_timer(unsigned long); | ||
| 37 | static void tcp_keepalive_timer (unsigned long data); | ||
| 38 | |||
| 39 | void tcp_init_xmit_timers(struct sock *sk) | ||
| 40 | { | ||
| 41 | inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, | ||
| 42 | &tcp_keepalive_timer); | ||
| 43 | } | ||
| 44 | EXPORT_SYMBOL(tcp_init_xmit_timers); | ||
| 45 | |||
| 35 | static void tcp_write_err(struct sock *sk) | 46 | static void tcp_write_err(struct sock *sk) |
| 36 | { | 47 | { |
| 37 | sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; | 48 | sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; |
| @@ -66,7 +77,10 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) | |||
| 66 | if (sk->sk_err_soft) | 77 | if (sk->sk_err_soft) |
| 67 | shift++; | 78 | shift++; |
| 68 | 79 | ||
| 69 | if (tcp_check_oom(sk, shift)) { | 80 | if (tcp_too_many_orphans(sk, shift)) { |
| 81 | if (net_ratelimit()) | ||
| 82 | printk(KERN_INFO "Out of socket memory\n"); | ||
| 83 | |||
| 70 | /* Catch exceptional cases, when connection requires reset. | 84 | /* Catch exceptional cases, when connection requires reset. |
| 71 | * 1. Last segment was sent recently. */ | 85 | * 1. Last segment was sent recently. */ |
| 72 | if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || | 86 | if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || |
| @@ -157,13 +171,13 @@ static int tcp_write_timeout(struct sock *sk) | |||
| 157 | { | 171 | { |
| 158 | struct inet_connection_sock *icsk = inet_csk(sk); | 172 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 159 | int retry_until; | 173 | int retry_until; |
| 160 | bool do_reset, syn_set = false; | 174 | bool do_reset, syn_set = 0; |
| 161 | 175 | ||
| 162 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 176 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
| 163 | if (icsk->icsk_retransmits) | 177 | if (icsk->icsk_retransmits) |
| 164 | dst_negative_advice(sk); | 178 | dst_negative_advice(sk); |
| 165 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; | 179 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
| 166 | syn_set = true; | 180 | syn_set = 1; |
| 167 | } else { | 181 | } else { |
| 168 | if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { | 182 | if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { |
| 169 | /* Black hole detection */ | 183 | /* Black hole detection */ |
| @@ -194,11 +208,21 @@ static int tcp_write_timeout(struct sock *sk) | |||
| 194 | return 0; | 208 | return 0; |
| 195 | } | 209 | } |
| 196 | 210 | ||
| 197 | void tcp_delack_timer_handler(struct sock *sk) | 211 | static void tcp_delack_timer(unsigned long data) |
| 198 | { | 212 | { |
| 213 | struct sock *sk = (struct sock *)data; | ||
| 199 | struct tcp_sock *tp = tcp_sk(sk); | 214 | struct tcp_sock *tp = tcp_sk(sk); |
| 200 | struct inet_connection_sock *icsk = inet_csk(sk); | 215 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 201 | 216 | ||
| 217 | bh_lock_sock(sk); | ||
| 218 | if (sock_owned_by_user(sk)) { | ||
| 219 | /* Try again later. */ | ||
| 220 | icsk->icsk_ack.blocked = 1; | ||
| 221 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); | ||
| 222 | sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); | ||
| 223 | goto out_unlock; | ||
| 224 | } | ||
| 225 | |||
| 202 | sk_mem_reclaim_partial(sk); | 226 | sk_mem_reclaim_partial(sk); |
| 203 | 227 | ||
| 204 | if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) | 228 | if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) |
| @@ -237,24 +261,9 @@ void tcp_delack_timer_handler(struct sock *sk) | |||
| 237 | } | 261 | } |
| 238 | 262 | ||
| 239 | out: | 263 | out: |
| 240 | if (sk_under_memory_pressure(sk)) | 264 | if (tcp_memory_pressure) |
| 241 | sk_mem_reclaim(sk); | 265 | sk_mem_reclaim(sk); |
| 242 | } | 266 | out_unlock: |
| 243 | |||
| 244 | static void tcp_delack_timer(unsigned long data) | ||
| 245 | { | ||
| 246 | struct sock *sk = (struct sock *)data; | ||
| 247 | |||
| 248 | bh_lock_sock(sk); | ||
| 249 | if (!sock_owned_by_user(sk)) { | ||
| 250 | tcp_delack_timer_handler(sk); | ||
| 251 | } else { | ||
| 252 | inet_csk(sk)->icsk_ack.blocked = 1; | ||
| 253 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); | ||
| 254 | /* deleguate our work to tcp_release_cb() */ | ||
| 255 | if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) | ||
| 256 | sock_hold(sk); | ||
| 257 | } | ||
| 258 | bh_unlock_sock(sk); | 267 | bh_unlock_sock(sk); |
| 259 | sock_put(sk); | 268 | sock_put(sk); |
| 260 | } | 269 | } |
| @@ -305,35 +314,6 @@ static void tcp_probe_timer(struct sock *sk) | |||
| 305 | } | 314 | } |
| 306 | 315 | ||
| 307 | /* | 316 | /* |
| 308 | * Timer for Fast Open socket to retransmit SYNACK. Note that the | ||
| 309 | * sk here is the child socket, not the parent (listener) socket. | ||
| 310 | */ | ||
| 311 | static void tcp_fastopen_synack_timer(struct sock *sk) | ||
| 312 | { | ||
| 313 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 314 | int max_retries = icsk->icsk_syn_retries ? : | ||
| 315 | sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ | ||
| 316 | struct request_sock *req; | ||
| 317 | |||
| 318 | req = tcp_sk(sk)->fastopen_rsk; | ||
| 319 | req->rsk_ops->syn_ack_timeout(sk, req); | ||
| 320 | |||
| 321 | if (req->num_timeout >= max_retries) { | ||
| 322 | tcp_write_err(sk); | ||
| 323 | return; | ||
| 324 | } | ||
| 325 | /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error | ||
| 326 | * returned from rtx_syn_ack() to make it more persistent like | ||
| 327 | * regular retransmit because if the child socket has been accepted | ||
| 328 | * it's not good to give up too easily. | ||
| 329 | */ | ||
| 330 | inet_rtx_syn_ack(sk, req); | ||
| 331 | req->num_timeout++; | ||
| 332 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
| 333 | TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); | ||
| 334 | } | ||
| 335 | |||
| 336 | /* | ||
| 337 | * The TCP retransmit timer. | 317 | * The TCP retransmit timer. |
| 338 | */ | 318 | */ |
| 339 | 319 | ||
| @@ -342,19 +322,6 @@ void tcp_retransmit_timer(struct sock *sk) | |||
| 342 | struct tcp_sock *tp = tcp_sk(sk); | 322 | struct tcp_sock *tp = tcp_sk(sk); |
| 343 | struct inet_connection_sock *icsk = inet_csk(sk); | 323 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 344 | 324 | ||
| 345 | if (tp->early_retrans_delayed) { | ||
| 346 | tcp_resume_early_retransmit(sk); | ||
| 347 | return; | ||
| 348 | } | ||
| 349 | if (tp->fastopen_rsk) { | ||
| 350 | WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && | ||
| 351 | sk->sk_state != TCP_FIN_WAIT1); | ||
| 352 | tcp_fastopen_synack_timer(sk); | ||
| 353 | /* Before we receive ACK to our SYN-ACK don't retransmit | ||
| 354 | * anything else (e.g., data or FIN segments). | ||
| 355 | */ | ||
| 356 | return; | ||
| 357 | } | ||
| 358 | if (!tp->packets_out) | 325 | if (!tp->packets_out) |
| 359 | goto out; | 326 | goto out; |
| 360 | 327 | ||
| @@ -367,22 +334,22 @@ void tcp_retransmit_timer(struct sock *sk) | |||
| 367 | * connection. If the socket is an orphan, time it out, | 334 | * connection. If the socket is an orphan, time it out, |
| 368 | * we cannot allow such beasts to hang infinitely. | 335 | * we cannot allow such beasts to hang infinitely. |
| 369 | */ | 336 | */ |
| 337 | #ifdef TCP_DEBUG | ||
| 370 | struct inet_sock *inet = inet_sk(sk); | 338 | struct inet_sock *inet = inet_sk(sk); |
| 371 | if (sk->sk_family == AF_INET) { | 339 | if (sk->sk_family == AF_INET) { |
| 372 | LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), | 340 | LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", |
| 373 | &inet->inet_daddr, | 341 | &inet->inet_daddr, ntohs(inet->inet_dport), |
| 374 | ntohs(inet->inet_dport), inet->inet_num, | 342 | inet->inet_num, tp->snd_una, tp->snd_nxt); |
| 375 | tp->snd_una, tp->snd_nxt); | ||
| 376 | } | 343 | } |
| 377 | #if IS_ENABLED(CONFIG_IPV6) | 344 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 378 | else if (sk->sk_family == AF_INET6) { | 345 | else if (sk->sk_family == AF_INET6) { |
| 379 | struct ipv6_pinfo *np = inet6_sk(sk); | 346 | struct ipv6_pinfo *np = inet6_sk(sk); |
| 380 | LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), | 347 | LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", |
| 381 | &np->daddr, | 348 | &np->daddr, ntohs(inet->inet_dport), |
| 382 | ntohs(inet->inet_dport), inet->inet_num, | 349 | inet->inet_num, tp->snd_una, tp->snd_nxt); |
| 383 | tp->snd_una, tp->snd_nxt); | ||
| 384 | } | 350 | } |
| 385 | #endif | 351 | #endif |
| 352 | #endif | ||
| 386 | if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { | 353 | if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { |
| 387 | tcp_write_err(sk); | 354 | tcp_write_err(sk); |
| 388 | goto out; | 355 | goto out; |
| @@ -481,11 +448,19 @@ out_reset_timer: | |||
| 481 | out:; | 448 | out:; |
| 482 | } | 449 | } |
| 483 | 450 | ||
| 484 | void tcp_write_timer_handler(struct sock *sk) | 451 | static void tcp_write_timer(unsigned long data) |
| 485 | { | 452 | { |
| 453 | struct sock *sk = (struct sock *)data; | ||
| 486 | struct inet_connection_sock *icsk = inet_csk(sk); | 454 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 487 | int event; | 455 | int event; |
| 488 | 456 | ||
| 457 | bh_lock_sock(sk); | ||
| 458 | if (sock_owned_by_user(sk)) { | ||
| 459 | /* Try again later */ | ||
| 460 | sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); | ||
| 461 | goto out_unlock; | ||
| 462 | } | ||
| 463 | |||
| 489 | if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) | 464 | if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) |
| 490 | goto out; | 465 | goto out; |
| 491 | 466 | ||
| @@ -508,20 +483,7 @@ void tcp_write_timer_handler(struct sock *sk) | |||
| 508 | 483 | ||
| 509 | out: | 484 | out: |
| 510 | sk_mem_reclaim(sk); | 485 | sk_mem_reclaim(sk); |
| 511 | } | 486 | out_unlock: |
| 512 | |||
| 513 | static void tcp_write_timer(unsigned long data) | ||
| 514 | { | ||
| 515 | struct sock *sk = (struct sock *)data; | ||
| 516 | |||
| 517 | bh_lock_sock(sk); | ||
| 518 | if (!sock_owned_by_user(sk)) { | ||
| 519 | tcp_write_timer_handler(sk); | ||
| 520 | } else { | ||
| 521 | /* deleguate our work to tcp_release_cb() */ | ||
| 522 | if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) | ||
| 523 | sock_hold(sk); | ||
| 524 | } | ||
| 525 | bh_unlock_sock(sk); | 487 | bh_unlock_sock(sk); |
| 526 | sock_put(sk); | 488 | sock_put(sk); |
| 527 | } | 489 | } |
| @@ -638,10 +600,3 @@ out: | |||
| 638 | bh_unlock_sock(sk); | 600 | bh_unlock_sock(sk); |
| 639 | sock_put(sk); | 601 | sock_put(sk); |
| 640 | } | 602 | } |
| 641 | |||
| 642 | void tcp_init_xmit_timers(struct sock *sk) | ||
| 643 | { | ||
| 644 | inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, | ||
| 645 | &tcp_keepalive_timer); | ||
| 646 | } | ||
| 647 | EXPORT_SYMBOL(tcp_init_xmit_timers); | ||
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 0d017183062..ac3b3ee4b07 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c | |||
| @@ -105,7 +105,7 @@ drop: | |||
| 105 | return 0; | 105 | return 0; |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | #if IS_ENABLED(CONFIG_IPV6) | 108 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 109 | static int tunnel64_rcv(struct sk_buff *skb) | 109 | static int tunnel64_rcv(struct sk_buff *skb) |
| 110 | { | 110 | { |
| 111 | struct xfrm_tunnel *handler; | 111 | struct xfrm_tunnel *handler; |
| @@ -134,7 +134,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info) | |||
| 134 | break; | 134 | break; |
| 135 | } | 135 | } |
| 136 | 136 | ||
| 137 | #if IS_ENABLED(CONFIG_IPV6) | 137 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 138 | static void tunnel64_err(struct sk_buff *skb, u32 info) | 138 | static void tunnel64_err(struct sk_buff *skb, u32 info) |
| 139 | { | 139 | { |
| 140 | struct xfrm_tunnel *handler; | 140 | struct xfrm_tunnel *handler; |
| @@ -152,7 +152,7 @@ static const struct net_protocol tunnel4_protocol = { | |||
| 152 | .netns_ok = 1, | 152 | .netns_ok = 1, |
| 153 | }; | 153 | }; |
| 154 | 154 | ||
| 155 | #if IS_ENABLED(CONFIG_IPV6) | 155 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 156 | static const struct net_protocol tunnel64_protocol = { | 156 | static const struct net_protocol tunnel64_protocol = { |
| 157 | .handler = tunnel64_rcv, | 157 | .handler = tunnel64_rcv, |
| 158 | .err_handler = tunnel64_err, | 158 | .err_handler = tunnel64_err, |
| @@ -164,12 +164,12 @@ static const struct net_protocol tunnel64_protocol = { | |||
| 164 | static int __init tunnel4_init(void) | 164 | static int __init tunnel4_init(void) |
| 165 | { | 165 | { |
| 166 | if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) { | 166 | if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) { |
| 167 | pr_err("%s: can't add protocol\n", __func__); | 167 | printk(KERN_ERR "tunnel4 init: can't add protocol\n"); |
| 168 | return -EAGAIN; | 168 | return -EAGAIN; |
| 169 | } | 169 | } |
| 170 | #if IS_ENABLED(CONFIG_IPV6) | 170 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 171 | if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { | 171 | if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { |
| 172 | pr_err("tunnel64 init: can't add protocol\n"); | 172 | printk(KERN_ERR "tunnel64 init: can't add protocol\n"); |
| 173 | inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); | 173 | inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); |
| 174 | return -EAGAIN; | 174 | return -EAGAIN; |
| 175 | } | 175 | } |
| @@ -179,12 +179,12 @@ static int __init tunnel4_init(void) | |||
| 179 | 179 | ||
| 180 | static void __exit tunnel4_fini(void) | 180 | static void __exit tunnel4_fini(void) |
| 181 | { | 181 | { |
| 182 | #if IS_ENABLED(CONFIG_IPV6) | 182 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 183 | if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) | 183 | if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) |
| 184 | pr_err("tunnel64 close: can't remove protocol\n"); | 184 | printk(KERN_ERR "tunnel64 close: can't remove protocol\n"); |
| 185 | #endif | 185 | #endif |
| 186 | if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) | 186 | if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) |
| 187 | pr_err("tunnel4 close: can't remove protocol\n"); | 187 | printk(KERN_ERR "tunnel4 close: can't remove protocol\n"); |
| 188 | } | 188 | } |
| 189 | 189 | ||
| 190 | module_init(tunnel4_init); | 190 | module_init(tunnel4_init); |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 79c8dbe59b5..1b5a19340a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
| @@ -77,8 +77,7 @@ | |||
| 77 | * 2 of the License, or (at your option) any later version. | 77 | * 2 of the License, or (at your option) any later version. |
| 78 | */ | 78 | */ |
| 79 | 79 | ||
| 80 | #define pr_fmt(fmt) "UDP: " fmt | 80 | #include <asm/system.h> |
| 81 | |||
| 82 | #include <asm/uaccess.h> | 81 | #include <asm/uaccess.h> |
| 83 | #include <asm/ioctls.h> | 82 | #include <asm/ioctls.h> |
| 84 | #include <linux/bootmem.h> | 83 | #include <linux/bootmem.h> |
| @@ -107,8 +106,6 @@ | |||
| 107 | #include <net/checksum.h> | 106 | #include <net/checksum.h> |
| 108 | #include <net/xfrm.h> | 107 | #include <net/xfrm.h> |
| 109 | #include <trace/events/udp.h> | 108 | #include <trace/events/udp.h> |
| 110 | #include <linux/static_key.h> | ||
| 111 | #include <trace/events/skb.h> | ||
| 112 | #include "udp_impl.h" | 109 | #include "udp_impl.h" |
| 113 | 110 | ||
| 114 | struct udp_table udp_table __read_mostly; | 111 | struct udp_table udp_table __read_mostly; |
| @@ -208,7 +205,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
| 208 | 205 | ||
| 209 | if (!snum) { | 206 | if (!snum) { |
| 210 | int low, high, remaining; | 207 | int low, high, remaining; |
| 211 | unsigned int rand; | 208 | unsigned rand; |
| 212 | unsigned short first, last; | 209 | unsigned short first, last; |
| 213 | DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); | 210 | DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); |
| 214 | 211 | ||
| @@ -448,7 +445,7 @@ exact_match: | |||
| 448 | /* UDP is nearly always wildcards out the wazoo, it makes no sense to try | 445 | /* UDP is nearly always wildcards out the wazoo, it makes no sense to try |
| 449 | * harder than this. -DaveM | 446 | * harder than this. -DaveM |
| 450 | */ | 447 | */ |
| 451 | struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, | 448 | static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, |
| 452 | __be16 sport, __be32 daddr, __be16 dport, | 449 | __be16 sport, __be32 daddr, __be16 dport, |
| 453 | int dif, struct udp_table *udptable) | 450 | int dif, struct udp_table *udptable) |
| 454 | { | 451 | { |
| @@ -515,7 +512,6 @@ begin: | |||
| 515 | rcu_read_unlock(); | 512 | rcu_read_unlock(); |
| 516 | return result; | 513 | return result; |
| 517 | } | 514 | } |
| 518 | EXPORT_SYMBOL_GPL(__udp4_lib_lookup); | ||
| 519 | 515 | ||
| 520 | static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, | 516 | static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, |
| 521 | __be16 sport, __be16 dport, | 517 | __be16 sport, __be16 dport, |
| @@ -616,7 +612,6 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) | |||
| 616 | break; | 612 | break; |
| 617 | case ICMP_DEST_UNREACH: | 613 | case ICMP_DEST_UNREACH: |
| 618 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ | 614 | if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ |
| 619 | ipv4_sk_update_pmtu(skb, sk, info); | ||
| 620 | if (inet->pmtudisc != IP_PMTUDISC_DONT) { | 615 | if (inet->pmtudisc != IP_PMTUDISC_DONT) { |
| 621 | err = EMSGSIZE; | 616 | err = EMSGSIZE; |
| 622 | harderr = 1; | 617 | harderr = 1; |
| @@ -630,9 +625,6 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) | |||
| 630 | err = icmp_err_convert[code].errno; | 625 | err = icmp_err_convert[code].errno; |
| 631 | } | 626 | } |
| 632 | break; | 627 | break; |
| 633 | case ICMP_REDIRECT: | ||
| 634 | ipv4_sk_redirect(skb, sk); | ||
| 635 | break; | ||
| 636 | } | 628 | } |
| 637 | 629 | ||
| 638 | /* | 630 | /* |
| @@ -758,7 +750,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) | |||
| 758 | uh->check = CSUM_MANGLED_0; | 750 | uh->check = CSUM_MANGLED_0; |
| 759 | 751 | ||
| 760 | send: | 752 | send: |
| 761 | err = ip_send_skb(sock_net(sk), skb); | 753 | err = ip_send_skb(skb); |
| 762 | if (err) { | 754 | if (err) { |
| 763 | if (err == -ENOBUFS && !inet->recverr) { | 755 | if (err == -ENOBUFS && !inet->recverr) { |
| 764 | UDP_INC_STATS_USER(sock_net(sk), | 756 | UDP_INC_STATS_USER(sock_net(sk), |
| @@ -852,7 +844,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 852 | * Get and verify the address. | 844 | * Get and verify the address. |
| 853 | */ | 845 | */ |
| 854 | if (msg->msg_name) { | 846 | if (msg->msg_name) { |
| 855 | struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; | 847 | struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; |
| 856 | if (msg->msg_namelen < sizeof(*usin)) | 848 | if (msg->msg_namelen < sizeof(*usin)) |
| 857 | return -EINVAL; | 849 | return -EINVAL; |
| 858 | if (usin->sin_family != AF_INET) { | 850 | if (usin->sin_family != AF_INET) { |
| @@ -924,8 +916,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 924 | if (!saddr) | 916 | if (!saddr) |
| 925 | saddr = inet->mc_addr; | 917 | saddr = inet->mc_addr; |
| 926 | connected = 0; | 918 | connected = 0; |
| 927 | } else if (!ipc.oif) | 919 | } |
| 928 | ipc.oif = inet->uc_index; | ||
| 929 | 920 | ||
| 930 | if (connected) | 921 | if (connected) |
| 931 | rt = (struct rtable *)sk_dst_check(sk, 0); | 922 | rt = (struct rtable *)sk_dst_check(sk, 0); |
| @@ -982,7 +973,7 @@ back_from_confirm: | |||
| 982 | /* ... which is an evident application bug. --ANK */ | 973 | /* ... which is an evident application bug. --ANK */ |
| 983 | release_sock(sk); | 974 | release_sock(sk); |
| 984 | 975 | ||
| 985 | LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n")); | 976 | LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); |
| 986 | err = -EINVAL; | 977 | err = -EINVAL; |
| 987 | goto out; | 978 | goto out; |
| 988 | } | 979 | } |
| @@ -1061,7 +1052,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, | |||
| 1061 | if (unlikely(!up->pending)) { | 1052 | if (unlikely(!up->pending)) { |
| 1062 | release_sock(sk); | 1053 | release_sock(sk); |
| 1063 | 1054 | ||
| 1064 | LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n")); | 1055 | LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n"); |
| 1065 | return -EINVAL; | 1056 | return -EINVAL; |
| 1066 | } | 1057 | } |
| 1067 | 1058 | ||
| @@ -1173,8 +1164,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 1173 | struct inet_sock *inet = inet_sk(sk); | 1164 | struct inet_sock *inet = inet_sk(sk); |
| 1174 | struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; | 1165 | struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; |
| 1175 | struct sk_buff *skb; | 1166 | struct sk_buff *skb; |
| 1176 | unsigned int ulen, copied; | 1167 | unsigned int ulen; |
| 1177 | int peeked, off = 0; | 1168 | int peeked; |
| 1178 | int err; | 1169 | int err; |
| 1179 | int is_udplite = IS_UDPLITE(sk); | 1170 | int is_udplite = IS_UDPLITE(sk); |
| 1180 | bool slow; | 1171 | bool slow; |
| @@ -1190,15 +1181,14 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 1190 | 1181 | ||
| 1191 | try_again: | 1182 | try_again: |
| 1192 | skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), | 1183 | skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), |
| 1193 | &peeked, &off, &err); | 1184 | &peeked, &err); |
| 1194 | if (!skb) | 1185 | if (!skb) |
| 1195 | goto out; | 1186 | goto out; |
| 1196 | 1187 | ||
| 1197 | ulen = skb->len - sizeof(struct udphdr); | 1188 | ulen = skb->len - sizeof(struct udphdr); |
| 1198 | copied = len; | 1189 | if (len > ulen) |
| 1199 | if (copied > ulen) | 1190 | len = ulen; |
| 1200 | copied = ulen; | 1191 | else if (len < ulen) |
| 1201 | else if (copied < ulen) | ||
| 1202 | msg->msg_flags |= MSG_TRUNC; | 1192 | msg->msg_flags |= MSG_TRUNC; |
| 1203 | 1193 | ||
| 1204 | /* | 1194 | /* |
| @@ -1207,14 +1197,14 @@ try_again: | |||
| 1207 | * coverage checksum (UDP-Lite), do it before the copy. | 1197 | * coverage checksum (UDP-Lite), do it before the copy. |
| 1208 | */ | 1198 | */ |
| 1209 | 1199 | ||
| 1210 | if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { | 1200 | if (len < ulen || UDP_SKB_CB(skb)->partial_cov) { |
| 1211 | if (udp_lib_checksum_complete(skb)) | 1201 | if (udp_lib_checksum_complete(skb)) |
| 1212 | goto csum_copy_err; | 1202 | goto csum_copy_err; |
| 1213 | } | 1203 | } |
| 1214 | 1204 | ||
| 1215 | if (skb_csum_unnecessary(skb)) | 1205 | if (skb_csum_unnecessary(skb)) |
| 1216 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), | 1206 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), |
| 1217 | msg->msg_iov, copied); | 1207 | msg->msg_iov, len); |
| 1218 | else { | 1208 | else { |
| 1219 | err = skb_copy_and_csum_datagram_iovec(skb, | 1209 | err = skb_copy_and_csum_datagram_iovec(skb, |
| 1220 | sizeof(struct udphdr), | 1210 | sizeof(struct udphdr), |
| @@ -1224,15 +1214,8 @@ try_again: | |||
| 1224 | goto csum_copy_err; | 1214 | goto csum_copy_err; |
| 1225 | } | 1215 | } |
| 1226 | 1216 | ||
| 1227 | if (unlikely(err)) { | 1217 | if (err) |
| 1228 | trace_kfree_skb(skb, udp_recvmsg); | ||
| 1229 | if (!peeked) { | ||
| 1230 | atomic_inc(&sk->sk_drops); | ||
| 1231 | UDP_INC_STATS_USER(sock_net(sk), | ||
| 1232 | UDP_MIB_INERRORS, is_udplite); | ||
| 1233 | } | ||
| 1234 | goto out_free; | 1218 | goto out_free; |
| 1235 | } | ||
| 1236 | 1219 | ||
| 1237 | if (!peeked) | 1220 | if (!peeked) |
| 1238 | UDP_INC_STATS_USER(sock_net(sk), | 1221 | UDP_INC_STATS_USER(sock_net(sk), |
| @@ -1250,7 +1233,7 @@ try_again: | |||
| 1250 | if (inet->cmsg_flags) | 1233 | if (inet->cmsg_flags) |
| 1251 | ip_cmsg_recv(msg, skb); | 1234 | ip_cmsg_recv(msg, skb); |
| 1252 | 1235 | ||
| 1253 | err = copied; | 1236 | err = len; |
| 1254 | if (flags & MSG_TRUNC) | 1237 | if (flags & MSG_TRUNC) |
| 1255 | err = ulen; | 1238 | err = ulen; |
| 1256 | 1239 | ||
| @@ -1284,7 +1267,7 @@ int udp_disconnect(struct sock *sk, int flags) | |||
| 1284 | sk->sk_state = TCP_CLOSE; | 1267 | sk->sk_state = TCP_CLOSE; |
| 1285 | inet->inet_daddr = 0; | 1268 | inet->inet_daddr = 0; |
| 1286 | inet->inet_dport = 0; | 1269 | inet->inet_dport = 0; |
| 1287 | sock_rps_reset_rxhash(sk); | 1270 | sock_rps_save_rxhash(sk, 0); |
| 1288 | sk->sk_bound_dev_if = 0; | 1271 | sk->sk_bound_dev_if = 0; |
| 1289 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) | 1272 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) |
| 1290 | inet_reset_saddr(sk); | 1273 | inet_reset_saddr(sk); |
| @@ -1372,9 +1355,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1372 | int rc; | 1355 | int rc; |
| 1373 | 1356 | ||
| 1374 | if (inet_sk(sk)->inet_daddr) | 1357 | if (inet_sk(sk)->inet_daddr) |
| 1375 | sock_rps_save_rxhash(sk, skb); | 1358 | sock_rps_save_rxhash(sk, skb->rxhash); |
| 1376 | 1359 | ||
| 1377 | rc = sock_queue_rcv_skb(sk, skb); | 1360 | rc = ip_queue_rcv_skb(sk, skb); |
| 1378 | if (rc < 0) { | 1361 | if (rc < 0) { |
| 1379 | int is_udplite = IS_UDPLITE(sk); | 1362 | int is_udplite = IS_UDPLITE(sk); |
| 1380 | 1363 | ||
| @@ -1392,14 +1375,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1392 | 1375 | ||
| 1393 | } | 1376 | } |
| 1394 | 1377 | ||
| 1395 | static struct static_key udp_encap_needed __read_mostly; | ||
| 1396 | void udp_encap_enable(void) | ||
| 1397 | { | ||
| 1398 | if (!static_key_enabled(&udp_encap_needed)) | ||
| 1399 | static_key_slow_inc(&udp_encap_needed); | ||
| 1400 | } | ||
| 1401 | EXPORT_SYMBOL(udp_encap_enable); | ||
| 1402 | |||
| 1403 | /* returns: | 1378 | /* returns: |
| 1404 | * -1: error | 1379 | * -1: error |
| 1405 | * 0: success | 1380 | * 0: success |
| @@ -1421,9 +1396,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1421 | goto drop; | 1396 | goto drop; |
| 1422 | nf_reset(skb); | 1397 | nf_reset(skb); |
| 1423 | 1398 | ||
| 1424 | if (static_key_false(&udp_encap_needed) && up->encap_type) { | 1399 | if (up->encap_type) { |
| 1425 | int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); | ||
| 1426 | |||
| 1427 | /* | 1400 | /* |
| 1428 | * This is an encapsulation socket so pass the skb to | 1401 | * This is an encapsulation socket so pass the skb to |
| 1429 | * the socket's udp_encap_rcv() hook. Otherwise, just | 1402 | * the socket's udp_encap_rcv() hook. Otherwise, just |
| @@ -1436,11 +1409,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1436 | */ | 1409 | */ |
| 1437 | 1410 | ||
| 1438 | /* if we're overly short, let UDP handle it */ | 1411 | /* if we're overly short, let UDP handle it */ |
| 1439 | encap_rcv = ACCESS_ONCE(up->encap_rcv); | 1412 | if (skb->len > sizeof(struct udphdr) && |
| 1440 | if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { | 1413 | up->encap_rcv != NULL) { |
| 1441 | int ret; | 1414 | int ret; |
| 1442 | 1415 | ||
| 1443 | ret = encap_rcv(sk, skb); | 1416 | ret = (*up->encap_rcv)(sk, skb); |
| 1444 | if (ret <= 0) { | 1417 | if (ret <= 0) { |
| 1445 | UDP_INC_STATS_BH(sock_net(sk), | 1418 | UDP_INC_STATS_BH(sock_net(sk), |
| 1446 | UDP_MIB_INDATAGRAMS, | 1419 | UDP_MIB_INDATAGRAMS, |
| @@ -1469,8 +1442,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1469 | * provided by the application." | 1442 | * provided by the application." |
| 1470 | */ | 1443 | */ |
| 1471 | if (up->pcrlen == 0) { /* full coverage was set */ | 1444 | if (up->pcrlen == 0) { /* full coverage was set */ |
| 1472 | LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n", | 1445 | LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage " |
| 1473 | UDP_SKB_CB(skb)->cscov, skb->len); | 1446 | "%d while full coverage %d requested\n", |
| 1447 | UDP_SKB_CB(skb)->cscov, skb->len); | ||
| 1474 | goto drop; | 1448 | goto drop; |
| 1475 | } | 1449 | } |
| 1476 | /* The next case involves violating the min. coverage requested | 1450 | /* The next case involves violating the min. coverage requested |
| @@ -1480,27 +1454,28 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1480 | * Therefore the above ...()->partial_cov statement is essential. | 1454 | * Therefore the above ...()->partial_cov statement is essential. |
| 1481 | */ | 1455 | */ |
| 1482 | if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { | 1456 | if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { |
| 1483 | LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n", | 1457 | LIMIT_NETDEBUG(KERN_WARNING |
| 1484 | UDP_SKB_CB(skb)->cscov, up->pcrlen); | 1458 | "UDPLITE: coverage %d too small, need min %d\n", |
| 1459 | UDP_SKB_CB(skb)->cscov, up->pcrlen); | ||
| 1485 | goto drop; | 1460 | goto drop; |
| 1486 | } | 1461 | } |
| 1487 | } | 1462 | } |
| 1488 | 1463 | ||
| 1489 | if (rcu_access_pointer(sk->sk_filter) && | 1464 | if (rcu_dereference_raw(sk->sk_filter)) { |
| 1490 | udp_lib_checksum_complete(skb)) | 1465 | if (udp_lib_checksum_complete(skb)) |
| 1491 | goto drop; | 1466 | goto drop; |
| 1467 | } | ||
| 1492 | 1468 | ||
| 1493 | 1469 | ||
| 1494 | if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) | 1470 | if (sk_rcvqueues_full(sk, skb)) |
| 1495 | goto drop; | 1471 | goto drop; |
| 1496 | 1472 | ||
| 1497 | rc = 0; | 1473 | rc = 0; |
| 1498 | 1474 | ||
| 1499 | ipv4_pktinfo_prepare(skb); | ||
| 1500 | bh_lock_sock(sk); | 1475 | bh_lock_sock(sk); |
| 1501 | if (!sock_owned_by_user(sk)) | 1476 | if (!sock_owned_by_user(sk)) |
| 1502 | rc = __udp_queue_rcv_skb(sk, skb); | 1477 | rc = __udp_queue_rcv_skb(sk, skb); |
| 1503 | else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { | 1478 | else if (sk_add_backlog(sk, skb)) { |
| 1504 | bh_unlock_sock(sk); | 1479 | bh_unlock_sock(sk); |
| 1505 | goto drop; | 1480 | goto drop; |
| 1506 | } | 1481 | } |
| @@ -1709,10 +1684,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, | |||
| 1709 | 1684 | ||
| 1710 | short_packet: | 1685 | short_packet: |
| 1711 | LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", | 1686 | LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", |
| 1712 | proto == IPPROTO_UDPLITE ? "Lite" : "", | 1687 | proto == IPPROTO_UDPLITE ? "-Lite" : "", |
| 1713 | &saddr, ntohs(uh->source), | 1688 | &saddr, |
| 1714 | ulen, skb->len, | 1689 | ntohs(uh->source), |
| 1715 | &daddr, ntohs(uh->dest)); | 1690 | ulen, |
| 1691 | skb->len, | ||
| 1692 | &daddr, | ||
| 1693 | ntohs(uh->dest)); | ||
| 1716 | goto drop; | 1694 | goto drop; |
| 1717 | 1695 | ||
| 1718 | csum_error: | 1696 | csum_error: |
| @@ -1721,8 +1699,11 @@ csum_error: | |||
| 1721 | * the network is concerned, anyway) as per 4.1.3.4 (MUST). | 1699 | * the network is concerned, anyway) as per 4.1.3.4 (MUST). |
| 1722 | */ | 1700 | */ |
| 1723 | LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", | 1701 | LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", |
| 1724 | proto == IPPROTO_UDPLITE ? "Lite" : "", | 1702 | proto == IPPROTO_UDPLITE ? "-Lite" : "", |
| 1725 | &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), | 1703 | &saddr, |
| 1704 | ntohs(uh->source), | ||
| 1705 | &daddr, | ||
| 1706 | ntohs(uh->dest), | ||
| 1726 | ulen); | 1707 | ulen); |
| 1727 | drop: | 1708 | drop: |
| 1728 | UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); | 1709 | UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); |
| @@ -1781,7 +1762,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, | |||
| 1781 | /* FALLTHROUGH */ | 1762 | /* FALLTHROUGH */ |
| 1782 | case UDP_ENCAP_L2TPINUDP: | 1763 | case UDP_ENCAP_L2TPINUDP: |
| 1783 | up->encap_type = val; | 1764 | up->encap_type = val; |
| 1784 | udp_encap_enable(); | ||
| 1785 | break; | 1765 | break; |
| 1786 | default: | 1766 | default: |
| 1787 | err = -ENOPROTOOPT; | 1767 | err = -ENOPROTOOPT; |
| @@ -2058,7 +2038,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v) | |||
| 2058 | spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); | 2038 | spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); |
| 2059 | } | 2039 | } |
| 2060 | 2040 | ||
| 2061 | int udp_seq_open(struct inode *inode, struct file *file) | 2041 | static int udp_seq_open(struct inode *inode, struct file *file) |
| 2062 | { | 2042 | { |
| 2063 | struct udp_seq_afinfo *afinfo = PDE(inode)->data; | 2043 | struct udp_seq_afinfo *afinfo = PDE(inode)->data; |
| 2064 | struct udp_iter_state *s; | 2044 | struct udp_iter_state *s; |
| @@ -2074,7 +2054,6 @@ int udp_seq_open(struct inode *inode, struct file *file) | |||
| 2074 | s->udp_table = afinfo->udp_table; | 2054 | s->udp_table = afinfo->udp_table; |
| 2075 | return err; | 2055 | return err; |
| 2076 | } | 2056 | } |
| 2077 | EXPORT_SYMBOL(udp_seq_open); | ||
| 2078 | 2057 | ||
| 2079 | /* ------------------------------------------------------------------------ */ | 2058 | /* ------------------------------------------------------------------------ */ |
| 2080 | int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) | 2059 | int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) |
| @@ -2082,12 +2061,17 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) | |||
| 2082 | struct proc_dir_entry *p; | 2061 | struct proc_dir_entry *p; |
| 2083 | int rc = 0; | 2062 | int rc = 0; |
| 2084 | 2063 | ||
| 2064 | afinfo->seq_fops.open = udp_seq_open; | ||
| 2065 | afinfo->seq_fops.read = seq_read; | ||
| 2066 | afinfo->seq_fops.llseek = seq_lseek; | ||
| 2067 | afinfo->seq_fops.release = seq_release_net; | ||
| 2068 | |||
| 2085 | afinfo->seq_ops.start = udp_seq_start; | 2069 | afinfo->seq_ops.start = udp_seq_start; |
| 2086 | afinfo->seq_ops.next = udp_seq_next; | 2070 | afinfo->seq_ops.next = udp_seq_next; |
| 2087 | afinfo->seq_ops.stop = udp_seq_stop; | 2071 | afinfo->seq_ops.stop = udp_seq_stop; |
| 2088 | 2072 | ||
| 2089 | p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, | 2073 | p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, |
| 2090 | afinfo->seq_fops, afinfo); | 2074 | &afinfo->seq_fops, afinfo); |
| 2091 | if (!p) | 2075 | if (!p) |
| 2092 | rc = -ENOMEM; | 2076 | rc = -ENOMEM; |
| 2093 | return rc; | 2077 | return rc; |
| @@ -2115,9 +2099,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, | |||
| 2115 | bucket, src, srcp, dest, destp, sp->sk_state, | 2099 | bucket, src, srcp, dest, destp, sp->sk_state, |
| 2116 | sk_wmem_alloc_get(sp), | 2100 | sk_wmem_alloc_get(sp), |
| 2117 | sk_rmem_alloc_get(sp), | 2101 | sk_rmem_alloc_get(sp), |
| 2118 | 0, 0L, 0, | 2102 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), |
| 2119 | from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), | ||
| 2120 | 0, sock_i_ino(sp), | ||
| 2121 | atomic_read(&sp->sk_refcnt), sp, | 2103 | atomic_read(&sp->sk_refcnt), sp, |
| 2122 | atomic_read(&sp->sk_drops), len); | 2104 | atomic_read(&sp->sk_drops), len); |
| 2123 | } | 2105 | } |
| @@ -2139,20 +2121,14 @@ int udp4_seq_show(struct seq_file *seq, void *v) | |||
| 2139 | return 0; | 2121 | return 0; |
| 2140 | } | 2122 | } |
| 2141 | 2123 | ||
| 2142 | static const struct file_operations udp_afinfo_seq_fops = { | ||
| 2143 | .owner = THIS_MODULE, | ||
| 2144 | .open = udp_seq_open, | ||
| 2145 | .read = seq_read, | ||
| 2146 | .llseek = seq_lseek, | ||
| 2147 | .release = seq_release_net | ||
| 2148 | }; | ||
| 2149 | |||
| 2150 | /* ------------------------------------------------------------------------ */ | 2124 | /* ------------------------------------------------------------------------ */ |
| 2151 | static struct udp_seq_afinfo udp4_seq_afinfo = { | 2125 | static struct udp_seq_afinfo udp4_seq_afinfo = { |
| 2152 | .name = "udp", | 2126 | .name = "udp", |
| 2153 | .family = AF_INET, | 2127 | .family = AF_INET, |
| 2154 | .udp_table = &udp_table, | 2128 | .udp_table = &udp_table, |
| 2155 | .seq_fops = &udp_afinfo_seq_fops, | 2129 | .seq_fops = { |
| 2130 | .owner = THIS_MODULE, | ||
| 2131 | }, | ||
| 2156 | .seq_ops = { | 2132 | .seq_ops = { |
| 2157 | .show = udp4_seq_show, | 2133 | .show = udp4_seq_show, |
| 2158 | }, | 2134 | }, |
| @@ -2187,15 +2163,9 @@ void udp4_proc_exit(void) | |||
| 2187 | static __initdata unsigned long uhash_entries; | 2163 | static __initdata unsigned long uhash_entries; |
| 2188 | static int __init set_uhash_entries(char *str) | 2164 | static int __init set_uhash_entries(char *str) |
| 2189 | { | 2165 | { |
| 2190 | ssize_t ret; | ||
| 2191 | |||
| 2192 | if (!str) | 2166 | if (!str) |
| 2193 | return 0; | 2167 | return 0; |
| 2194 | 2168 | uhash_entries = simple_strtoul(str, &str, 0); | |
| 2195 | ret = kstrtoul(str, 0, &uhash_entries); | ||
| 2196 | if (ret) | ||
| 2197 | return 0; | ||
| 2198 | |||
| 2199 | if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) | 2169 | if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) |
| 2200 | uhash_entries = UDP_HTABLE_SIZE_MIN; | 2170 | uhash_entries = UDP_HTABLE_SIZE_MIN; |
| 2201 | return 1; | 2171 | return 1; |
| @@ -2206,16 +2176,26 @@ void __init udp_table_init(struct udp_table *table, const char *name) | |||
| 2206 | { | 2176 | { |
| 2207 | unsigned int i; | 2177 | unsigned int i; |
| 2208 | 2178 | ||
| 2209 | table->hash = alloc_large_system_hash(name, | 2179 | if (!CONFIG_BASE_SMALL) |
| 2210 | 2 * sizeof(struct udp_hslot), | 2180 | table->hash = alloc_large_system_hash(name, |
| 2211 | uhash_entries, | 2181 | 2 * sizeof(struct udp_hslot), |
| 2212 | 21, /* one slot per 2 MB */ | 2182 | uhash_entries, |
| 2213 | 0, | 2183 | 21, /* one slot per 2 MB */ |
| 2214 | &table->log, | 2184 | 0, |
| 2215 | &table->mask, | 2185 | &table->log, |
| 2216 | UDP_HTABLE_SIZE_MIN, | 2186 | &table->mask, |
| 2217 | 64 * 1024); | 2187 | 64 * 1024); |
| 2218 | 2188 | /* | |
| 2189 | * Make sure hash table has the minimum size | ||
| 2190 | */ | ||
| 2191 | if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { | ||
| 2192 | table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * | ||
| 2193 | 2 * sizeof(struct udp_hslot), GFP_KERNEL); | ||
| 2194 | if (!table->hash) | ||
| 2195 | panic(name); | ||
| 2196 | table->log = ilog2(UDP_HTABLE_SIZE_MIN); | ||
| 2197 | table->mask = UDP_HTABLE_SIZE_MIN - 1; | ||
| 2198 | } | ||
| 2219 | table->hash2 = table->hash + (table->mask + 1); | 2199 | table->hash2 = table->hash + (table->mask + 1); |
| 2220 | for (i = 0; i <= table->mask; i++) { | 2200 | for (i = 0; i <= table->mask; i++) { |
| 2221 | INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); | 2201 | INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); |
| @@ -2263,8 +2243,7 @@ int udp4_ufo_send_check(struct sk_buff *skb) | |||
| 2263 | return 0; | 2243 | return 0; |
| 2264 | } | 2244 | } |
| 2265 | 2245 | ||
| 2266 | struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, | 2246 | struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) |
| 2267 | netdev_features_t features) | ||
| 2268 | { | 2247 | { |
| 2269 | struct sk_buff *segs = ERR_PTR(-EINVAL); | 2248 | struct sk_buff *segs = ERR_PTR(-EINVAL); |
| 2270 | unsigned int mss; | 2249 | unsigned int mss; |
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c deleted file mode 100644 index 505b30ad918..00000000000 --- a/net/ipv4/udp_diag.c +++ /dev/null | |||
| @@ -1,216 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * udp_diag.c Module for monitoring UDP transport protocols sockets. | ||
| 3 | * | ||
| 4 | * Authors: Pavel Emelyanov, <xemul@parallels.com> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public License | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the License, or (at your option) any later version. | ||
| 10 | */ | ||
| 11 | |||
| 12 | |||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/inet_diag.h> | ||
| 15 | #include <linux/udp.h> | ||
| 16 | #include <net/udp.h> | ||
| 17 | #include <net/udplite.h> | ||
| 18 | #include <linux/sock_diag.h> | ||
| 19 | |||
| 20 | static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, | ||
| 21 | struct netlink_callback *cb, struct inet_diag_req_v2 *req, | ||
| 22 | struct nlattr *bc) | ||
| 23 | { | ||
| 24 | if (!inet_diag_bc_sk(bc, sk)) | ||
| 25 | return 0; | ||
| 26 | |||
| 27 | return inet_sk_diag_fill(sk, NULL, skb, req, | ||
| 28 | sk_user_ns(NETLINK_CB(cb->skb).ssk), | ||
| 29 | NETLINK_CB(cb->skb).portid, | ||
| 30 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); | ||
| 31 | } | ||
| 32 | |||
| 33 | static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, | ||
| 34 | const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) | ||
| 35 | { | ||
| 36 | int err = -EINVAL; | ||
| 37 | struct sock *sk; | ||
| 38 | struct sk_buff *rep; | ||
| 39 | struct net *net = sock_net(in_skb->sk); | ||
| 40 | |||
| 41 | if (req->sdiag_family == AF_INET) | ||
| 42 | sk = __udp4_lib_lookup(net, | ||
| 43 | req->id.idiag_src[0], req->id.idiag_sport, | ||
| 44 | req->id.idiag_dst[0], req->id.idiag_dport, | ||
| 45 | req->id.idiag_if, tbl); | ||
| 46 | #if IS_ENABLED(CONFIG_IPV6) | ||
| 47 | else if (req->sdiag_family == AF_INET6) | ||
| 48 | sk = __udp6_lib_lookup(net, | ||
| 49 | (struct in6_addr *)req->id.idiag_src, | ||
| 50 | req->id.idiag_sport, | ||
| 51 | (struct in6_addr *)req->id.idiag_dst, | ||
| 52 | req->id.idiag_dport, | ||
| 53 | req->id.idiag_if, tbl); | ||
| 54 | #endif | ||
| 55 | else | ||
| 56 | goto out_nosk; | ||
| 57 | |||
| 58 | err = -ENOENT; | ||
| 59 | if (sk == NULL) | ||
| 60 | goto out_nosk; | ||
| 61 | |||
| 62 | err = sock_diag_check_cookie(sk, req->id.idiag_cookie); | ||
| 63 | if (err) | ||
| 64 | goto out; | ||
| 65 | |||
| 66 | err = -ENOMEM; | ||
| 67 | rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + | ||
| 68 | sizeof(struct inet_diag_meminfo) + | ||
| 69 | 64)), GFP_KERNEL); | ||
| 70 | if (!rep) | ||
| 71 | goto out; | ||
| 72 | |||
| 73 | err = inet_sk_diag_fill(sk, NULL, rep, req, | ||
| 74 | sk_user_ns(NETLINK_CB(in_skb).ssk), | ||
| 75 | NETLINK_CB(in_skb).portid, | ||
| 76 | nlh->nlmsg_seq, 0, nlh); | ||
| 77 | if (err < 0) { | ||
| 78 | WARN_ON(err == -EMSGSIZE); | ||
| 79 | kfree_skb(rep); | ||
| 80 | goto out; | ||
| 81 | } | ||
| 82 | err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, | ||
| 83 | MSG_DONTWAIT); | ||
| 84 | if (err > 0) | ||
| 85 | err = 0; | ||
| 86 | out: | ||
| 87 | if (sk) | ||
| 88 | sock_put(sk); | ||
| 89 | out_nosk: | ||
| 90 | return err; | ||
| 91 | } | ||
| 92 | |||
| 93 | static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, | ||
| 94 | struct inet_diag_req_v2 *r, struct nlattr *bc) | ||
| 95 | { | ||
| 96 | int num, s_num, slot, s_slot; | ||
| 97 | struct net *net = sock_net(skb->sk); | ||
| 98 | |||
| 99 | s_slot = cb->args[0]; | ||
| 100 | num = s_num = cb->args[1]; | ||
| 101 | |||
| 102 | for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) { | ||
| 103 | struct sock *sk; | ||
| 104 | struct hlist_nulls_node *node; | ||
| 105 | struct udp_hslot *hslot = &table->hash[slot]; | ||
| 106 | |||
| 107 | if (hlist_nulls_empty(&hslot->head)) | ||
| 108 | continue; | ||
| 109 | |||
| 110 | spin_lock_bh(&hslot->lock); | ||
| 111 | sk_nulls_for_each(sk, node, &hslot->head) { | ||
| 112 | struct inet_sock *inet = inet_sk(sk); | ||
| 113 | |||
| 114 | if (!net_eq(sock_net(sk), net)) | ||
| 115 | continue; | ||
| 116 | if (num < s_num) | ||
| 117 | goto next; | ||
| 118 | if (!(r->idiag_states & (1 << sk->sk_state))) | ||
| 119 | goto next; | ||
| 120 | if (r->sdiag_family != AF_UNSPEC && | ||
| 121 | sk->sk_family != r->sdiag_family) | ||
| 122 | goto next; | ||
| 123 | if (r->id.idiag_sport != inet->inet_sport && | ||
| 124 | r->id.idiag_sport) | ||
| 125 | goto next; | ||
| 126 | if (r->id.idiag_dport != inet->inet_dport && | ||
| 127 | r->id.idiag_dport) | ||
| 128 | goto next; | ||
| 129 | |||
| 130 | if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { | ||
| 131 | spin_unlock_bh(&hslot->lock); | ||
| 132 | goto done; | ||
| 133 | } | ||
| 134 | next: | ||
| 135 | num++; | ||
| 136 | } | ||
| 137 | spin_unlock_bh(&hslot->lock); | ||
| 138 | } | ||
| 139 | done: | ||
| 140 | cb->args[0] = slot; | ||
| 141 | cb->args[1] = num; | ||
| 142 | } | ||
| 143 | |||
| 144 | static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, | ||
| 145 | struct inet_diag_req_v2 *r, struct nlattr *bc) | ||
| 146 | { | ||
| 147 | udp_dump(&udp_table, skb, cb, r, bc); | ||
| 148 | } | ||
| 149 | |||
| 150 | static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, | ||
| 151 | struct inet_diag_req_v2 *req) | ||
| 152 | { | ||
| 153 | return udp_dump_one(&udp_table, in_skb, nlh, req); | ||
| 154 | } | ||
| 155 | |||
| 156 | static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, | ||
| 157 | void *info) | ||
| 158 | { | ||
| 159 | r->idiag_rqueue = sk_rmem_alloc_get(sk); | ||
| 160 | r->idiag_wqueue = sk_wmem_alloc_get(sk); | ||
| 161 | } | ||
| 162 | |||
| 163 | static const struct inet_diag_handler udp_diag_handler = { | ||
| 164 | .dump = udp_diag_dump, | ||
| 165 | .dump_one = udp_diag_dump_one, | ||
| 166 | .idiag_get_info = udp_diag_get_info, | ||
| 167 | .idiag_type = IPPROTO_UDP, | ||
| 168 | }; | ||
| 169 | |||
| 170 | static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, | ||
| 171 | struct inet_diag_req_v2 *r, struct nlattr *bc) | ||
| 172 | { | ||
| 173 | udp_dump(&udplite_table, skb, cb, r, bc); | ||
| 174 | } | ||
| 175 | |||
| 176 | static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, | ||
| 177 | struct inet_diag_req_v2 *req) | ||
| 178 | { | ||
| 179 | return udp_dump_one(&udplite_table, in_skb, nlh, req); | ||
| 180 | } | ||
| 181 | |||
| 182 | static const struct inet_diag_handler udplite_diag_handler = { | ||
| 183 | .dump = udplite_diag_dump, | ||
| 184 | .dump_one = udplite_diag_dump_one, | ||
| 185 | .idiag_get_info = udp_diag_get_info, | ||
| 186 | .idiag_type = IPPROTO_UDPLITE, | ||
| 187 | }; | ||
| 188 | |||
| 189 | static int __init udp_diag_init(void) | ||
| 190 | { | ||
| 191 | int err; | ||
| 192 | |||
| 193 | err = inet_diag_register(&udp_diag_handler); | ||
| 194 | if (err) | ||
| 195 | goto out; | ||
| 196 | err = inet_diag_register(&udplite_diag_handler); | ||
| 197 | if (err) | ||
| 198 | goto out_lite; | ||
| 199 | out: | ||
| 200 | return err; | ||
| 201 | out_lite: | ||
| 202 | inet_diag_unregister(&udp_diag_handler); | ||
| 203 | goto out; | ||
| 204 | } | ||
| 205 | |||
| 206 | static void __exit udp_diag_exit(void) | ||
| 207 | { | ||
| 208 | inet_diag_unregister(&udplite_diag_handler); | ||
| 209 | inet_diag_unregister(&udp_diag_handler); | ||
| 210 | } | ||
| 211 | |||
| 212 | module_init(udp_diag_init); | ||
| 213 | module_exit(udp_diag_exit); | ||
| 214 | MODULE_LICENSE("GPL"); | ||
| 215 | MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); | ||
| 216 | MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */); | ||
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 5a681e298b9..aaad650d47d 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h | |||
| @@ -25,7 +25,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
| 25 | size_t len, int noblock, int flags, int *addr_len); | 25 | size_t len, int noblock, int flags, int *addr_len); |
| 26 | extern int udp_sendpage(struct sock *sk, struct page *page, int offset, | 26 | extern int udp_sendpage(struct sock *sk, struct page *page, int offset, |
| 27 | size_t size, int flags); | 27 | size_t size, int flags); |
| 28 | extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); | 28 | extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); |
| 29 | extern void udp_destroy_sock(struct sock *sk); | 29 | extern void udp_destroy_sock(struct sock *sk); |
| 30 | 30 | ||
| 31 | #ifdef CONFIG_PROC_FS | 31 | #ifdef CONFIG_PROC_FS |
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 2c46acd4cc3..aee9963f7f5 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c | |||
| @@ -10,10 +10,6 @@ | |||
| 10 | * as published by the Free Software Foundation; either version | 10 | * as published by the Free Software Foundation; either version |
| 11 | * 2 of the License, or (at your option) any later version. | 11 | * 2 of the License, or (at your option) any later version. |
| 12 | */ | 12 | */ |
| 13 | |||
| 14 | #define pr_fmt(fmt) "UDPLite: " fmt | ||
| 15 | |||
| 16 | #include <linux/export.h> | ||
| 17 | #include "udp_impl.h" | 13 | #include "udp_impl.h" |
| 18 | 14 | ||
| 19 | struct udp_table udplite_table __read_mostly; | 15 | struct udp_table udplite_table __read_mostly; |
| @@ -75,20 +71,13 @@ static struct inet_protosw udplite4_protosw = { | |||
| 75 | }; | 71 | }; |
| 76 | 72 | ||
| 77 | #ifdef CONFIG_PROC_FS | 73 | #ifdef CONFIG_PROC_FS |
| 78 | |||
| 79 | static const struct file_operations udplite_afinfo_seq_fops = { | ||
| 80 | .owner = THIS_MODULE, | ||
| 81 | .open = udp_seq_open, | ||
| 82 | .read = seq_read, | ||
| 83 | .llseek = seq_lseek, | ||
| 84 | .release = seq_release_net | ||
| 85 | }; | ||
| 86 | |||
| 87 | static struct udp_seq_afinfo udplite4_seq_afinfo = { | 74 | static struct udp_seq_afinfo udplite4_seq_afinfo = { |
| 88 | .name = "udplite", | 75 | .name = "udplite", |
| 89 | .family = AF_INET, | 76 | .family = AF_INET, |
| 90 | .udp_table = &udplite_table, | 77 | .udp_table = &udplite_table, |
| 91 | .seq_fops = &udplite_afinfo_seq_fops, | 78 | .seq_fops = { |
| 79 | .owner = THIS_MODULE, | ||
| 80 | }, | ||
| 92 | .seq_ops = { | 81 | .seq_ops = { |
| 93 | .show = udp4_seq_show, | 82 | .show = udp4_seq_show, |
| 94 | }, | 83 | }, |
| @@ -132,11 +121,11 @@ void __init udplite4_register(void) | |||
| 132 | inet_register_protosw(&udplite4_protosw); | 121 | inet_register_protosw(&udplite4_protosw); |
| 133 | 122 | ||
| 134 | if (udplite4_proc_init()) | 123 | if (udplite4_proc_init()) |
| 135 | pr_err("%s: Cannot register /proc!\n", __func__); | 124 | printk(KERN_ERR "%s: Cannot register /proc!\n", __func__); |
| 136 | return; | 125 | return; |
| 137 | 126 | ||
| 138 | out_unregister_proto: | 127 | out_unregister_proto: |
| 139 | proto_unregister(&udplite_prot); | 128 | proto_unregister(&udplite_prot); |
| 140 | out_register_err: | 129 | out_register_err: |
| 141 | pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__); | 130 | printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); |
| 142 | } | 131 | } |
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index e3db3f91511..63418185f52 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c | |||
| @@ -110,7 +110,10 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) | |||
| 110 | 110 | ||
| 111 | skb_push(skb, sizeof(*iph)); | 111 | skb_push(skb, sizeof(*iph)); |
| 112 | skb_reset_network_header(skb); | 112 | skb_reset_network_header(skb); |
| 113 | skb_mac_header_rebuild(skb); | 113 | |
| 114 | memmove(skb->data - skb->mac_len, skb_mac_header(skb), | ||
| 115 | skb->mac_len); | ||
| 116 | skb_set_mac_header(skb, -skb->mac_len); | ||
| 114 | 117 | ||
| 115 | xfrm4_beet_make_header(skb); | 118 | xfrm4_beet_make_header(skb); |
| 116 | 119 | ||
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index ddee0a099a2..534972e114a 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c | |||
| @@ -15,65 +15,6 @@ | |||
| 15 | #include <net/ip.h> | 15 | #include <net/ip.h> |
| 16 | #include <net/xfrm.h> | 16 | #include <net/xfrm.h> |
| 17 | 17 | ||
| 18 | /* Informational hook. The decap is still done here. */ | ||
| 19 | static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly; | ||
| 20 | static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); | ||
| 21 | |||
| 22 | int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler) | ||
| 23 | { | ||
| 24 | struct xfrm_tunnel __rcu **pprev; | ||
| 25 | struct xfrm_tunnel *t; | ||
| 26 | int ret = -EEXIST; | ||
| 27 | int priority = handler->priority; | ||
| 28 | |||
| 29 | mutex_lock(&xfrm4_mode_tunnel_input_mutex); | ||
| 30 | |||
| 31 | for (pprev = &rcv_notify_handlers; | ||
| 32 | (t = rcu_dereference_protected(*pprev, | ||
| 33 | lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; | ||
| 34 | pprev = &t->next) { | ||
| 35 | if (t->priority > priority) | ||
| 36 | break; | ||
| 37 | if (t->priority == priority) | ||
| 38 | goto err; | ||
| 39 | |||
| 40 | } | ||
| 41 | |||
| 42 | handler->next = *pprev; | ||
| 43 | rcu_assign_pointer(*pprev, handler); | ||
| 44 | |||
| 45 | ret = 0; | ||
| 46 | |||
| 47 | err: | ||
| 48 | mutex_unlock(&xfrm4_mode_tunnel_input_mutex); | ||
| 49 | return ret; | ||
| 50 | } | ||
| 51 | EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); | ||
| 52 | |||
| 53 | int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler) | ||
| 54 | { | ||
| 55 | struct xfrm_tunnel __rcu **pprev; | ||
| 56 | struct xfrm_tunnel *t; | ||
| 57 | int ret = -ENOENT; | ||
| 58 | |||
| 59 | mutex_lock(&xfrm4_mode_tunnel_input_mutex); | ||
| 60 | for (pprev = &rcv_notify_handlers; | ||
| 61 | (t = rcu_dereference_protected(*pprev, | ||
| 62 | lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; | ||
| 63 | pprev = &t->next) { | ||
| 64 | if (t == handler) { | ||
| 65 | *pprev = handler->next; | ||
| 66 | ret = 0; | ||
| 67 | break; | ||
| 68 | } | ||
| 69 | } | ||
| 70 | mutex_unlock(&xfrm4_mode_tunnel_input_mutex); | ||
| 71 | synchronize_net(); | ||
| 72 | |||
| 73 | return ret; | ||
| 74 | } | ||
| 75 | EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister); | ||
| 76 | |||
| 77 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) | 18 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) |
| 78 | { | 19 | { |
| 79 | struct iphdr *inner_iph = ipip_hdr(skb); | 20 | struct iphdr *inner_iph = ipip_hdr(skb); |
| @@ -123,14 +64,9 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) | |||
| 123 | return 0; | 64 | return 0; |
| 124 | } | 65 | } |
| 125 | 66 | ||
| 126 | #define for_each_input_rcu(head, handler) \ | ||
| 127 | for (handler = rcu_dereference(head); \ | ||
| 128 | handler != NULL; \ | ||
| 129 | handler = rcu_dereference(handler->next)) | ||
| 130 | |||
| 131 | static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | 67 | static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) |
| 132 | { | 68 | { |
| 133 | struct xfrm_tunnel *handler; | 69 | const unsigned char *old_mac; |
| 134 | int err = -EINVAL; | 70 | int err = -EINVAL; |
| 135 | 71 | ||
| 136 | if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) | 72 | if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) |
| @@ -139,9 +75,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | |||
| 139 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | 75 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) |
| 140 | goto out; | 76 | goto out; |
| 141 | 77 | ||
| 142 | for_each_input_rcu(rcv_notify_handlers, handler) | ||
| 143 | handler->handler(skb); | ||
| 144 | |||
| 145 | if (skb_cloned(skb) && | 78 | if (skb_cloned(skb) && |
| 146 | (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) | 79 | (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) |
| 147 | goto out; | 80 | goto out; |
| @@ -151,9 +84,10 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | |||
| 151 | if (!(x->props.flags & XFRM_STATE_NOECN)) | 84 | if (!(x->props.flags & XFRM_STATE_NOECN)) |
| 152 | ipip_ecn_decapsulate(skb); | 85 | ipip_ecn_decapsulate(skb); |
| 153 | 86 | ||
| 87 | old_mac = skb_mac_header(skb); | ||
| 88 | skb_set_mac_header(skb, -skb->mac_len); | ||
| 89 | memmove(skb_mac_header(skb), old_mac, skb->mac_len); | ||
| 154 | skb_reset_network_header(skb); | 90 | skb_reset_network_header(skb); |
| 155 | skb_mac_header_rebuild(skb); | ||
| 156 | |||
| 157 | err = 0; | 91 | err = 0; |
| 158 | 92 | ||
| 159 | out: | 93 | out: |
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 3be0ac2c192..a0b4c5da8d4 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c | |||
| @@ -79,21 +79,30 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, | |||
| 79 | struct rtable *rt = (struct rtable *)xdst->route; | 79 | struct rtable *rt = (struct rtable *)xdst->route; |
| 80 | const struct flowi4 *fl4 = &fl->u.ip4; | 80 | const struct flowi4 *fl4 = &fl->u.ip4; |
| 81 | 81 | ||
| 82 | xdst->u.rt.rt_key_dst = fl4->daddr; | ||
| 83 | xdst->u.rt.rt_key_src = fl4->saddr; | ||
| 84 | xdst->u.rt.rt_key_tos = fl4->flowi4_tos; | ||
| 85 | xdst->u.rt.rt_route_iif = fl4->flowi4_iif; | ||
| 82 | xdst->u.rt.rt_iif = fl4->flowi4_iif; | 86 | xdst->u.rt.rt_iif = fl4->flowi4_iif; |
| 87 | xdst->u.rt.rt_oif = fl4->flowi4_oif; | ||
| 88 | xdst->u.rt.rt_mark = fl4->flowi4_mark; | ||
| 83 | 89 | ||
| 84 | xdst->u.dst.dev = dev; | 90 | xdst->u.dst.dev = dev; |
| 85 | dev_hold(dev); | 91 | dev_hold(dev); |
| 86 | 92 | ||
| 93 | xdst->u.rt.peer = rt->peer; | ||
| 94 | if (rt->peer) | ||
| 95 | atomic_inc(&rt->peer->refcnt); | ||
| 96 | |||
| 87 | /* Sheit... I remember I did this right. Apparently, | 97 | /* Sheit... I remember I did this right. Apparently, |
| 88 | * it was magically lost, so this code needs audit */ | 98 | * it was magically lost, so this code needs audit */ |
| 89 | xdst->u.rt.rt_is_input = rt->rt_is_input; | ||
| 90 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | | 99 | xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | |
| 91 | RTCF_LOCAL); | 100 | RTCF_LOCAL); |
| 92 | xdst->u.rt.rt_type = rt->rt_type; | 101 | xdst->u.rt.rt_type = rt->rt_type; |
| 102 | xdst->u.rt.rt_src = rt->rt_src; | ||
| 103 | xdst->u.rt.rt_dst = rt->rt_dst; | ||
| 93 | xdst->u.rt.rt_gateway = rt->rt_gateway; | 104 | xdst->u.rt.rt_gateway = rt->rt_gateway; |
| 94 | xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; | 105 | xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; |
| 95 | xdst->u.rt.rt_pmtu = rt->rt_pmtu; | ||
| 96 | INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); | ||
| 97 | 106 | ||
| 98 | return 0; | 107 | return 0; |
| 99 | } | 108 | } |
| @@ -143,7 +152,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) | |||
| 143 | 152 | ||
| 144 | case IPPROTO_AH: | 153 | case IPPROTO_AH: |
| 145 | if (pskb_may_pull(skb, xprth + 8 - skb->data)) { | 154 | if (pskb_may_pull(skb, xprth + 8 - skb->data)) { |
| 146 | __be32 *ah_hdr = (__be32 *)xprth; | 155 | __be32 *ah_hdr = (__be32*)xprth; |
| 147 | 156 | ||
| 148 | fl4->fl4_ipsec_spi = ah_hdr[1]; | 157 | fl4->fl4_ipsec_spi = ah_hdr[1]; |
| 149 | } | 158 | } |
| @@ -189,22 +198,12 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) | |||
| 189 | return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); | 198 | return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); |
| 190 | } | 199 | } |
| 191 | 200 | ||
| 192 | static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, | 201 | static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) |
| 193 | struct sk_buff *skb, u32 mtu) | ||
| 194 | { | ||
| 195 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | ||
| 196 | struct dst_entry *path = xdst->route; | ||
| 197 | |||
| 198 | path->ops->update_pmtu(path, sk, skb, mtu); | ||
| 199 | } | ||
| 200 | |||
| 201 | static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, | ||
| 202 | struct sk_buff *skb) | ||
| 203 | { | 202 | { |
| 204 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; | 203 | struct xfrm_dst *xdst = (struct xfrm_dst *)dst; |
| 205 | struct dst_entry *path = xdst->route; | 204 | struct dst_entry *path = xdst->route; |
| 206 | 205 | ||
| 207 | path->ops->redirect(path, sk, skb); | 206 | path->ops->update_pmtu(path, mtu); |
| 208 | } | 207 | } |
| 209 | 208 | ||
| 210 | static void xfrm4_dst_destroy(struct dst_entry *dst) | 209 | static void xfrm4_dst_destroy(struct dst_entry *dst) |
| @@ -213,6 +212,9 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) | |||
| 213 | 212 | ||
| 214 | dst_destroy_metrics_generic(dst); | 213 | dst_destroy_metrics_generic(dst); |
| 215 | 214 | ||
| 215 | if (likely(xdst->u.rt.peer)) | ||
| 216 | inet_putpeer(xdst->u.rt.peer); | ||
| 217 | |||
| 216 | xfrm_dst_destroy(xdst); | 218 | xfrm_dst_destroy(xdst); |
| 217 | } | 219 | } |
| 218 | 220 | ||
| @@ -230,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = { | |||
| 230 | .protocol = cpu_to_be16(ETH_P_IP), | 232 | .protocol = cpu_to_be16(ETH_P_IP), |
| 231 | .gc = xfrm4_garbage_collect, | 233 | .gc = xfrm4_garbage_collect, |
| 232 | .update_pmtu = xfrm4_update_pmtu, | 234 | .update_pmtu = xfrm4_update_pmtu, |
| 233 | .redirect = xfrm4_redirect, | ||
| 234 | .cow_metrics = dst_cow_metrics_generic, | 235 | .cow_metrics = dst_cow_metrics_generic, |
| 235 | .destroy = xfrm4_dst_destroy, | 236 | .destroy = xfrm4_dst_destroy, |
| 236 | .ifdown = xfrm4_dst_ifdown, | 237 | .ifdown = xfrm4_dst_ifdown, |
| @@ -279,15 +280,26 @@ static void __exit xfrm4_policy_fini(void) | |||
| 279 | xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); | 280 | xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); |
| 280 | } | 281 | } |
| 281 | 282 | ||
| 282 | void __init xfrm4_init(void) | 283 | void __init xfrm4_init(int rt_max_size) |
| 283 | { | 284 | { |
| 285 | /* | ||
| 286 | * Select a default value for the gc_thresh based on the main route | ||
| 287 | * table hash size. It seems to me the worst case scenario is when | ||
| 288 | * we have ipsec operating in transport mode, in which we create a | ||
| 289 | * dst_entry per socket. The xfrm gc algorithm starts trying to remove | ||
| 290 | * entries at gc_thresh, and prevents new allocations as 2*gc_thresh | ||
| 291 | * so lets set an initial xfrm gc_thresh value at the rt_max_size/2. | ||
| 292 | * That will let us store an ipsec connection per route table entry, | ||
| 293 | * and start cleaning when were 1/2 full | ||
| 294 | */ | ||
| 295 | xfrm4_dst_ops.gc_thresh = rt_max_size/2; | ||
| 284 | dst_entries_init(&xfrm4_dst_ops); | 296 | dst_entries_init(&xfrm4_dst_ops); |
| 285 | 297 | ||
| 286 | xfrm4_state_init(); | 298 | xfrm4_state_init(); |
| 287 | xfrm4_policy_init(); | 299 | xfrm4_policy_init(); |
| 288 | #ifdef CONFIG_SYSCTL | 300 | #ifdef CONFIG_SYSCTL |
| 289 | sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4", | 301 | sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, |
| 290 | xfrm4_policy_table); | 302 | xfrm4_policy_table); |
| 291 | #endif | 303 | #endif |
| 292 | } | 304 | } |
| 293 | 305 | ||
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 9258e751bab..d9ac0a0058b 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c | |||
| @@ -12,7 +12,6 @@ | |||
| 12 | #include <linux/pfkeyv2.h> | 12 | #include <linux/pfkeyv2.h> |
| 13 | #include <linux/ipsec.h> | 13 | #include <linux/ipsec.h> |
| 14 | #include <linux/netfilter_ipv4.h> | 14 | #include <linux/netfilter_ipv4.h> |
| 15 | #include <linux/export.h> | ||
| 16 | 15 | ||
| 17 | static int xfrm4_init_flags(struct xfrm_state *x) | 16 | static int xfrm4_init_flags(struct xfrm_state *x) |
| 18 | { | 17 | { |
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 05a5df2febc..82806455e85 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c | |||
| @@ -3,8 +3,6 @@ | |||
| 3 | * Copyright (C) 2003 David S. Miller (davem@redhat.com) | 3 | * Copyright (C) 2003 David S. Miller (davem@redhat.com) |
| 4 | */ | 4 | */ |
| 5 | 5 | ||
| 6 | #define pr_fmt(fmt) "IPsec: " fmt | ||
| 7 | |||
| 8 | #include <linux/skbuff.h> | 6 | #include <linux/skbuff.h> |
| 9 | #include <linux/module.h> | 7 | #include <linux/module.h> |
| 10 | #include <linux/mutex.h> | 8 | #include <linux/mutex.h> |
| @@ -66,7 +64,7 @@ static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { | |||
| 66 | .priority = 2, | 64 | .priority = 2, |
| 67 | }; | 65 | }; |
| 68 | 66 | ||
| 69 | #if IS_ENABLED(CONFIG_IPV6) | 67 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 70 | static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { | 68 | static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { |
| 71 | .handler = xfrm_tunnel_rcv, | 69 | .handler = xfrm_tunnel_rcv, |
| 72 | .err_handler = xfrm_tunnel_err, | 70 | .err_handler = xfrm_tunnel_err, |
| @@ -77,18 +75,18 @@ static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { | |||
| 77 | static int __init ipip_init(void) | 75 | static int __init ipip_init(void) |
| 78 | { | 76 | { |
| 79 | if (xfrm_register_type(&ipip_type, AF_INET) < 0) { | 77 | if (xfrm_register_type(&ipip_type, AF_INET) < 0) { |
| 80 | pr_info("%s: can't add xfrm type\n", __func__); | 78 | printk(KERN_INFO "ipip init: can't add xfrm type\n"); |
| 81 | return -EAGAIN; | 79 | return -EAGAIN; |
| 82 | } | 80 | } |
| 83 | 81 | ||
| 84 | if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) { | 82 | if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) { |
| 85 | pr_info("%s: can't add xfrm handler for AF_INET\n", __func__); | 83 | printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET\n"); |
| 86 | xfrm_unregister_type(&ipip_type, AF_INET); | 84 | xfrm_unregister_type(&ipip_type, AF_INET); |
| 87 | return -EAGAIN; | 85 | return -EAGAIN; |
| 88 | } | 86 | } |
| 89 | #if IS_ENABLED(CONFIG_IPV6) | 87 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 90 | if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { | 88 | if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { |
| 91 | pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__); | 89 | printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n"); |
| 92 | xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET); | 90 | xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET); |
| 93 | xfrm_unregister_type(&ipip_type, AF_INET); | 91 | xfrm_unregister_type(&ipip_type, AF_INET); |
| 94 | return -EAGAIN; | 92 | return -EAGAIN; |
| @@ -99,16 +97,14 @@ static int __init ipip_init(void) | |||
| 99 | 97 | ||
| 100 | static void __exit ipip_fini(void) | 98 | static void __exit ipip_fini(void) |
| 101 | { | 99 | { |
| 102 | #if IS_ENABLED(CONFIG_IPV6) | 100 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) |
| 103 | if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) | 101 | if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) |
| 104 | pr_info("%s: can't remove xfrm handler for AF_INET6\n", | 102 | printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n"); |
| 105 | __func__); | ||
| 106 | #endif | 103 | #endif |
| 107 | if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) | 104 | if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) |
| 108 | pr_info("%s: can't remove xfrm handler for AF_INET\n", | 105 | printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET\n"); |
| 109 | __func__); | ||
| 110 | if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) | 106 | if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) |
| 111 | pr_info("%s: can't remove xfrm type\n", __func__); | 107 | printk(KERN_INFO "ipip close: can't remove xfrm type\n"); |
| 112 | } | 108 | } |
| 113 | 109 | ||
| 114 | module_init(ipip_init); | 110 | module_init(ipip_init); |
