From e0f802fbcaa3bffe4728e37a8fa1279b5d554173 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Tue, 17 Jun 2014 11:25:37 +0300 Subject: tcp: move ir_mark initialization to tcp_openreq_init ir_mark initialization is done for both TCP v4 and v6, move it in the common tcp_openreq_init function. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 229239ad96b1..08ae3da0db4a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1025,7 +1025,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); + tcp_openreq_init(req, &tmp_opt, skb, sk); ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; @@ -1034,7 +1034,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) TCP_ECN_create_request(req, skb, sock_net(sk)); ireq->ir_iif = sk->sk_bound_dev_if; - ireq->ir_mark = inet_request_mark(sk, skb); /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && -- cgit v1.2.2 From e59d82fd33f7670cf67fd69cf684aa589ec8340a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 9 May 2014 23:43:41 +0200 Subject: vti6: Simplify error handling in module init and exit The error handling in the module init and exit functions can be shortened to safe us some code. 1/ Remove the code duplications in the init function, jump straight to the existing cleanup code by adding some labels. Also give the error message some more value by telling the reason why loading the module has failed. 2/ Remove the error handling in the exit function as the only legitimate reason xfrm6_protocol_deregister() might fail is inet6_del_protocol() returning -1. That, in turn, means some other protocol handler had been registered for this very protocol in the meantime. But that essentially means we haven't been handling that protocol any more, anyway. What it definitely means not is that we "can't deregister protocol". Therefore just get rid of that bogus warning. It's plain wrong. Signed-off-by: Mathias Krause Signed-off-by: Steffen Klassert --- net/ipv6/ip6_vti.c | 51 ++++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 31 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 9aaa6bb229e4..b61b0b1bb7ce 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1089,36 +1089,26 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { **/ static int __init vti6_tunnel_init(void) { - int err; + const char *msg; + int err; + msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) - goto out_pernet; + goto pernet_dev_failed; + msg = "tunnel protocols"; err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); - if (err < 0) { - pr_err("%s: can't register vti6 protocol\n", __func__); - - goto out; - } - + if (err < 0) + goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); - if (err < 0) { - xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); - pr_err("%s: can't register vti6 protocol\n", __func__); - - goto out; - } - + if (err < 0) + goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); - if (err < 0) { - xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); - xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); - pr_err("%s: can't register vti6 protocol\n", __func__); - - goto out; - } + if (err < 0) + goto xfrm_proto_comp_failed; + msg = "netlink interface"; err = rtnl_link_register(&vti6_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1127,11 +1117,14 @@ static int __init vti6_tunnel_init(void) rtnl_link_failed: xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); +xfrm_proto_comp_failed: xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); -out: +xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); -out_pernet: +pernet_dev_failed: + pr_err("vti6 init: failed to register %s\n", msg); return err; } @@ -1141,13 +1134,9 @@ out_pernet: static void __exit vti6_tunnel_cleanup(void) { rtnl_link_unregister(&vti6_link_ops); - if (xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP)) - pr_info("%s: can't deregister protocol\n", __func__); - if (xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH)) - pr_info("%s: can't deregister protocol\n", __func__); - if (xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP)) - pr_info("%s: can't deregister protocol\n", __func__); - + xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); } -- cgit v1.2.2 From 83e96d443b372611adf19e4171d41deb1d8760cf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 19 Jun 2014 20:47:14 +0200 Subject: netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files The plain text logging is currently embedded into the xt_LOG target. In order to be able to use the plain text logging from nft_log, as a first step, this patch moves the family specific code to the following files and Kconfig symbols: 1) net/ipv4/netfilter/nf_log_ip.c: CONFIG_NF_LOG_IPV4 2) net/ipv6/netfilter/nf_log_ip6.c: CONFIG_NF_LOG_IPV6 3) net/netfilter/nf_log_common.c: CONFIG_NF_LOG_COMMON These new modules will be required by xt_LOG and nft_log. This patch is based on original patch from Arturo Borrero Gonzalez. Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/Kconfig | 5 + net/ipv6/netfilter/Makefile | 3 + net/ipv6/netfilter/nf_log_ipv6.c | 417 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 425 insertions(+) create mode 100644 net/ipv6/netfilter/nf_log_ipv6.c (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 4bff1f297e39..1537130e9f5b 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -227,6 +227,11 @@ config IP6_NF_SECURITY If unsure, say N. +config NF_LOG_IPV6 + tristate "IPv6 packet logging" + depends on NETFILTER_ADVANCED + select NF_LOG_COMMON + config NF_NAT_IPV6 tristate "IPv6 NAT" depends on NF_CONNTRACK_IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 70d3dd66f2cd..c0b263104ed2 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -23,6 +23,9 @@ obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o +# logging +obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o + # nf_tables obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c new file mode 100644 index 000000000000..804060946d2b --- /dev/null +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -0,0 +1,417 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* One level of recursion won't kill us */ +static void dump_ipv6_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) " */ + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + nf_log_buf_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + nf_log_buf_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + nf_log_buf_add(m, "INCOMPLETE "); + + nf_log_buf_add(m, "ID:%08x ", + ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & XT_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + nf_log_buf_add(m, "AH "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (logflags & XT_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + nf_log_buf_add(m, "ESP "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + nf_log_buf_add(m, "SPI=0x%x )", + ntohl(eh->spi)); + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment, + ptr, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr)) + return; + break; + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + nf_log_buf_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", + ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + nf_log_buf_add(m, "POINTER=%08x ", + ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + nf_log_buf_add(m, "["); + dump_ipv6_packet(m, info, skb, + ptr + sizeof(_icmp6h), 0); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) { + nf_log_buf_add(m, "MTU=%u ", + ntohl(ic->icmp6_mtu)); + } + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && recurse) + nf_log_dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (recurse && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_ipv6_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int len = dev->hard_header_len; + unsigned int i; + + if (dev->type == ARPHRD_SIT) { + p -= ETH_HLEN; + + if (p < skb->head) + p = NULL; + } + + if (p != NULL) { + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < len; i++) + nf_log_buf_add(m, ":%02x", *p++); + } + nf_log_buf_add(m, " "); + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); + nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, + &iph->daddr); + } + } else { + nf_log_buf_add(m, " "); + } +} + +void nf_log_ip6_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, + loginfo, prefix); + + if (in != NULL) + dump_ipv6_mac_header(m, loginfo, skb); + + dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + + nf_log_buf_close(m); +} +EXPORT_SYMBOL_GPL(nf_log_ip6_packet); + +static struct nf_logger nf_ip6_logger __read_mostly = { + .name = "nf_log_ipv6", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip6_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_ipv6_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); + return 0; +} + +static void __net_exit nf_log_ipv6_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_ip6_logger); +} + +static struct pernet_operations nf_log_ipv6_net_ops = { + .init = nf_log_ipv6_net_init, + .exit = nf_log_ipv6_net_exit, +}; + +static int __init nf_log_ipv6_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_ipv6_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); + return 0; +} + +static void __exit nf_log_ipv6_exit(void) +{ + unregister_pernet_subsys(&nf_log_ipv6_net_ops); + nf_log_unregister(&nf_ip6_logger); +} + +module_init(nf_log_ipv6_init); +module_exit(nf_log_ipv6_exit); + +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); +MODULE_LICENSE("GPL"); -- cgit v1.2.2 From fab4085f4e248b8a80bb1dadbbacb2bacd8017c3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jun 2014 19:38:25 +0200 Subject: netfilter: log: nf_log_packet() as real unified interface Before this patch, the nf_loginfo parameter specified the logging configuration in case the specified default logger was loaded. This patch updates the semantics of the nf_loginfo parameter in nf_log_packet() which now indicates the logger that you explicitly want to use. Thus, nf_log_packet() is exposed as an unified interface which internally routes the log message to the corresponding logger type by family. The module dependencies are expressed by the new nf_logger_find_get() and nf_logger_put() functions which bump the logger module refcount. Thus, you can not remove logger modules that are used by rules anymore. Another important effect of this change is that the family specific module is only loaded when required. Therefore, xt_LOG and nft_log will just trigger the autoload of the nf_log_{ip,ip6} modules according to the family. Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_log_ipv6.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 804060946d2b..7b17a0be93e7 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -338,12 +338,12 @@ fallback: } } -void nf_log_ip6_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) +static void nf_log_ip6_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) { struct nf_log_buf *m; @@ -366,7 +366,6 @@ void nf_log_ip6_packet(struct net *net, u_int8_t pf, nf_log_buf_close(m); } -EXPORT_SYMBOL_GPL(nf_log_ip6_packet); static struct nf_logger nf_ip6_logger __read_mostly = { .name = "nf_log_ipv6", @@ -415,3 +414,4 @@ module_exit(nf_log_ipv6_exit); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(AF_INET6, 0); -- cgit v1.2.2 From aa27fc501850030fb5d1ee705feb836ee6a21f2a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:51 +0300 Subject: tcp: tcp_v[46]_conn_request: fix snt_synack initialization Commit 016818d07 (tcp: TCP Fast Open Server - take SYNACK RTT after completing 3WHS) changes the code to only take a snt_synack timestamp when a SYNACK transmit or retransmit succeeds. This behaviour is later broken by commit 843f4a55e (tcp: use tcp_v4_send_synack on first SYN-ACK), as snt_synack is now updated even if tcp_v4_send_synack fails. Also, commit 3a19ce0ee (tcp: IPv6 support for fastopen server) misses the required IPv6 updates for 016818d07. This patch makes sure that snt_synack is updated only when the SYNACK trasnmit/retransmit succeeds, for both IPv4 and IPv6. Cc: Cardwell Cc: Daniel Lee Cc: Yuchung Cheng Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 08ae3da0db4a..a962455471ba 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -497,6 +497,8 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, skb_set_queue_mapping(skb, queue_mapping); err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); err = net_xmit_eval(err); + if (!tcp_rsk(req)->snt_synack && !err) + tcp_rsk(req)->snt_synack = tcp_time_stamp; } done: @@ -1100,7 +1102,6 @@ have_isn: goto drop_and_free; tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; tcp_openreq_init_rwin(req, sk, dst); fastopen = !want_cookie && tcp_try_fastopen(sk, skb, req, &foc, dst); -- cgit v1.2.2 From 476eab8251641ea2ae4666ca8a1436ebc2b8e9c3 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:52 +0300 Subject: net: remove inet6_reqsk_alloc Since pktops is only used for IPv6 only and opts is used for IPv4 only, we can move these fields into a union and this allows us to drop the inet6_reqsk_alloc function as after this change it becomes equivalent with inet_reqsk_alloc. This patch also fixes a kmemcheck issue in the IPv6 stack: the flags field was not annotated after a request_sock was allocated. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index a822b880689b..83cea1d39466 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -187,7 +187,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + req = inet_reqsk_alloc(&tcp6_request_sock_ops); if (!req) goto out; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a962455471ba..5e2d7e655c0f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1010,7 +1010,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop; } - req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + req = inet_reqsk_alloc(&tcp6_request_sock_ops); if (req == NULL) goto drop; -- cgit v1.2.2 From 16bea70aa7302b6f3bf3502d5a0efb4ea2ce4712 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:53 +0300 Subject: tcp: add init_req method to tcp_request_sock_ops Move the specific IPv4/IPv6 intializations to a new method in tcp_request_sock_ops in preparation for unifying tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 55 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 23 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5e2d7e655c0f..87a360c3eba9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -720,6 +720,31 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) } #endif +static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, + struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + + ireq->ir_iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq->ir_iif = inet6_iif(skb); + + if (!TCP_SKB_CB(skb)->when && + (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || + np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || + np->rxopt.bits.rxohlim || np->repflow)) { + atomic_inc(&skb->users); + ireq->pktopts = skb; + } +} + struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), @@ -730,12 +755,13 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -#ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_reqsk_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, -}; #endif + .init_req = tcp_v6_init_req, +}; static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, @@ -983,13 +1009,13 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) struct tcp_options_received tmp_opt; struct request_sock *req; struct inet_request_sock *ireq; - struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 isn = TCP_SKB_CB(skb)->when; struct dst_entry *dst = NULL; struct tcp_fastopen_cookie foc = { .len = -1 }; bool want_cookie = false, fastopen; struct flowi6 fl6; + const struct tcp_request_sock_ops *af_ops; int err; if (skb->protocol == htons(ETH_P_IP)) @@ -1014,9 +1040,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; -#ifdef CONFIG_TCP_MD5SIG - tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; -#endif + af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -1030,27 +1054,12 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_openreq_init(req, &tmp_opt, skb, sk); ireq = inet_rsk(req); - ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; - ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + af_ops->init_req(req, sk, skb); + if (!want_cookie || tmp_opt.tstamp_ok) TCP_ECN_create_request(req, skb, sock_net(sk)); - ireq->ir_iif = sk->sk_bound_dev_if; - - /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && - ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); - if (!isn) { - if (ipv6_opt_accepted(sk, skb) || - np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || - np->repflow) { - atomic_inc(&skb->users); - ireq->pktopts = skb; - } - if (want_cookie) { isn = cookie_v6_init_sequence(sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; -- cgit v1.2.2 From fb7b37a7f3d6f7b7ba05ee526fee96810d5b92a8 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:54 +0300 Subject: tcp: add init_cookie_seq method to tcp_request_sock_ops Move the specific IPv4/IPv6 cookie sequence initialization to a new method in tcp_request_sock_ops in preparation for unifying tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 87a360c3eba9..17710cffddaa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -761,6 +761,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .calc_md5_hash = tcp_v6_md5_hash_skb, #endif .init_req = tcp_v6_init_req, +#ifdef CONFIG_SYN_COOKIES + .cookie_init_seq = cookie_v6_init_sequence, +#endif }; static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, @@ -1061,7 +1064,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!isn) { if (want_cookie) { - isn = cookie_v6_init_sequence(sk, skb, &req->mss); + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; goto have_isn; } -- cgit v1.2.2 From d94e0417ad8d96d7d96b69335338ad942eaeecf1 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:55 +0300 Subject: tcp: add route_req method to tcp_request_sock_ops Create wrappers with same signature for the IPv4/IPv6 request routing calls and use these wrappers (via route_req method from tcp_request_sock_ops) in tcp_v4_conn_request and tcp_v6_conn_request with the purpose of unifying the two functions in a later patch. We can later drop the wrapper functions and modify inet_csk_route_req and inet6_cks_route_req to use the same signature. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 17710cffddaa..d780d8808566 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -745,6 +745,16 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, } } +static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, + const struct request_sock *req, + bool *strict) +{ + if (strict) + *strict = true; + return inet6_csk_route_req(sk, &fl->u.ip6, req); +} + + struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), @@ -764,6 +774,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif + .route_req = tcp_v6_route_req, }; static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, @@ -1078,10 +1089,10 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ - if (tmp_opt.saw_tstamp && - tcp_death_row.sysctl_tw_recycle && - (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { - if (!tcp_peer_is_proven(req, dst, true)) { + if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { + dst = af_ops->route_req(sk, (struct flowi *)&fl6, req, + NULL); + if (dst && !tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -1110,8 +1121,11 @@ have_isn: if (security_inet_conn_request(sk, skb, req)) goto drop_and_release; - if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) - goto drop_and_free; + if (!dst) { + dst = af_ops->route_req(sk, (struct flowi *)&fl6, req, NULL); + if (!dst) + goto drop_and_free; + } tcp_rsk(req)->snt_isn = isn; tcp_openreq_init_rwin(req, sk, dst); -- cgit v1.2.2 From 9403715977075c89b1dbcdd7713ab542807a04ac Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:56 +0300 Subject: tcp: move around a few calls in tcp_v6_conn_request Make the tcp_v6_conn_request calls flow similar with that of tcp_v4_conn_request. Note that want_cookie can be true only if isn is zero and that is why we can move the if (want_cookie) block out of the if (!isn) block. Moving security_inet_conn_request() has a couple of side effects: missing inet_rsk(req)->ecn_ok update and the req->cookie_ts update. However, neither SELinux nor Smack security hooks seems to check them. This change should also avoid future different behaviour for IPv4 and IPv6 in the security hooks. Signed-off-by: Octavian Purdila Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d780d8808566..91b8a2e699f3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1070,16 +1070,16 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); af_ops->init_req(req, sk, skb); + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_release; + if (!want_cookie || tmp_opt.tstamp_ok) TCP_ECN_create_request(req, skb, sock_net(sk)); - if (!isn) { - if (want_cookie) { - isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - goto have_isn; - } - + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + } else if (!isn) { /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before @@ -1116,10 +1116,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) isn = tcp_v6_init_sequence(skb); } -have_isn: - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_release; if (!dst) { dst = af_ops->route_req(sk, (struct flowi *)&fl6, req, NULL); -- cgit v1.2.2 From 936b8bdb53f90840e658904530f9db8d02ac804b Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:57 +0300 Subject: tcp: add init_seq method to tcp_request_sock_ops More work in preparation of unifying tcp_v4_conn_request and tcp_v6_conn_request: indirect the init sequence calls via the tcp_request_sock_ops. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 91b8a2e699f3..2fd886fe8340 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -775,6 +775,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, + .init_seq = tcp_v6_init_sequence, }; static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, @@ -1114,7 +1115,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_release; } - isn = tcp_v6_init_sequence(skb); + isn = af_ops->init_seq(skb); } if (!dst) { -- cgit v1.2.2 From d6274bd8d6ea84b7b54cc1c3fde6bcb6143b104f Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:58 +0300 Subject: tcp: add send_synack method to tcp_request_sock_ops Create a new tcp_request_sock_ops method to unify the IPv4/IPv6 signature for tcp_v[46]_send_synack. This allows us to later unify tcp_v4_rtx_synack with tcp_v6_rtx_synack and tcp_v4_conn_request with tcp_v4_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2fd886fe8340..210b6105afed 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -470,13 +470,14 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, - struct flowi6 *fl6, + struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; @@ -507,10 +508,11 @@ done: static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) { - struct flowi6 fl6; + const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; + struct flowi fl; int res; - res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); + res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); @@ -754,7 +756,6 @@ static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, return inet6_csk_route_req(sk, &fl->u.ip6, req); } - struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), @@ -776,6 +777,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { #endif .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_sequence, + .send_synack = tcp_v6_send_synack, }; static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, @@ -1128,8 +1130,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_openreq_init_rwin(req, sk, dst); fastopen = !want_cookie && tcp_try_fastopen(sk, skb, req, &foc, dst); - err = tcp_v6_send_synack(sk, dst, &fl6, req, - skb_get_queue_mapping(skb), &foc); + err = af_ops->send_synack(sk, dst, (struct flowi *)&fl6, req, + skb_get_queue_mapping(skb), &foc); if (!fastopen) { if (err || want_cookie) goto drop_and_free; -- cgit v1.2.2 From 5db92c994982ed826cf38f38d58bd09bc326aef6 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:09:59 +0300 Subject: tcp: unify tcp_v4_rtx_synack and tcp_v6_rtx_synack Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 210b6105afed..41389bbb08c0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -506,19 +506,6 @@ done: return err; } -static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) -{ - const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; - struct flowi fl; - int res; - - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); - if (!res) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - } - return res; -} static void tcp_v6_reqsk_destructor(struct request_sock *req) { @@ -759,7 +746,7 @@ static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), - .rtx_syn_ack = tcp_v6_rtx_synack, + .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, -- cgit v1.2.2 From 2aec4a297b21f3690486bbf8f7d5d29281ba6a48 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:10:00 +0300 Subject: tcp: add mss_clamp to tcp_request_sock_ops Add mss_clamp member to tcp_request_sock_ops so that we can later unify tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 41389bbb08c0..ad658332cf7d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -754,6 +754,8 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { }; static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { + .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_reqsk_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, @@ -1047,7 +1049,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + tmp_opt.mss_clamp = af_ops->mss_clamp; tmp_opt.user_mss = tp->rx_opt.user_mss; tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); -- cgit v1.2.2 From 695da14eb0af21129187ed3810e329b21262e45f Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:10:01 +0300 Subject: tcp: add queue_add_hash to tcp_request_sock_ops Add queue_add_hash member to tcp_request_sock_ops so that we can later unify tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ad658332cf7d..8232bc7423c6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -767,6 +767,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_sequence, .send_synack = tcp_v6_send_synack, + .queue_hash_add = inet6_csk_reqsk_queue_hash_add, }; static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, @@ -1126,7 +1127,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; tcp_rsk(req)->listener = NULL; - inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } return 0; -- cgit v1.2.2 From 1fb6f159fd21c640a28eb65fbd62ce8c9f6a777e Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Jun 2014 17:10:02 +0300 Subject: tcp: add tcp_conn_request Create tcp_conn_request and remove most of the code from tcp_v4_conn_request and tcp_v6_conn_request. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 120 +--------------------------------------------------- 1 file changed, 2 insertions(+), 118 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8232bc7423c6..bc24ee21339a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1008,133 +1008,17 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -/* FIXME: this is substantially similar to the ipv4 code. - * Can some kind of merge be done? -- erics - */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_options_received tmp_opt; - struct request_sock *req; - struct inet_request_sock *ireq; - struct tcp_sock *tp = tcp_sk(sk); - __u32 isn = TCP_SKB_CB(skb)->when; - struct dst_entry *dst = NULL; - struct tcp_fastopen_cookie foc = { .len = -1 }; - bool want_cookie = false, fastopen; - struct flowi6 fl6; - const struct tcp_request_sock_ops *af_ops; - int err; - if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; - if ((sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); - if (!want_cookie) - goto drop; - } + return tcp_conn_request(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, skb); - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); - goto drop; - } - - req = inet_reqsk_alloc(&tcp6_request_sock_ops); - if (req == NULL) - goto drop; - - af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; - - tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = af_ops->mss_clamp; - tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); - - if (want_cookie && !tmp_opt.saw_tstamp) - tcp_clear_options(&tmp_opt); - - tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb, sk); - - ireq = inet_rsk(req); - af_ops->init_req(req, sk, skb); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_release; - - if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); - - if (want_cookie) { - isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { - dst = af_ops->route_req(sk, (struct flowi *)&fl6, req, - NULL); - if (dst && !tcp_peer_is_proven(req, dst, true)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } - /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false)) { - /* Without syncookies last quarter of - * backlog is filled with destinations, - * proven to be alive. - * It means that we continue to communicate - * to destinations, already remembered - * to the moment of synflood. - */ - LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", - &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source)); - goto drop_and_release; - } - - isn = af_ops->init_seq(skb); - } - - if (!dst) { - dst = af_ops->route_req(sk, (struct flowi *)&fl6, req, NULL); - if (!dst) - goto drop_and_free; - } - - tcp_rsk(req)->snt_isn = isn; - tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(sk, dst, (struct flowi *)&fl6, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - - tcp_rsk(req)->listener = NULL; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } - return 0; - -drop_and_release: - dst_release(dst); -drop_and_free: - reqsk_free(req); drop: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; /* don't send reset */ -- cgit v1.2.2 From c1878869c0c8e0def3df5397155f369442ce4e06 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 28 Jun 2014 18:39:01 +0200 Subject: netfilter: fix several Kconfig problems in NF_LOG_* warning: (NETFILTER_XT_TARGET_LOG) selects NF_LOG_IPV6 which has unmet direct dependencies (NET && INET && IPV6 && NETFILTER && IP6_NF_IPTABLES && NETFILTER_ADVANCED) warning: (NF_LOG_IPV4 && NF_LOG_IPV6) selects NF_LOG_COMMON which has unmet direct dependencies (NET && INET && NETFILTER && NF_CONNTRACK) Fixes: 83e96d4 ("netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files") Reported-by: Fengguang Wu Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 1537130e9f5b..ac93df16f5af 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -55,6 +55,11 @@ config NFT_REJECT_IPV6 default NFT_REJECT tristate +config NF_LOG_IPV6 + tristate "IPv6 packet logging" + depends on NETFILTER_ADVANCED + select NF_LOG_COMMON + config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" depends on INET && IPV6 @@ -227,11 +232,6 @@ config IP6_NF_SECURITY If unsure, say N. -config NF_LOG_IPV6 - tristate "IPv6 packet logging" - depends on NETFILTER_ADVANCED - select NF_LOG_COMMON - config NF_NAT_IPV6 tristate "IPv6 NAT" depends on NF_CONNTRACK_IPV6 -- cgit v1.2.2 From 24de3d377539e384621c5b8f8f8d8d01852dddc8 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Mon, 30 Jun 2014 09:19:32 +0800 Subject: netfilter: use IS_ENABLED() macro replace: #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) with #if IS_ENABLED(CONFIG_NF_CT_NETLINK) replace: #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) with #if !IS_ENABLED(CONFIG_NF_NAT) replace: #if !defined(CONFIG_NF_CONNTRACK) && !defined(CONFIG_NF_CONNTRACK_MODULE) with #if !IS_ENABLED(CONFIG_NF_CONNTRACK) And add missing: IS_ENABLED(CONFIG_NF_CT_NETLINK) in net/ipv{4,6}/netfilter/nf_nat_l3proto_ipv{4,6}.c Signed-off-by: Duan Jiong Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index abfe75a2e316..fc8e49b2ff3e 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -158,6 +158,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, htons(oldlen), htons(datalen), 1); } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { @@ -175,6 +176,7 @@ static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], return 0; } +#endif static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { .l3proto = NFPROTO_IPV6, @@ -183,7 +185,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { .manip_pkt = nf_nat_ipv6_manip_pkt, .csum_update = nf_nat_ipv6_csum_update, .csum_recalc = nf_nat_ipv6_csum_recalc, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_ipv6_nlattr_to_range, +#endif #ifdef CONFIG_XFRM .decode_session = nf_nat_ipv6_decode_session, #endif -- cgit v1.2.2 From f2a762d8a97032e58c09c5798832e25268e1c111 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Wed, 25 Jun 2014 14:44:52 -0700 Subject: ipv6: Add more debugging around accept-ra logic. This is disabled by default, just like similar debug info already in this module. But, makes it easier to find out why RA is not being accepted when debugging strange behaviour. Signed-off-by: Ben Greear Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 51 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ca8d4ea48a5d..736c11c6d266 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1070,6 +1070,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); + ND_PRINTK(2, info, + "RA: %s, dev: %s\n", + __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return; @@ -1102,13 +1105,21 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - if (!ipv6_accept_ra(in6_dev)) + if (!ipv6_accept_ra(in6_dev)) { + ND_PRINTK(2, info, + "RA: %s, did not accept ra for dev: %s\n", + __func__, skb->dev->name); goto skip_linkparms; + } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ - if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { + ND_PRINTK(2, info, + "RA: %s, nodetype is NODEFAULT, dev: %s\n", + __func__, skb->dev->name); goto skip_linkparms; + } #endif if (in6_dev->if_flags & IF_RS_SENT) { @@ -1130,11 +1141,20 @@ static void ndisc_router_discovery(struct sk_buff *skb) (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); - if (!in6_dev->cnf.accept_ra_defrtr) + if (!in6_dev->cnf.accept_ra_defrtr) { + ND_PRINTK(2, info, + "RA: %s, defrtr is false for dev: %s\n", + __func__, skb->dev->name); goto skip_defrtr; + } - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0)) { + ND_PRINTK(2, info, + "RA: %s, chk_addr failed for dev: %s\n", + __func__, skb->dev->name); goto skip_defrtr; + } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); @@ -1163,8 +1183,10 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = NULL; } + ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n", + rt, lifetime, skb->dev->name); if (rt == NULL && lifetime) { - ND_PRINTK(3, dbg, "RA: adding default router\n"); + ND_PRINTK(3, info, "RA: adding default router\n"); rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); if (rt == NULL) { @@ -1260,12 +1282,21 @@ skip_linkparms: NEIGH_UPDATE_F_ISROUTER); } - if (!ipv6_accept_ra(in6_dev)) + if (!ipv6_accept_ra(in6_dev)) { + ND_PRINTK(2, info, + "RA: %s, accept_ra is false for dev: %s\n", + __func__, skb->dev->name); goto out; + } #ifdef CONFIG_IPV6_ROUTE_INFO - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0)) { + ND_PRINTK(2, info, + "RA: %s, chk-addr (route info) is false for dev: %s\n", + __func__, skb->dev->name); goto skip_routeinfo; + } if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; @@ -1293,8 +1324,12 @@ skip_routeinfo: #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ - if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { + ND_PRINTK(2, info, + "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", + __func__, skb->dev->name); goto out; + } #endif if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { -- cgit v1.2.2 From d93331965729850303f6111381c1a4a9e9b8ae5a Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Wed, 25 Jun 2014 14:44:53 -0700 Subject: ipv6: Allow accepting RA from local IP addresses. This can be used in virtual networking applications, and may have other uses as well. The option is disabled by default. A specific use case is setting up virtual routers, bridges, and hosts on a single OS without the use of network namespaces or virtual machines. With proper use of ip rules, routing tables, veth interface pairs and/or other virtual interfaces, and applications that can bind to interfaces and/or IP addresses, it is possibly to create one or more virtual routers with multiple hosts attached. The host interfaces can act as IPv6 systems, with radvd running on the ports in the virtual routers. With the option provided in this patch enabled, those hosts can now properly obtain IPv6 addresses from the radvd. Signed-off-by: Ben Greear Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 21 +++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5667b3003af9..358edd2272ac 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -186,6 +186,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_from_local = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -222,6 +223,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_from_local = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -4321,6 +4323,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; + array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; } static inline size_t inet6_ifla6_size(void) @@ -5167,6 +5170,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "accept_ra_from_local", + .data = &ipv6_devconf.accept_ra_from_local, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 736c11c6d266..a845e3d2057e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1148,11 +1148,15 @@ static void ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + /* Do not accept RA with source-addr found on local machine unless + * accept_ra_from_local is set to true. + */ + if (!(in6_dev->cnf.accept_ra_from_local || + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0))) { ND_PRINTK(2, info, - "RA: %s, chk_addr failed for dev: %s\n", - __func__, skb->dev->name); + "RA from local address detected on dev: %s: default router ignored\n", + skb->dev->name); goto skip_defrtr; } @@ -1290,11 +1294,12 @@ skip_linkparms: } #ifdef CONFIG_IPV6_ROUTE_INFO - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + if (!(in6_dev->cnf.accept_ra_from_local || + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0))) { ND_PRINTK(2, info, - "RA: %s, chk-addr (route info) is false for dev: %s\n", - __func__, skb->dev->name); + "RA from local address detected on dev: %s: router info ignored.\n", + skb->dev->name); goto skip_routeinfo; } -- cgit v1.2.2 From 9fe516ba3fb29b6f6a752ffd93342fdee500ec01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Jun 2014 08:36:16 -0700 Subject: inet: move ipv6only in sock_common When an UDP application switches from AF_INET to AF_INET6 sockets, we have a small performance degradation for IPv4 communications because of extra cache line misses to access ipv6only information. This can also be noticed for TCP listeners, as ipv6_only_sock() is also used from __inet_lookup_listener()->compute_score() This is magnified when SO_REUSEPORT is used. Move ipv6only into struct sock_common so that it is available at no extra cost in lookups. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 6 +++--- net/ipv6/ipv6_sockglue.c | 4 ++-- net/ipv6/udp.c | 3 +-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7cb4392690dd..a426cd7099bb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -197,7 +197,7 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->ipv6only = net->ipv6.sysctl.bindv6only; + sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. @@ -294,7 +294,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Binding to v4-mapped address on a v6-only socket * makes no sense */ - if (np->ipv6only) { + if (sk->sk_ipv6only) { err = -EINVAL; goto out; } @@ -371,7 +371,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_type != IPV6_ADDR_ANY) { sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (addr_type != IPV6_ADDR_MAPPED) - np->ipv6only = 1; + sk->sk_ipv6only = 1; } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index edb58aff4ae7..cc34f65179e4 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -235,7 +235,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; - np->ipv6only = valbool; + sk->sk_ipv6only = valbool; retv = 0; break; @@ -1058,7 +1058,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } case IPV6_V6ONLY: - val = np->ipv6only; + val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 95c834799288..c2bd28fd43e4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -79,7 +79,6 @@ static unsigned int udp6_ehashfn(struct net *net, int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk_ipv6only = ipv6_only_sock(sk); int sk2_ipv6only = inet_v6_ipv6only(sk2); int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -95,7 +94,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) return 1; if (addr_type == IPV6_ADDR_ANY && - !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) return 1; if (sk2_rcv_saddr6 && -- cgit v1.2.2 From 86c6a2c75ab97fe31844985169e26aea335432f9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 30 Jun 2014 15:09:49 -0400 Subject: tcp: switch snt_synack back to measuring transmit time of first SYNACK Always store in snt_synack the time at which the server received the first client SYN and attempted to send the first SYNACK. Recent commit aa27fc501 ("tcp: tcp_v[46]_conn_request: fix snt_synack initialization") resolved an inconsistency between IPv4 and IPv6 in the initialization of snt_synack. This commit brings back the idea from 843f4a55e (tcp: use tcp_v4_send_synack on first SYN-ACK), which was going for the original behavior of snt_synack from the commit where it was added in 9ad7c049f0f79 ("tcp: RFC2988bis + taking RTT sample from 3WHS for the passive open side") in v3.1. In addition to being simpler (and probably a tiny bit faster), unconditionally storing the time of the first SYNACK attempt has been useful because it allows calculating a performance metric quantifying how long it took to establish a passive TCP connection. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Cc: Octavian Purdila Cc: Jerry Chu Acked-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bc24ee21339a..a97c95585da8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -498,8 +498,6 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, skb_set_queue_mapping(skb, queue_mapping); err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); err = net_xmit_eval(err); - if (!tcp_rsk(req)->snt_synack && !err) - tcp_rsk(req)->snt_synack = tcp_time_stamp; } done: -- cgit v1.2.2 From b73c3d0e4f0e1961e15bec18720e48aabebe2109 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:32:17 -0700 Subject: net: Save TX flow hash in sock and set in skbuf on xmit For a connected socket we can precompute the flow hash for setting in skb->hash on output. This is a performance advantage over calculating the skb->hash for every packet on the connection. The computation is done using the common hash algorithm to be consistent with computations done for packets of the connection in other states where thers is no socket (e.g. time-wait, syn-recv, syn-cookies). This patch adds sk_txhash to the sock structure. inet_set_txhash and ip6_set_txhash functions are added which are called from points in TCP and UDP where socket moves to established state. skb_set_hash_from_sk is a function which sets skb->hash from the sock txhash value. This is called in UDP and TCP transmit path when transmitting within the context of a socket. Tested: ran super_netperf with 200 TCP_RR streams over a vxlan interface (in this case skb_get_hash called on every TX packet to create a UDP source port). Before fix: 95.02% CPU utilization 154/256/505 90/95/99% latencies 1.13042e+06 tps Time in functions: 0.28% skb_flow_dissect 0.21% __skb_get_hash After fix: 94.95% CPU utilization 156/254/485 90/95/99% latencies 1.15447e+06 Neither __skb_get_hash nor skb_flow_dissect appear in perf Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/datagram.c | 1 + net/ipv6/tcp_ipv6.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c3bf2d2e519e..2753319524f1 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -199,6 +199,7 @@ ipv4_connected: NULL); sk->sk_state = TCP_ESTABLISHED; + ip6_set_txhash(sk); out: fl6_sock_release(flowlabel); return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a97c95585da8..22055b098428 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -198,6 +198,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; + ip6_set_txhash(sk); + /* * TCP over IPv4 */ @@ -1132,6 +1134,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; + ip6_set_txhash(newsk); + /* Now IPv6 options... First: no IPv4 options. -- cgit v1.2.2 From cb1ce2ef387b01686469487edd45994872d52d73 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Jul 2014 21:33:10 -0700 Subject: ipv6: Implement automatic flow label generation on transmit Automatically generate flow labels for IPv6 packets on transmit. The flow label is computed based on skb_get_hash. The flow label will only automatically be set when it is zero otherwise (i.e. flow label manager hasn't set one). This supports the transmit side functionality of RFC 6438. Added an IPv6 sysctl auto_flowlabels to enable/disable this behavior system wide, and added IPV6_AUTOFLOWLABEL socket option to enable this functionality per socket. By default, auto flowlabels are disabled to avoid possible conflicts with flow label manager, however if this feature proves useful we may want to enable it by default. It should also be noted that FreeBSD has already implemented automatic flow labels (including the sysctl and socket option). In FreeBSD, automatic flow labels default to enabled. Performance impact: Running super_netperf with 200 flows for TCP_RR and UDP_RR for IPv6. Note that in UDP case, __skb_get_hash will be called for every packet with explains slight regression. In the TCP case the hash is saved in the socket so there is no regression. Automatic flow labels disabled: TCP_RR: 86.53% CPU utilization 127/195/322 90/95/99% latencies 1.40498e+06 tps UDP_RR: 90.70% CPU utilization 118/168/243 90/95/99% latencies 1.50309e+06 tps Automatic flow labels enabled: TCP_RR: 85.90% CPU utilization 128/199/337 90/95/99% latencies 1.40051e+06 UDP_RR 92.61% CPU utilization 115/164/236 90/95/99% latencies 1.4687e+06 Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 1 + net/ipv6/ip6_gre.c | 7 +++++-- net/ipv6/ip6_output.c | 7 +++++-- net/ipv6/ip6_tunnel.c | 3 ++- net/ipv6/ipv6_sockglue.c | 8 ++++++++ net/ipv6/sysctl_net_ipv6.c | 8 ++++++++ 6 files changed, 29 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a426cd7099bb..2daa3a133e49 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -765,6 +765,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; + net->ipv6.sysctl.auto_flowlabels = 0; atomic_set(&net->ipv6.rt_genid, 0); err = ipv6_init_mibs(net); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3873181ed856..365b2b6f3942 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -723,7 +723,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, * Push down and install the IP header. */ ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -1174,7 +1175,9 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); __be16 *p = (__be16 *)(ipv6h+1); - ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel); + ip6_flow_hdr(ipv6h, 0, + ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cb9df0eb4023..fa83bdd4c3dd 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -205,7 +205,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); - ip6_flow_hdr(hdr, tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1569,7 +1570,9 @@ int ip6_push_pending_frames(struct sock *sk) skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, np->cork.tclass, + ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index afa082458360..51a1eb185ea7 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1046,7 +1046,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index cc34f65179e4..b50b9e54cf53 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -834,6 +834,10 @@ pref_skip_coa: np->dontfrag = valbool; retv = 0; break; + case IPV6_AUTOFLOWLABEL: + np->autoflowlabel = valbool; + retv = 0; + break; } release_sock(sk); @@ -1273,6 +1277,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->dontfrag; break; + case IPV6_AUTOFLOWLABEL: + val = np->autoflowlabel; + break; + default: return -ENOPROTOOPT; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 058f3eca2e53..5bf7b61f8ae8 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -38,6 +38,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "auto_flowlabels", + .data = &init_net.ipv6.sysctl.auto_flowlabels, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, @@ -74,6 +81,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; + ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.2 From b6428817190c5444294e0cc45bd571bfafbbb537 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 10 Jul 2014 18:02:46 +0800 Subject: ipv6: fix the check when handle RA d9333196572(ipv6: Allow accepting RA from local IP addresses.) made the wrong check, whether or not to accept RA with source-addr found on local machine, when accept_ra_from_local is 0. Fixes: d9333196572(ipv6: Allow accepting RA from local IP addresses.) Cc: Ben Greear Cc: Hannes Frederic Sowa Signed-off-by: Li RongQing Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a845e3d2057e..b7ece278dd49 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1151,9 +1151,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ - if (!(in6_dev->cnf.accept_ra_from_local || - ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0))) { + if (!in6_dev->cnf.accept_ra_from_local && + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); @@ -1294,9 +1294,9 @@ skip_linkparms: } #ifdef CONFIG_IPV6_ROUTE_INFO - if (!(in6_dev->cnf.accept_ra_from_local || - ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0))) { + if (!in6_dev->cnf.accept_ra_from_local && + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: router info ignored.\n", skb->dev->name); -- cgit v1.2.2 From bc91b0f07ada5535427373a4e2050877bcc12218 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Jul 2014 21:10:18 +0200 Subject: ipv6: addrconf: implement address generation modes This patch introduces a possibility for userspace to set various (so far two) modes of generating addresses. This is useful for example for NetworkManager because it can set the mode to NONE and take care of link local addresses itself. That allow it to have the interface up, monitoring carrier but still don't have any addresses on it. One more use-case by Dan Williams: WWAN devices often have their LL address provided by the firmware of the device, which sometimes refuses to respond to incorrect LL addresses when doing DHCPv6 or IPv6 ND. The kernel cannot generate the correct LL address for two reasons: 1) WWAN pseudo-ethernet interfaces often construct a fake MAC address, or read a meaningless MAC address from the firmware. Thus the EUI64 and the IPv6LL address the kernel assigns will be wrong. The real LL address is often retrieved from the firmware with AT or proprietary commands. 2) WWAN PPP interfaces receive their LL address from IPV6CP, not from kernel assignments. Only after IPV6CP has completed do we know the LL address of the PPP interface and its peer. But the kernel has already assigned an incorrect LL address to the interface. So being able to suppress the kernel LL address generation and assign the one retrieved from the firmware is less complicated and more robust. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 56 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 18 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 358edd2272ac..4c03c2843094 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2730,9 +2730,25 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr } } +static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) +{ + if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + /* addrconf_add_linklocal also adds a prefix_route and we + * only need to care about prefix routes if ipv6_generate_eui64 + * couldn't generate one. + */ + if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) + addrconf_add_linklocal(idev, &addr); + else if (prefix_route) + addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + } +} + static void addrconf_dev_config(struct net_device *dev) { - struct in6_addr addr; struct inet6_dev *idev; ASSERT_RTNL(); @@ -2753,11 +2769,7 @@ static void addrconf_dev_config(struct net_device *dev) if (IS_ERR(idev)) return; - memset(&addr, 0, sizeof(struct in6_addr)); - addr.s6_addr32[0] = htonl(0xFE800000); - - if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) - addrconf_add_linklocal(idev, &addr); + addrconf_addr_gen(idev, false); } #if IS_ENABLED(CONFIG_IPV6_SIT) @@ -2779,11 +2791,7 @@ static void addrconf_sit_config(struct net_device *dev) } if (dev->priv_flags & IFF_ISATAP) { - struct in6_addr addr; - - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) - addrconf_add_linklocal(idev, &addr); + addrconf_addr_gen(idev, false); return; } @@ -2798,7 +2806,6 @@ static void addrconf_sit_config(struct net_device *dev) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; - struct in6_addr addr; ASSERT_RTNL(); @@ -2807,11 +2814,7 @@ static void addrconf_gre_config(struct net_device *dev) return; } - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) - addrconf_add_linklocal(idev, &addr); - else - addrconf_prefix_route(&addr, 64, dev, 0, 0); + addrconf_addr_gen(idev, true); } #endif @@ -4423,6 +4426,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (nla == NULL) goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) + goto nla_put_failure; + read_lock_bh(&idev->lock); memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); @@ -4527,8 +4534,21 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0) BUG(); - if (tb[IFLA_INET6_TOKEN]) + if (tb[IFLA_INET6_TOKEN]) { err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN])); + if (err) + return err; + } + + if (tb[IFLA_INET6_ADDR_GEN_MODE]) { + u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); + + if (mode != IN6_ADDR_GEN_MODE_EUI64 && + mode != IN6_ADDR_GEN_MODE_NONE) + return -EINVAL; + idev->addr_gen_mode = mode; + err = 0; + } return err; } -- cgit v1.2.2 From 8242fc33925c8da802b651a702a902a204142e22 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Sat, 12 Jul 2014 01:55:38 +0530 Subject: net: ipv6: Use BUG_ON The semantic patch that makes the transformation is as follows: // @@ expression e; @@ -if (e) BUG(); +BUG_ON(e); // Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv6/raw.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index b2dc60b0c764..dee80fb1aa86 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -588,8 +588,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, } offset += skb_transport_offset(skb); - if (skb_copy_bits(skb, offset, &csum, 2)) - BUG(); + BUG_ON(skb_copy_bits(skb, offset, &csum, 2)); /* in case cksum was not initialized */ if (unlikely(csum)) @@ -601,8 +600,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; - if (skb_store_bits(skb, offset, &csum, 2)) - BUG(); + BUG_ON(skb_store_bits(skb, offset, &csum, 2)); send: err = ip6_push_pending_frames(sk); -- cgit v1.2.2 From e3f0b86b996d86940357e5ca9788771618d731f1 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Sat, 12 Jul 2014 01:57:17 +0530 Subject: ipv6: Use BUG_ON The semantic patch that makes this transformation is as follows: // @@ expression e; @@ -if (e) BUG(); +BUG_ON(e); // Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fa83bdd4c3dd..9b395c639a07 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -801,8 +801,8 @@ slow_path: /* * Copy a block of the IP datagram. */ - if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len)) - BUG(); + BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), + len)); left -= len; fh->frag_off = htons(offset); -- cgit v1.2.2 From c835a677331495cf137a7f8a023463afd9f032f8 Mon Sep 17 00:00:00 2001 From: Tom Gundersen Date: Mon, 14 Jul 2014 16:37:24 +0200 Subject: net: set name_assign_type in alloc_netdev() Extend alloc_netdev{,_mq{,s}}() to take name_assign_type as argument, and convert all users to pass NET_NAME_UNKNOWN. Coccinelle patch: @@ expression sizeof_priv, name, setup, txqs, rxqs, count; @@ ( -alloc_netdev_mqs(sizeof_priv, name, setup, txqs, rxqs) +alloc_netdev_mqs(sizeof_priv, name, NET_NAME_UNKNOWN, setup, txqs, rxqs) | -alloc_netdev_mq(sizeof_priv, name, setup, count) +alloc_netdev_mq(sizeof_priv, name, NET_NAME_UNKNOWN, setup, count) | -alloc_netdev(sizeof_priv, name, setup) +alloc_netdev(sizeof_priv, name, NET_NAME_UNKNOWN, setup) ) v9: move comments here from the wrong commit Signed-off-by: Tom Gundersen Reviewed-by: David Herrmann Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 6 ++++-- net/ipv6/ip6_tunnel.c | 5 +++-- net/ipv6/ip6_vti.c | 4 ++-- net/ipv6/ip6mr.c | 2 +- net/ipv6/sit.c | 4 +++- 5 files changed, 13 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 365b2b6f3942..5f19dfbc4c6a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -322,7 +322,8 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, else strcpy(name, "ip6gre%d"); - dev = alloc_netdev(sizeof(*t), name, ip6gre_tunnel_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ip6gre_tunnel_setup); if (!dev) return NULL; @@ -1326,7 +1327,8 @@ static int __net_init ip6gre_init_net(struct net *net) int err; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", - ip6gre_tunnel_setup); + NET_NAME_UNKNOWN, + ip6gre_tunnel_setup); if (!ign->fb_tunnel_dev) { err = -ENOMEM; goto err_alloc_dev; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 51a1eb185ea7..f9de5a695072 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -315,7 +315,8 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) else sprintf(name, "ip6tnl%%d"); - dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ip6_tnl_dev_setup); if (dev == NULL) goto failed; @@ -1773,7 +1774,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", - ip6_tnl_dev_setup); + NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 9aaa6bb229e4..17ee4fc32dfe 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -204,7 +204,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p else sprintf(name, "ip6_vti%%d"); - dev = alloc_netdev(sizeof(*t), name, vti6_dev_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup); if (dev == NULL) goto failed; @@ -1020,7 +1020,7 @@ static int __net_init vti6_init_net(struct net *net) err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", - vti6_dev_setup); + NET_NAME_UNKNOWN, vti6_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8250474ab7dc..f9a3fd320d1d 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -744,7 +744,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) else sprintf(name, "pim6reg%u", mrt->id); - dev = alloc_netdev(0, name, reg_vif_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (dev == NULL) return NULL; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 4f408176dc64..2e9ba035fb5f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -250,7 +250,8 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, else strcpy(name, "sit%d"); - dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ipip6_tunnel_setup); if (dev == NULL) return NULL; @@ -1729,6 +1730,7 @@ static int __net_init sit_init_net(struct net *net) sitn->tunnels[3] = sitn->tunnels_r_l; sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!sitn->fb_tunnel_dev) { err = -ENOMEM; -- cgit v1.2.2 From d518825eab84409341d9e90375138ef6d9355b8b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 14 Jul 2014 18:36:33 +0200 Subject: netfilter: remove unnecessary break after return Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6t_ipv6header.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 54bd9790603f..8b147440fbdc 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -94,7 +94,6 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) break; default: return false; - break; } nexthdr = hp->nexthdr; -- cgit v1.2.2 From 66635dc121e1ceb15a4481c221d0721ce1699523 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 14 Jul 2014 18:36:34 +0200 Subject: ipv6: remove unnecessary break after return Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b50b9e54cf53..0c289982796d 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1162,7 +1162,6 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; - break; } case IPV6_TRANSPARENT: -- cgit v1.2.2 From 11878b40ed5c5bc20d6a115bae156a5b90b0fb3e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 14 Jul 2014 17:55:06 -0400 Subject: net-timestamp: SOCK_RAW and PING timestamping Add SO_TIMESTAMPING to sockets of type PF_INET[6]/SOCK_RAW: Add the necessary sock_tx_timestamp calls to the datapath for RAW sockets (ping sockets already had these calls). Fix the IP output path to pass the timestamp flags on the first fragment also for these sockets. The existing code relies on transhdrlen != 0 to indicate a first fragment. For these sockets, that assumption does not hold. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=77221 Tested SOCK_RAW on IPv4 and IPv6, not PING. Signed-off-by: Willem de Bruijn Acked-by: Richard Cochran Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9b395c639a07..759456f0c207 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1271,7 +1271,7 @@ emsgsize: } /* For UDP, check if TX timestamp is enabled */ - if (sk->sk_type == SOCK_DGRAM) + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) sock_tx_timestamp(sk, &tx_flags); /* @@ -1380,12 +1380,6 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; - else { - /* Only the initial fragment - * is time stamped. - */ - tx_flags = 0; - } } if (skb == NULL) goto error; @@ -1399,8 +1393,9 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - if (sk->sk_type == SOCK_DGRAM) - skb_shinfo(skb)->tx_flags = tx_flags; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = tx_flags; + tx_flags = 0; /* * Find where to start putting bytes -- cgit v1.2.2 From 5cf3d46192fccf68b4a4759e4d7346e41c669a76 Mon Sep 17 00:00:00 2001 From: David Held Date: Tue, 15 Jul 2014 23:28:31 -0400 Subject: udp: Simplify __udp*_lib_mcast_deliver. Switch to using sk_nulls_for_each which shortens the code and makes it easier to update. Signed-off-by: David Held Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/udp.c | 88 +++++++++++++++++++++++----------------------------------- 1 file changed, 35 insertions(+), 53 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b4481df3d5fa..7d3bd80085be 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -702,43 +702,26 @@ drop: return -1; } -static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, const struct in6_addr *loc_addr, - __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif) +static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, + int dif, unsigned short hnum) { - struct hlist_nulls_node *node; - unsigned short num = ntohs(loc_port); - - sk_nulls_for_each_from(sk, node) { - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net)) - continue; - - if (udp_sk(sk)->udp_port_hash == num && - sk->sk_family == PF_INET6) { - if (inet->inet_dport) { - if (inet->inet_dport != rmt_port) - continue; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr) && - !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) - continue; - - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) - continue; + struct inet_sock *inet = inet_sk(sk); - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) - continue; - } - if (!inet6_mc_check(sk, loc_addr, rmt_addr)) - continue; - return sk; - } - } - return NULL; + if (!net_eq(sock_net(sk), net)) + return false; + + if (udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_INET6 || + (inet->inet_dport && inet->inet_dport != rmt_port) || + (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + return false; + if (!inet6_mc_check(sk, loc_addr, rmt_addr)) + return false; + return true; } static void flush_stack(struct sock **stack, unsigned int count, @@ -787,28 +770,27 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, { struct sock *sk, *stack[256 / sizeof(struct sock *)]; const struct udphdr *uh = udp_hdr(skb); - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = inet6_iif(skb); unsigned int i, count = 0; spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = inet6_iif(skb); - sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - /* If zero checksum and no_check is not on for - * the socket then skip it. - */ - if (uh->check || udp_sk(sk)->no_check6_rx) + sk_nulls_for_each(sk, node, &hslot->head) { + if (__udp_v6_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum) && + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + (uh->check || udp_sk(sk)->no_check6_rx)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + count = 0; + } stack[count++] = sk; - - sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, - uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 0; } } /* -- cgit v1.2.2 From 2dc41cff7545d55c6294525c811594576f8e119c Mon Sep 17 00:00:00 2001 From: David Held Date: Tue, 15 Jul 2014 23:28:32 -0400 Subject: udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver. Many multicast sources can have the same port which can result in a very large list when hashing by port only. Hash by address and port instead if this is the case. This makes multicast more similar to unicast. On a 24-core machine receiving from 500 multicast sockets on the same port, before this patch 80% of system CPU was used up by spin locking and only ~25% of packets were successfully delivered. With this patch, all packets are delivered and kernel overhead is ~8% system CPU on spinlocks. Signed-off-by: David Held Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/udp.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7d3bd80085be..f9d8800bb72f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -745,6 +745,7 @@ static void flush_stack(struct sock **stack, unsigned int count, if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); @@ -774,10 +775,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); int dif = inet6_iif(skb); - unsigned int i, count = 0; + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + + if (use_hash2) { + hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & + udp_table.mask; + hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { if (__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, uh->source, saddr, @@ -791,21 +802,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, count = 0; } stack[count++] = sk; + sock_hold(sk); } } - /* - * before releasing the lock, we must take reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. */ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { kfree_skb(skb); } -- cgit v1.2.2 From 274f482d33a309c87096f2983601ceda2761094e Mon Sep 17 00:00:00 2001 From: Sorin Dumitru Date: Tue, 22 Jul 2014 21:16:51 +0300 Subject: sock: remove skb argument from sk_rcvqueues_full It hasn't been used since commit 0fd7bac(net: relax rcvbuf limits). Signed-off-by: Sorin Dumitru Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f9d8800bb72f..5b6091de5868 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -673,7 +673,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; } - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; -- cgit v1.2.2 From 56ec0fb10c9f8b35a2991871ea879840d1a0cbde Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Fri, 25 Jul 2014 01:49:37 +0530 Subject: neigh: remove exceptional & on function name In this file, function names are otherwise used as pointers without &. A simplified version of the Coccinelle semantic patch that makes this change is as follows: // @r@ identifier f; @@ f(...) { ... } @@ identifier r.f; @@ - &f + f // Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b7ece278dd49..339078f95d1b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1768,7 +1768,7 @@ int __init ndisc_init(void) #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, - &ndisc_ifinfo_sysctl_change); + ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; out: -- cgit v1.2.2 From ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 25 Jul 2014 14:45:11 +0800 Subject: ipv6: remove obsolete comment in ip6_append_data() After 11878b40e[net-timestamp: SOCK_RAW and PING timestamping], this comment becomes obsolete since the codes check not only UDP socket, but also RAW sock; and the codes are clear, not need the comments Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 759456f0c207..2e339d241b69 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1270,7 +1270,6 @@ emsgsize: } } - /* For UDP, check if TX timestamp is enabled */ if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) sock_tx_timestamp(sk, &tx_flags); -- cgit v1.2.2 From 36c7778218b93d96d88d68f116a711f6a598b72f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:29 +0200 Subject: inet: frag: constify match, hashfn and constructor arguments Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0d5279fd852a..c2c3f2116bc5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -156,7 +156,7 @@ static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, } -static unsigned int nf_hashfn(struct inet_frag_queue *q) +static unsigned int nf_hashfn(const struct inet_frag_queue *q) { const struct frag_queue *nq; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index cc85a9ba5010..0c6932cc08cb 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -94,18 +94,18 @@ static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, return c & (INETFRAGS_HASHSZ - 1); } -static unsigned int ip6_hashfn(struct inet_frag_queue *q) +static unsigned int ip6_hashfn(const struct inet_frag_queue *q) { - struct frag_queue *fq; + const struct frag_queue *fq; fq = container_of(q, struct frag_queue, q); return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); } -bool ip6_frag_match(struct inet_frag_queue *q, void *a) +bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) { - struct frag_queue *fq; - struct ip6_create_arg *arg = a; + const struct frag_queue *fq; + const struct ip6_create_arg *arg = a; fq = container_of(q, struct frag_queue, q); return fq->id == arg->id && @@ -115,10 +115,10 @@ bool ip6_frag_match(struct inet_frag_queue *q, void *a) } EXPORT_SYMBOL(ip6_frag_match); -void ip6_frag_init(struct inet_frag_queue *q, void *a) +void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - struct ip6_create_arg *arg = a; + const struct ip6_create_arg *arg = a; fq->id = arg->id; fq->user = arg->user; -- cgit v1.2.2 From fb3cfe6e75b9d05c87265e85e67d7caf6e5b44a7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:30 +0200 Subject: inet: frag: remove hash size assumptions from callers hide actual hash size from individual users: The _find function will now fold the given hash value into the required range. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 7 ++----- net/ipv6/reassembly.c | 8 ++------ 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c2c3f2116bc5..607e4a94ef41 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -147,12 +147,9 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, const struct in6_addr *daddr) { - u32 c; - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, nf_frags.rnd); } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 0c6932cc08cb..2b76549a1016 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -85,13 +85,9 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, const struct in6_addr *daddr) { - u32 c; - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); - - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, ip6_frags.rnd); } static unsigned int ip6_hashfn(const struct inet_frag_queue *q) -- cgit v1.2.2 From 86e93e470cadedda9181a2bd9aee1d9d2e5e9c0f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:31 +0200 Subject: inet: frag: move evictor calls into frag_find function First step to move eviction handling into a work queue. We lose two spots that accounted evicted fragments in MIB counters. Accounting will be restored since the upcoming work-queue evictor invokes the frag queue timer callbacks instead. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ---- net/ipv6/reassembly.c | 6 ------ 2 files changed, 10 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 607e4a94ef41..58e32cf91c95 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -594,10 +594,6 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) hdr = ipv6_hdr(clone); fhdr = (struct frag_hdr *)skb_transport_header(clone); - local_bh_disable(); - inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false); - local_bh_enable(); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); if (fq == NULL) { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 2b76549a1016..97acbc490d9e 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -519,7 +519,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); - int evicted; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -548,11 +547,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false); - if (evicted) - IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_REASMFAILS, evicted); - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); if (fq != NULL) { -- cgit v1.2.2 From b13d3cbfb8e8a8f53930af67d1ebf05149f32c24 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:32 +0200 Subject: inet: frag: move eviction of queues to work queue When the high_thresh limit is reached we try to toss the 'oldest' incomplete fragment queues until memory limits are below the low_thresh value. This happens in softirq/packet processing context. This has two drawbacks: 1) processors might evict a queue that was about to be completed by another cpu, because they will compete wrt. resource usage and resource reclaim. 2) LRU list maintenance is expensive. But when constantly overloaded, even the 'least recently used' element is recent, so removing 'lru' queue first is not 'fairer' than removing any other fragment queue. This moves eviction out of the fast path: When the low threshold is reached, a work queue is scheduled which then iterates over the table and removes the queues that exceed the memory limits of the namespace. It sets a new flag called INET_FRAG_EVICTED on the evicted queues so the proper counters will get incremented when the queue is forcefully expired. When the high threshold is reached, no more fragment queues are created until we're below the limit again. The LRU list is now unused and will be removed in a followup patch. Joint work with Nikolay Aleksandrov. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 97acbc490d9e..b3924b10dff3 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -141,7 +141,9 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (!dev) goto out_rcu_unlock; - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + if (!(fq->q.last_in & INET_FRAG_EVICTED)) + IP6_INC_STATS_BH(net, __in6_dev_get(dev), + IPSTATS_MIB_REASMTIMEOUT); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); /* Don't send error if the first segment did not arrive. */ -- cgit v1.2.2 From 434d305405ab86414f6ea3f261307d443a2c3506 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:33 +0200 Subject: inet: frag: don't account number of fragment queues The 'nqueues' counter is protected by the lru list lock, once thats removed this needs to be converted to atomic counter. Given this isn't used for anything except for reporting it to userspace via /proc, just remove it. We still report the memory currently used by fragment reassembly queues. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 3317440ea341..2d6f860e5c1e 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -33,6 +33,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + unsigned int frag_mem = ip6_frag_mem(net); seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); @@ -42,8 +43,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %d memory %d\n", - ip6_frag_nqueues(net), ip6_frag_mem(net)); + seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); return 0; } -- cgit v1.2.2 From 3fd588eb90bfbba17091381006ecafe29c45db4a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:34 +0200 Subject: inet: frag: remove lru list no longer used. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 - net/ipv6/reassembly.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 58e32cf91c95..fb0f72a0ff31 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -349,7 +349,6 @@ found: fq->q.last_in |= INET_FRAG_FIRST_IN; } - inet_frag_lru_move(&fq->q); return 0; discard_fq: diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b3924b10dff3..af85551682c2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -351,7 +351,6 @@ found: } skb_dst_drop(skb); - inet_frag_lru_move(&fq->q); return -1; discard_fq: -- cgit v1.2.2 From e3a57d18b06179d68fcf7a0a06ad844493c65e06 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:35 +0200 Subject: inet: frag: remove periodic secret rebuild timer merge functionality into the eviction workqueue. Instead of rebuilding every n seconds, take advantage of the upper hash chain length limit. If we hit it, mark table for rebuild and schedule workqueue. To prevent frequent rebuilds when we're completely overloaded, don't rebuild more than once every 5 seconds. ipfrag_secret_interval sysctl is now obsolete and has been marked as deprecated, it still can be changed so scripts won't be broken but it won't have any effect. A comment is left above each unused secret_timer variable to avoid confusion. Joint work with Nikolay Aleksandrov. Signed-off-by: Florian Westphal Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 - net/ipv6/reassembly.c | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index fb0f72a0ff31..3b3ef9774cc2 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -669,7 +669,6 @@ int nf_ct_frag6_init(void) nf_frags.qsize = sizeof(struct frag_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; - nf_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&nf_frags); ret = register_pernet_subsys(&nf_ct_net_ops); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index af85551682c2..987fea46b915 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -604,10 +604,12 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int ip6_frags_secret_interval_unused; static struct ctl_table ip6_frags_ctl_table[] = { { .procname = "ip6frag_secret_interval", - .data = &ip6_frags.secret_interval, + .data = &ip6_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -737,7 +739,6 @@ int __init ipv6_frag_init(void) ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; - ip6_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip6_frags); out: return ret; -- cgit v1.2.2 From ab1c724f633080ed2e8a0cfe61654599b55cf8f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 24 Jul 2014 16:50:36 +0200 Subject: inet: frag: use seqlock for hash rebuild rehash is rare operation, don't force readers to take the read-side rwlock. Instead, we only have to detect the (rare) case where the secret was altered while we are trying to insert a new inetfrag queue into the table. If it was changed, drop the bucket lock and recompute the hash to get the 'new' chain bucket that we have to insert into. Joint work with Nikolay Aleksandrov. Signed-off-by: Florian Westphal Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3b3ef9774cc2..4d9da1e35f8c 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -193,7 +193,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.dst = dst; arg.ecn = ecn; - read_lock_bh(&nf_frags.lock); + local_bh_disable(); hash = nf_hash_frag(id, src, dst); q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 987fea46b915..57a9707b2032 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -190,7 +190,6 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, arg.dst = dst; arg.ecn = ecn; - read_lock(&ip6_frags.lock); hash = inet6_hash_frag(id, src, dst); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); -- cgit v1.2.2 From 1bab4c75075b84675b96992ac47580a57c26958d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 24 Jul 2014 16:50:37 +0200 Subject: inet: frag: set limits and make init_net's high_thresh limit global This patch makes init_net's high_thresh limit to be the maximum for all namespaces, thus introducing a global memory limit threshold equal to the sum of the individual high_thresh limits which are capped. It also introduces some sane minimums for low_thresh as it shouldn't be able to drop below 0 (or > high_thresh in the unsigned case), and overall low_thresh should not ever be above high_thresh, so we make the following relations for a namespace: init_net: high_thresh - max(not capped), min(init_net low_thresh) low_thresh - max(init_net high_thresh), min (0) all other namespaces: high_thresh = max(init_net high_thresh), min(namespace's low_thresh) low_thresh = max(namespace's high_thresh), min(0) The major issue with having low_thresh > high_thresh is that we'll schedule eviction but never evict anything and thus rely only on the timers. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 ++++++++++-- net/ipv6/reassembly.c | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 4d9da1e35f8c..3d4bccf6d67d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -63,6 +63,8 @@ struct nf_ct_frag6_skb_cb static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", @@ -76,14 +78,17 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -102,7 +107,10 @@ static int nf_ct_frag6_sysctl_register(struct net *net) table[0].data = &net->nf_frag.frags.timeout; table[1].data = &net->nf_frag.frags.low_thresh; + table[1].extra2 = &net->nf_frag.frags.high_thresh; table[2].data = &net->nf_frag.frags.high_thresh; + table[2].extra1 = &net->nf_frag.frags.low_thresh; + table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } hdr = register_net_sysctl(net, "net/netfilter", table); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 57a9707b2032..f1709c4a289a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -578,20 +578,25 @@ static const struct inet6_protocol frag_protocol = }; #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", @@ -628,7 +633,10 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) goto err_alloc; table[0].data = &net->ipv6.frags.high_thresh; + table[0].extra1 = &net->ipv6.frags.low_thresh; + table[0].extra2 = &init_net.ipv6.frags.high_thresh; table[1].data = &net->ipv6.frags.low_thresh; + table[1].extra2 = &net->ipv6.frags.high_thresh; table[2].data = &net->ipv6.frags.timeout; /* Don't export sysctls to unprivileged users */ -- cgit v1.2.2 From a317a2f19da7d0af1f068cf4d3ae80cdd73643b7 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 25 Jul 2014 15:25:09 -0700 Subject: ipv6: fail early when creating netdev named all or default We create a proc dir for each network device, this will cause conflicts when the devices have name "all" or "default". Rather than emitting an ugly kernel warning, we could just fail earlier by checking the device name. Reported-by: Stephane Chazelas Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 82 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 28 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4c03c2843094..0b239fc1816e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -108,11 +108,12 @@ static inline u32 cstamp_delta(unsigned long cstamp) } #ifdef CONFIG_SYSCTL -static void addrconf_sysctl_register(struct inet6_dev *idev); +static int addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); #else -static inline void addrconf_sysctl_register(struct inet6_dev *idev) +static inline int addrconf_sysctl_register(struct inet6_dev *idev) { + return 0; } static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) @@ -310,16 +311,16 @@ err_ip: static struct inet6_dev *ipv6_add_dev(struct net_device *dev) { struct inet6_dev *ndev; + int err = -ENOMEM; ASSERT_RTNL(); if (dev->mtu < IPV6_MIN_MTU) - return NULL; + return ERR_PTR(-EINVAL); ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL); - if (ndev == NULL) - return NULL; + return ERR_PTR(err); rwlock_init(&ndev->lock); ndev->dev = dev; @@ -332,7 +333,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); if (ndev->nd_parms == NULL) { kfree(ndev); - return NULL; + return ERR_PTR(err); } if (ndev->cnf.forwarding) dev_disable_lro(dev); @@ -346,17 +347,14 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) neigh_parms_release(&nd_tbl, ndev->nd_parms); dev_put(dev); kfree(ndev); - return NULL; + return ERR_PTR(err); } if (snmp6_register_dev(ndev) < 0) { ADBG(KERN_WARNING "%s: cannot create /proc/net/dev_snmp6/%s\n", __func__, dev->name); - neigh_parms_release(&nd_tbl, ndev->nd_parms); - ndev->dead = 1; - in6_dev_finish_destroy(ndev); - return NULL; + goto err_release; } /* One reference from device. We must do this before @@ -394,7 +392,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ipv6_mc_init_dev(ndev); ndev->tstamp = jiffies; - addrconf_sysctl_register(ndev); + err = addrconf_sysctl_register(ndev); + if (err) { + ipv6_mc_destroy_dev(ndev); + del_timer(&ndev->regen_timer); + goto err_release; + } /* protected by rtnl_lock */ rcu_assign_pointer(dev->ip6_ptr, ndev); @@ -409,6 +412,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); return ndev; + +err_release: + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return ERR_PTR(err); } static struct inet6_dev *ipv6_find_idev(struct net_device *dev) @@ -420,7 +429,7 @@ static struct inet6_dev *ipv6_find_idev(struct net_device *dev) idev = __in6_dev_get(dev); if (!idev) { idev = ipv6_add_dev(dev); - if (!idev) + if (IS_ERR(idev)) return NULL; } @@ -2830,8 +2839,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, case NETDEV_REGISTER: if (!idev && dev->mtu >= IPV6_MIN_MTU) { idev = ipv6_add_dev(dev); - if (!idev) - return notifier_from_errno(-ENOMEM); + if (IS_ERR(idev)) + return notifier_from_errno(PTR_ERR(idev)); } break; @@ -2851,7 +2860,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!idev && dev->mtu >= IPV6_MIN_MTU) idev = ipv6_add_dev(dev); - if (idev) { + if (!IS_ERR_OR_NULL(idev)) { idev->if_flags |= IF_READY; run_pending = 1; } @@ -2894,7 +2903,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; } - if (idev) { + if (!IS_ERR_OR_NULL(idev)) { if (run_pending) addrconf_dad_run(idev); @@ -2929,7 +2938,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!idev && dev->mtu >= IPV6_MIN_MTU) { idev = ipv6_add_dev(dev); - if (idev) + if (!IS_ERR(idev)) break; } @@ -2950,10 +2959,14 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev) { snmp6_unregister_dev(idev); addrconf_sysctl_unregister(idev); - addrconf_sysctl_register(idev); - err = snmp6_register_dev(idev); + err = addrconf_sysctl_register(idev); if (err) return notifier_from_errno(err); + err = snmp6_register_dev(idev); + if (err) { + addrconf_sysctl_unregister(idev); + return notifier_from_errno(err); + } } break; @@ -5248,12 +5261,23 @@ static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) kfree(t); } -static void addrconf_sysctl_register(struct inet6_dev *idev) +static int addrconf_sysctl_register(struct inet6_dev *idev) { - neigh_sysctl_register(idev->dev, idev->nd_parms, - &ndisc_ifinfo_sysctl_change); - __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, - idev, &idev->cnf); + int err; + + if (!sysctl_dev_name_is_allowed(idev->dev->name)) + return -EINVAL; + + err = neigh_sysctl_register(idev->dev, idev->nd_parms, + &ndisc_ifinfo_sysctl_change); + if (err) + return err; + err = __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, + idev, &idev->cnf); + if (err) + neigh_sysctl_unregister(idev->nd_parms); + + return err; } static void addrconf_sysctl_unregister(struct inet6_dev *idev) @@ -5338,6 +5362,7 @@ static struct rtnl_af_ops inet6_ops = { int __init addrconf_init(void) { + struct inet6_dev *idev; int i, err; err = ipv6_addr_label_init(); @@ -5376,11 +5401,12 @@ int __init addrconf_init(void) * device and it being up should be removed. */ rtnl_lock(); - if (!ipv6_add_dev(init_net.loopback_dev)) - err = -ENOMEM; + idev = ipv6_add_dev(init_net.loopback_dev); rtnl_unlock(); - if (err) + if (IS_ERR(idev)) { + err = PTR_ERR(idev); goto errlo; + } for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_addr_lst[i]); -- cgit v1.2.2 From 7304fe4681634a8e0511a5922c972aa132ffb43d Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Thu, 31 Jul 2014 17:54:32 +0800 Subject: net: fix the counter ICMP_MIB_INERRORS/ICMP6_MIB_INERRORS When dealing with ICMPv[46] Error Message, function icmp_socket_deliver() and icmpv6_notify() do some valid checks on packet's length, but then some protocols check packet's length redaudantly. So remove those duplicated statements, and increase counter ICMP_MIB_INERRORS/ICMP6_MIB_INERRORS in function icmp_socket_deliver() and icmpv6_notify() respectively. In addition, add missed counter in udp6/udplite6 when socket is NULL. Signed-off-by: Duan Jiong Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 13 ++++++++----- net/ipv6/udp.c | 8 ++++++-- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f6c84a6eb238..06ba3e58320b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -626,9 +626,10 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) int inner_offset; __be16 frag_off; u8 nexthdr; + struct net *net = dev_net(skb->dev); if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return; + goto out; nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; if (ipv6_ext_hdr(nexthdr)) { @@ -636,14 +637,14 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (inner_offset<0) - return; + goto out; } else { inner_offset = sizeof(struct ipv6hdr); } /* Checkin header including 8 bytes of inner protocol header. */ if (!pskb_may_pull(skb, inner_offset+8)) - return; + goto out; /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed @@ -652,13 +653,15 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) --ANK (980726) */ - rcu_read_lock(); ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, NULL, type, code, inner_offset, info); - rcu_read_unlock(); raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); + return; + +out: + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); } /* diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5b6091de5868..c6941a1ac977 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -533,11 +533,15 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct udphdr *uh = (struct udphdr*)(skb->data+offset); struct sock *sk; int err; + struct net *net = dev_net(skb->dev); - sk = __udp6_lib_lookup(dev_net(skb->dev), daddr, uh->dest, + sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), udptable); - if (sk == NULL) + if (sk == NULL) { + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; + } if (type == ICMPV6_PKT_TOOBIG) { if (!ip6_sk_accept_pmtu(sk)) -- cgit v1.2.2 From 4330487acfff0cf1d7b14d238583a182e0a444bb Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Fri, 1 Aug 2014 09:52:58 +0800 Subject: net: use inet6_iif instead of IP6CB()->iif Signed-off-by: Duan Jiong Signed-off-by: David S. Miller --- net/ipv6/raw.c | 8 ++++---- net/ipv6/udp.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index dee80fb1aa86..39d44226e402 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -176,7 +176,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) goto out; net = dev_net(skb->dev); - sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, IP6CB(skb)->iif); + sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, inet6_iif(skb)); while (sk) { int filtered; @@ -220,7 +220,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) } } sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr, - IP6CB(skb)->iif); + inet6_iif(skb)); } out: read_unlock(&raw_v6_hashinfo.lock); @@ -375,7 +375,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, net = dev_net(skb->dev); while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr, - IP6CB(skb)->iif))) { + inet6_iif(skb)))) { rawv6_err(sk, skb, NULL, type, code, inner_offset, info); sk = sk_next(sk); @@ -506,7 +506,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - IP6CB(skb)->iif); + inet6_iif(skb)); *addr_len = sizeof(*sin6); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c6941a1ac977..4836af8f582d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -472,7 +472,7 @@ try_again: sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - IP6CB(skb)->iif); + inet6_iif(skb)); } *addr_len = sizeof(*sin6); } -- cgit v1.2.2 From d2373862b3589260f0139a6e4969478f84154369 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Aug 2014 12:29:43 +0200 Subject: inet: frags: use INC_STATS_BH in the ipv6 reassembly code Softirqs are already disabled so no need to do it again, thus let's be consistent and use the IP6_INC_STATS_BH variant. Signed-off-by: Nikolay Aleksandrov Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index f1709c4a289a..512ccc027ce3 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -355,8 +355,8 @@ found: discard_fq: inet_frag_kill(&fq->q, &ip6_frags); err: - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_REASMFAILS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; } @@ -566,7 +566,8 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return -1; fail_hdr: - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; } -- cgit v1.2.2 From 06aa8b8a0345c78f4d9a1fb3f852952b12a0e40c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Aug 2014 12:29:44 +0200 Subject: inet: frags: rename last_in to flags The last_in field has been used to store various flags different from first/last frag in so give it a more descriptive name: flags. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 ++++++------ net/ipv6/reassembly.c | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3d4bccf6d67d..cca686e42b97 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -222,7 +222,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, int offset, end; u8 ecn; - if (fq->q.last_in & INET_FRAG_COMPLETE) { + if (fq->q.flags & INET_FRAG_COMPLETE) { pr_debug("Already completed\n"); goto err; } @@ -253,11 +253,11 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * or have different end, the segment is corrupted. */ if (end < fq->q.len || - ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) { + ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) { pr_debug("already received last fragment\n"); goto err; } - fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. @@ -272,7 +272,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ - if (fq->q.last_in & INET_FRAG_LAST_IN) { + if (fq->q.flags & INET_FRAG_LAST_IN) { pr_debug("last packet already reached.\n"); goto err; } @@ -354,7 +354,7 @@ found: */ if (offset == 0) { fq->nhoffset = nhoff; - fq->q.last_in |= INET_FRAG_FIRST_IN; + fq->q.flags |= INET_FRAG_FIRST_IN; } return 0; @@ -617,7 +617,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) goto ret_orig; } - if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { ret_skb = nf_ct_frag6_reasm(fq, dev); if (ret_skb == NULL) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 512ccc027ce3..b4baceed0d0d 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -131,7 +131,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, spin_lock(&fq->q.lock); - if (fq->q.last_in & INET_FRAG_COMPLETE) + if (fq->q.flags & INET_FRAG_COMPLETE) goto out; inet_frag_kill(&fq->q, frags); @@ -141,13 +141,13 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (!dev) goto out_rcu_unlock; - if (!(fq->q.last_in & INET_FRAG_EVICTED)) + if (!(fq->q.flags & INET_FRAG_EVICTED)) IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); /* Don't send error if the first segment did not arrive. */ - if (!(fq->q.last_in & INET_FRAG_FIRST_IN) || !fq->q.fragments) + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) goto out_rcu_unlock; /* @@ -209,7 +209,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct net *net = dev_net(skb_dst(skb)->dev); u8 ecn; - if (fq->q.last_in & INET_FRAG_COMPLETE) + if (fq->q.flags & INET_FRAG_COMPLETE) goto err; offset = ntohs(fhdr->frag_off) & ~0x7; @@ -240,9 +240,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, * or have different end, the segment is corrupted. */ if (end < fq->q.len || - ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) + ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto err; - fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. @@ -260,7 +260,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ - if (fq->q.last_in & INET_FRAG_LAST_IN) + if (fq->q.flags & INET_FRAG_LAST_IN) goto err; fq->q.len = end; } @@ -335,10 +335,10 @@ found: */ if (offset == 0) { fq->nhoffset = nhoff; - fq->q.last_in |= INET_FRAG_FIRST_IN; + fq->q.flags |= INET_FRAG_FIRST_IN; } - if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { int res; unsigned long orefdst = skb->_skb_refdst; -- cgit v1.2.2 From 2e404f632f44979ddf0ce0808a438249a72d7015 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Aug 2014 12:29:47 +0200 Subject: inet: frags: use INET_FRAG_EVICTED to prevent icmp messages Now that we have INET_FRAG_EVICTED we might as well use it to stop sending icmp messages in the "frag_expire" functions instead of stripping INET_FRAG_FIRST_IN from their flags when evicting. Also fix the comment style in ip6_expire_frag_queue(). Signed-off-by: Nikolay Aleksandrov Reviewed-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b4baceed0d0d..beb6872a8fa5 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -141,19 +141,20 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (!dev) goto out_rcu_unlock; - if (!(fq->q.flags & INET_FRAG_EVICTED)) - IP6_INC_STATS_BH(net, __in6_dev_get(dev), - IPSTATS_MIB_REASMTIMEOUT); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + if (fq->q.flags & INET_FRAG_EVICTED) + goto out_rcu_unlock; + + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + /* Don't send error if the first segment did not arrive. */ if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) goto out_rcu_unlock; - /* - But use as source device on which LAST ARRIVED - segment was received. And do not use fq->dev - pointer directly, device might already disappeared. + /* But use as source device on which LAST ARRIVED + * segment was received. And do not use fq->dev + * pointer directly, device might already disappeared. */ fq->q.fragments->dev = dev; icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); -- cgit v1.2.2 From d4ad4d22e7ac6b8711b35d7e86eb29f03f8ac153 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Aug 2014 12:29:48 +0200 Subject: inet: frags: use kmem_cache for inet_frag_queue Use kmem_cache to allocate/free inet_frag_queue objects since they're all the same size per inet_frags user and are alloced/freed in high volumes thus making it a perfect case for kmem_cache. Signed-off-by: Nikolay Aleksandrov Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 8 ++++++-- net/ipv6/reassembly.c | 7 ++++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index cca686e42b97..6f187c8d8a1b 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -50,6 +50,7 @@ #include #include +static const char nf_frags_cache_name[] = "nf-frags"; struct nf_ct_frag6_skb_cb { @@ -677,12 +678,15 @@ int nf_ct_frag6_init(void) nf_frags.qsize = sizeof(struct frag_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; - inet_frags_init(&nf_frags); - + nf_frags.frags_cache_name = nf_frags_cache_name; + ret = inet_frags_init(&nf_frags); + if (ret) + goto out; ret = register_pernet_subsys(&nf_ct_net_ops); if (ret) inet_frags_fini(&nf_frags); +out: return ret; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index beb6872a8fa5..c6557d9f7808 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -60,6 +60,8 @@ #include #include +static const char ip6_frag_cache_name[] = "ip6-frags"; + struct ip6frag_skb_cb { struct inet6_skb_parm h; @@ -748,7 +750,10 @@ int __init ipv6_frag_init(void) ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; - inet_frags_init(&ip6_frags); + ip6_frags.frags_cache_name = ip6_frag_cache_name; + ret = inet_frags_init(&ip6_frags); + if (ret) + goto err_pernet; out: return ret; -- cgit v1.2.2 From 166bd890a3d851b7cfdf3e417917878bfe4671f1 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 1 Aug 2014 14:41:10 +0200 Subject: ipv6: data of fwmark_reflect sysctl needs to be updated on netns construction Fixes: e110861f86094cd ("net: add a sysctl to reflect the fwmark on replies") Cc: Lorenzo Colitti Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/sysctl_net_ipv6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv6') diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 058f3eca2e53..818334619abb 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -74,6 +74,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; + ipv6_table[3].data = &net->ipv6.sysctl.fwmark_reflect; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.2 From 09c2d251b70723650ba47e83571ff49281320f7c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:47 -0400 Subject: net-timestamp: add key to disambiguate concurrent datagrams Datagrams timestamped on transmission can coexist in the kernel stack and be reordered in packet scheduling. When reading looped datagrams from the socket error queue it is not always possible to unique correlate looped data with original send() call (for application level retransmits). Even if possible, it may be expensive and complex, requiring packet inspection. Introduce a data-independent ID mechanism to associate timestamps with send calls. Pass an ID alongside the timestamp in field ee_data of sock_extended_err. The ID is a simple 32 bit unsigned int that is associated with the socket and incremented on each send() call for which software tx timestamp generation is enabled. The feature is enabled only if SOF_TIMESTAMPING_OPT_ID is set, to avoid changing ee_data for existing applications that expect it 0. The counter is reset each time the flag is reenabled. Reenabling does not change the ID of already submitted data. It is possible to receive out of order IDs if the timestamp stream is not quiesced first. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f5dafe609f8b..315a55d66079 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1157,6 +1157,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int err; int offset = 0; __u8 tx_flags = 0; + u32 tskey = 0; if (flags&MSG_PROBE) return 0; @@ -1272,8 +1273,12 @@ emsgsize: } } - if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { sock_tx_timestamp(sk, &tx_flags); + if (tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; + } /* * Let's try using as much space as possible. @@ -1397,6 +1402,8 @@ alloc_new_skb: /* Only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = tx_flags; tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; /* * Find where to start putting bytes -- cgit v1.2.2 From 9ea88a153001ffeb3d8810917e8eea62ca9b6f25 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Thu, 7 Aug 2014 02:38:22 +0400 Subject: tcp: md5: check md5 signature without socket lock Since a8afca032 (tcp: md5: protects md5sig_info with RCU) tcp_md5_do_lookup doesn't require socket lock, rcu_read_lock is enough. Therefore socket lock is no longer required for tcp_v{4,6}_inbound_md5_hash too, so we can move these calls (wrapped with rcu_read_{,un}lock) before bh_lock_sock: from tcp_v{4,6}_do_rcv to tcp_v{4,6}_rcv. Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 22055b098428..f2ce95502392 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -667,7 +667,8 @@ clear_hash_noput: return 1; } -static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +static int __tcp_v6_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; @@ -707,6 +708,18 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) } return 0; } + +static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __tcp_v6_inbound_md5_hash(sk, skb); + rcu_read_unlock(); + + return ret; +} + #endif static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, @@ -1247,11 +1260,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); -#ifdef CONFIG_TCP_MD5SIG - if (tcp_v6_inbound_md5_hash(sk, skb)) - goto discard; -#endif - if (sk_filter(sk, skb)) goto discard; @@ -1424,6 +1432,11 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; +#ifdef CONFIG_TCP_MD5SIG + if (tcp_v6_inbound_md5_hash(sk, skb)) + goto discard_and_relse; +#endif + if (sk_filter(sk, skb)) goto discard_and_relse; -- cgit v1.2.2 From 1d023284c31a4e40a94d5bbcb7dbb7a35ee0bcbc Mon Sep 17 00:00:00 2001 From: Ken Helias Date: Wed, 6 Aug 2014 16:09:16 -0700 Subject: list: fix order of arguments for hlist_add_after(_rcu) All other add functions for lists have the new item as first argument and the position where it is added as second argument. This was changed for no good reason in this function and makes using it unnecessary confusing. The name was changed to hlist_add_behind() to cause unconverted code to generate a compile error instead of using the wrong parameter order. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Ken Helias Cc: "Paul E. McKenney" Acked-by: Jeff Kirsher [intel driver bits] Cc: Hugh Dickins Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv6/addrlabel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 731e1e1722d9..fd0dc47f471d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -277,7 +277,7 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) last = p; } if (last) - hlist_add_after_rcu(&last->list, &newp->list); + hlist_add_behind_rcu(&newp->list, &last->list); else hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: -- cgit v1.2.2 From bc8fc7b8f825ef17a0fb9e68c18ce94fa66ab337 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Thu, 14 Aug 2014 15:27:20 +0300 Subject: sit: Fix ipip6_tunnel_lookup device matching criteria As of 4fddbf5d78 ("sit: strictly restrict incoming traffic to tunnel link device"), when looking up a tunnel, tunnel's underlying interface (t->parms.link) is verified to match incoming traffic's ingress device. However the comparison was incorrectly based on skb->dev->iflink. Instead, dev->ifindex should be used, which correctly represents the interface from which the IP stack hands the ipip6 packets. This allows setting up sit tunnels bound to vlan interfaces (otherwise incoming ipip6 traffic on the vlan interface was dropped due to ipip6_tunnel_lookup match failure). Signed-off-by: Shmulik Ladkani Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/sit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2e9ba035fb5f..6163f851dc01 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -101,19 +101,19 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } -- cgit v1.2.2 From 4fab9071950c2021d846e18351e0f46a1cffd67b Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 14 Aug 2014 12:40:05 -0400 Subject: tcp: fix tcp_release_cb() to dispatch via address family for mtu_reduced() Make sure we use the correct address-family-specific function for handling MTU reductions from within tcp_release_cb(). Previously AF_INET6 sockets were incorrectly always using the IPv6 code path when sometimes they were handling IPv4 traffic and thus had an IPv4 dst. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Diagnosed-by: Willem de Bruijn Fixes: 563d34d057862 ("tcp: dont drop MTU reduction indications") Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f2ce95502392..29964c3d363c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1595,6 +1595,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v6_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1625,6 +1626,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1864,7 +1866,6 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, -- cgit v1.2.2 From 8993cf8edf42527119186b558766539243b791a5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 11 Aug 2014 18:21:49 +0200 Subject: netfilter: move NAT Kconfig switches out of the iptables scope Currently, the NAT configs depend on iptables and ip6tables. However, users should be capable of enabling NAT for nft without having to switch on iptables. Fix this by adding new specific IP_NF_NAT and IP6_NF_NAT config switches for iptables and ip6tables NAT support. I have also moved the original NF_NAT_IPV4 and NF_NAT_IPV6 configs out of the scope of iptables to make them independent of it. This patch also adds NETFILTER_XT_NAT which selects the xt_nat combo that provides snat/dnat for iptables. We cannot use NF_NAT anymore since nf_tables can select this. Reported-by: Matteo Croce Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/Kconfig | 26 +++++++++++++++++++------- net/ipv6/netfilter/Makefile | 2 +- 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ac93df16f5af..cf0b88f30f6f 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -60,6 +60,16 @@ config NF_LOG_IPV6 depends on NETFILTER_ADVANCED select NF_LOG_COMMON +config NF_NAT_IPV6 + tristate "IPv6 NAT" + depends on NF_CONNTRACK_IPV6 + depends on NETFILTER_ADVANCED + select NF_NAT + help + The IPv6 NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. This can be + controlled by iptables or nft. + config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" depends on INET && IPV6 @@ -232,19 +242,21 @@ config IP6_NF_SECURITY If unsure, say N. -config NF_NAT_IPV6 - tristate "IPv6 NAT" +config IP6_NF_NAT + tristate "ip6tables NAT support" depends on NF_CONNTRACK_IPV6 depends on NETFILTER_ADVANCED select NF_NAT + select NF_NAT_IPV6 + select NETFILTER_XT_NAT help - The IPv6 NAT option allows masquerading, port forwarding and other - forms of full Network Address Port Translation. It is controlled by - the `nat' table in ip6tables, see the man page for ip6tables(8). + This enables the `nat' table in ip6tables. This allows masquerading, + port forwarding and other forms of full Network Address Port + Translation. To compile it as a module, choose M here. If unsure, say N. -if NF_NAT_IPV6 +if IP6_NF_NAT config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" @@ -265,7 +277,7 @@ config IP6_NF_TARGET_NPT To compile it as a module, choose M here. If unsure, say N. -endif # NF_NAT_IPV6 +endif # IP6_NF_NAT endif # IP6_NF_IPTABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index c0b263104ed2..c3d3286db4bb 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o -obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o +obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o # objects for l3 independent conntrack nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o -- cgit v1.2.2 From 793c3b4000a1ef611ae7e5c89bd2a9c6b776cb5e Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Thu, 21 Aug 2014 19:37:48 +0200 Subject: net: ipv6: fib: don't sleep inside atomic lock The function fib6_commit_metrics() allocates a piece of memory in mode GFP_KERNEL while holding an atomic lock from higher up in the stack, in the function __ip6_ins_rt(). This produces the following BUG: > BUG: sleeping function called from invalid context at mm/slub.c:1250 > in_atomic(): 1, irqs_disabled(): 0, pid: 2909, name: dhcpcd > 2 locks held by dhcpcd/2909: > #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 > #1: (&tb->tb6_lock){++--+.}, at: [] ip6_route_add+0x65a/0x800 > CPU: 1 PID: 2909 Comm: dhcpcd Not tainted 3.17.0-rc1 #1 > Hardware name: ASUS All Series/Q87T, BIOS 0216 10/16/2013 > 0000000000000008 ffff8800c8f13858 ffffffff81af135a 0000000000000000 > ffff880212202430 ffff8800c8f13878 ffffffff810f8d3a ffff880212202c98 > 0000000000000010 ffff8800c8f138c8 ffffffff8121ad0e 0000000000000001 > Call Trace: > [] dump_stack+0x4e/0x68 > [] __might_sleep+0x10a/0x120 > [] kmem_cache_alloc_trace+0x4e/0x190 > [] ? fib6_commit_metrics+0x66/0x110 > [] fib6_commit_metrics+0x66/0x110 > [] fib6_add+0x883/0xa80 > [] ? ip6_route_add+0x65a/0x800 > [] ip6_route_add+0x675/0x800 > [] ? ip6_route_add+0x6a/0x800 > [] inet6_rtm_newroute+0x5c/0x80 > [] rtnetlink_rcv_msg+0x211/0x260 > [] ? rtnl_lock+0x17/0x20 > [] ? lock_release_holdtime+0x28/0x180 > [] ? rtnl_lock+0x17/0x20 > [] ? __rtnl_unlock+0x20/0x20 > [] netlink_rcv_skb+0x6e/0xd0 > [] rtnetlink_rcv+0x25/0x40 > [] netlink_unicast+0xd9/0x180 > [] netlink_sendmsg+0x700/0x770 > [] ? local_clock+0x25/0x30 > [] sock_sendmsg+0x6c/0x90 > [] ? might_fault+0xa3/0xb0 > [] ? verify_iovec+0x7d/0xf0 > [] ___sys_sendmsg+0x37e/0x3b0 > [] ? trace_hardirqs_on_caller+0x185/0x220 > [] ? mutex_unlock+0xe/0x10 > [] ? netlink_insert+0xbc/0xe0 > [] ? netlink_autobind.isra.30+0x125/0x150 > [] ? netlink_autobind.isra.30+0x60/0x150 > [] ? netlink_bind+0x159/0x230 > [] ? might_fault+0x5a/0xb0 > [] ? SYSC_bind+0x7e/0xd0 > [] __sys_sendmsg+0x4d/0x80 > [] SyS_sendmsg+0x12/0x20 > [] system_call_fastpath+0x16/0x1b Fixing this by replacing the mode GFP_KERNEL with GFP_ATOMIC. Signed-off-by: Benjamin Block Acked-by: David Rientjes Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cb4459bd1d29..76b7f5ee8f4c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -643,7 +643,7 @@ static int fib6_commit_metrics(struct dst_entry *dst, if (dst->flags & DST_HOST) { mp = dst_metrics_write_ptr(dst); } else { - mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); if (!mp) return -ENOMEM; dst_init_metrics(dst, mp, 0); -- cgit v1.2.2 From 41ad82f7f8f59a472c8e8ccca56a9c6bdf3c5092 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 2 Sep 2014 14:26:17 +0200 Subject: netfilter: fix missing dependencies in NETFILTER_XT_TARGET_LOG make defconfig reports: warning: (NETFILTER_XT_TARGET_LOG) selects NF_LOG_IPV6 which has unmet direct dependencies (NET && INET && IPV6 && NETFILTER && NETFILTER_ADVANCED) Fixes: d79a61d netfilter: NETFILTER_XT_TARGET_LOG selects NF_LOG_* Reported-by: kbuild test robot Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/ipv6/netfilter/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index cf0b88f30f6f..2812816aabdc 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -57,7 +57,7 @@ config NFT_REJECT_IPV6 config NF_LOG_IPV6 tristate "IPv6 packet logging" - depends on NETFILTER_ADVANCED + default m if NETFILTER_ADVANCED=n select NF_LOG_COMMON config NF_NAT_IPV6 -- cgit v1.2.2 From a9ed4a2986e13011fcf4ed2d1a1647c53112f55b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 2 Sep 2014 10:29:29 +0200 Subject: ipv6: fix rtnl locking in setsockopt for anycast and multicast Calling setsockopt with IPV6_JOIN_ANYCAST or IPV6_LEAVE_ANYCAST triggers the assertion in addrconf_join_solict()/addrconf_leave_solict() ipv6_sock_ac_join(), ipv6_sock_ac_drop(), ipv6_sock_ac_close() need to take RTNL before calling ipv6_dev_ac_inc/dec. Same thing with ipv6_sock_mc_join(), ipv6_sock_mc_drop(), ipv6_sock_mc_close() before calling ipv6_dev_mc_inc/dec. This patch moves ASSERT_RTNL() up a level in the call stack. Signed-off-by: Cong Wang Signed-off-by: Sabrina Dubroca Reported-by: Tommi Rantala Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 15 +++++---------- net/ipv6/anycast.c | 12 ++++++++++++ net/ipv6/mcast.c | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 10 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0b239fc1816e..aa0e135b808c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1690,14 +1690,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) addrconf_mod_dad_work(ifp, 0); } -/* Join to solicited addr multicast group. */ - +/* Join to solicited addr multicast group. + * caller must hold RTNL */ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) { struct in6_addr maddr; - ASSERT_RTNL(); - if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1705,12 +1703,11 @@ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) ipv6_dev_mc_inc(dev, &maddr); } +/* caller must hold RTNL */ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) { struct in6_addr maddr; - ASSERT_RTNL(); - if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1718,12 +1715,11 @@ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) __ipv6_dev_mc_dec(idev, &maddr); } +/* caller must hold RTNL */ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; - ASSERT_RTNL(); - if (ifp->prefix_len >= 127) /* RFC 6164 */ return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); @@ -1732,12 +1728,11 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) ipv6_dev_ac_inc(ifp->idev->dev, &addr); } +/* caller must hold RTNL */ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; - ASSERT_RTNL(); - if (ifp->prefix_len >= 127) /* RFC 6164 */ return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 210183244689..45b9d81d91e8 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -77,6 +77,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac->acl_next = NULL; pac->acl_addr = *addr; + rtnl_lock(); rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -137,6 +138,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) error: rcu_read_unlock(); + rtnl_unlock(); if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); return err; @@ -171,13 +173,17 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_unlock_bh(&ipv6_sk_ac_lock); + rtnl_lock(); rcu_read_lock(); dev = dev_get_by_index_rcu(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); + if (!dev) + return -ENODEV; return 0; } @@ -198,6 +204,7 @@ void ipv6_sock_ac_close(struct sock *sk) spin_unlock_bh(&ipv6_sk_ac_lock); prev_index = 0; + rtnl_lock(); rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; @@ -212,6 +219,7 @@ void ipv6_sock_ac_close(struct sock *sk) pac = next; } rcu_read_unlock(); + rtnl_unlock(); } static void aca_put(struct ifacaddr6 *ac) @@ -233,6 +241,8 @@ int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) struct rt6_info *rt; int err; + ASSERT_RTNL(); + idev = in6_dev_get(dev); if (idev == NULL) @@ -302,6 +312,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; + ASSERT_RTNL(); + write_lock_bh(&idev->lock); prev_aca = NULL; for (aca = idev->ac_list; aca; aca = aca->aca_next) { diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 617f0958e164..a23b655a7627 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -172,6 +172,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) mc_lst->next = NULL; mc_lst->addr = *addr; + rtnl_lock(); rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -185,6 +186,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (dev == NULL) { rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } @@ -202,6 +204,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (err) { rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } @@ -212,6 +215,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) spin_unlock(&ipv6_sk_mc_lock); rcu_read_unlock(); + rtnl_unlock(); return 0; } @@ -229,6 +233,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) if (!ipv6_addr_is_multicast(addr)) return -EINVAL; + rtnl_lock(); spin_lock(&ipv6_sk_mc_lock); for (lnk = &np->ipv6_mc_list; (mc_lst = rcu_dereference_protected(*lnk, @@ -252,12 +257,15 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); rcu_read_unlock(); + rtnl_unlock(); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); return 0; } } spin_unlock(&ipv6_sk_mc_lock); + rtnl_unlock(); return -EADDRNOTAVAIL; } @@ -302,6 +310,7 @@ void ipv6_sock_mc_close(struct sock *sk) if (!rcu_access_pointer(np->ipv6_mc_list)) return; + rtnl_lock(); spin_lock(&ipv6_sk_mc_lock); while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { @@ -328,6 +337,7 @@ void ipv6_sock_mc_close(struct sock *sk) spin_lock(&ipv6_sk_mc_lock); } spin_unlock(&ipv6_sk_mc_lock); + rtnl_unlock(); } int ip6_mc_source(int add, int omode, struct sock *sk, @@ -845,6 +855,8 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) struct ifmcaddr6 *mc; struct inet6_dev *idev; + ASSERT_RTNL(); + /* we need to take a reference on idev */ idev = in6_dev_get(dev); @@ -916,6 +928,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, **map; + ASSERT_RTNL(); + write_lock_bh(&idev->lock); for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { -- cgit v1.2.2 From f24062b07dda89b0e24fa48e7bc3865a725f5ee6 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 3 Sep 2014 23:59:21 +0200 Subject: ipv6: fix a refcnt leak with peer addr There is no reason to take a refcnt before deleting the peer address route. It's done some lines below for the local prefix route because inet6_ifa_finish_destroy() will release it at the end. For the peer address route, we want to free it right now. This bug has been introduced by commit caeaba79009c ("ipv6: add support of peer address"). Signed-off-by: Nicolas Dichtel Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index aa0e135b808c..ce761c7e6675 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4772,11 +4772,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL, dev->ifindex, 1); - if (rt) { - dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); - } + if (rt && ip6_del_rt(rt)) + dst_free(&rt->dst); } dst_hold(&ifp->rt->dst); -- cgit v1.2.2 From e7478dfc4656f4a739ed1b07cfd59c12f8eb112e Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 3 Sep 2014 23:59:22 +0200 Subject: ipv6: use addrconf_get_prefix_route() to remove peer addr addrconf_get_prefix_route() ensures to get the right route in the right table. Signed-off-by: Nicolas Dichtel Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ce761c7e6675..fc1fac2a0528 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4768,10 +4768,9 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_leave_solict(ifp->idev, &ifp->addr); if (!ipv6_addr_any(&ifp->peer_addr)) { struct rt6_info *rt; - struct net_device *dev = ifp->idev->dev; - rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL, - dev->ifindex, 1); + rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, + ifp->idev->dev, 0, 0); if (rt && ip6_del_rt(rt)) dst_free(&rt->dst); } -- cgit v1.2.2 From de185ab46cb02df9738b0d898b0c3a89181c5526 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 5 Sep 2014 14:33:00 -0700 Subject: ipv6: restore the behavior of ipv6_sock_ac_drop() It is possible that the interface is already gone after joining the list of anycast on this interface as we don't hold a refcount for the device, in this case we are safe to ignore the error. What's more important, for API compatibility we should not change this behavior for applications even if it were correct. Fixes: commit a9ed4a2986e13011 ("ipv6: fix rtnl locking in setsockopt for anycast and multicast") Cc: Sabrina Dubroca Cc: David S. Miller Signed-off-by: Cong Wang Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/anycast.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 45b9d81d91e8..ff2de7d9d8e6 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -182,8 +182,6 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); - if (!dev) - return -ENODEV; return 0; } -- cgit v1.2.2 From 381f4dca48d23e155b936b86ccd3ff12f073cf0f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 10 Sep 2014 23:23:02 +0200 Subject: ipv6: clean up anycast when an interface is destroyed If we try to rmmod the driver for an interface while sockets with setsockopt(JOIN_ANYCAST) are alive, some refcounts aren't cleaned up and we get stuck on: unregister_netdevice: waiting for ens3 to become free. Usage count = 1 If we LEAVE_ANYCAST/close everything before rmmod'ing, there is no problem. We need to perform a cleanup similar to the one for multicast in addrconf_ifdown(how == 1). Signed-off-by: Sabrina Dubroca Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 +++++--- net/ipv6/anycast.c | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index fc1fac2a0528..3342ee64f2e3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3094,11 +3094,13 @@ static int addrconf_ifdown(struct net_device *dev, int how) write_unlock_bh(&idev->lock); - /* Step 5: Discard multicast list */ - if (how) + /* Step 5: Discard anycast and multicast list */ + if (how) { + ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); - else + } else { ipv6_mc_down(idev); + } idev->tstamp = jiffies; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index ff2de7d9d8e6..9a386842fd62 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -351,6 +351,27 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) return __ipv6_dev_ac_dec(idev, addr); } +void ipv6_ac_destroy_dev(struct inet6_dev *idev) +{ + struct ifacaddr6 *aca; + + write_lock_bh(&idev->lock); + while ((aca = idev->ac_list) != NULL) { + idev->ac_list = aca->aca_next; + write_unlock_bh(&idev->lock); + + addrconf_leave_solict(idev, &aca->aca_addr); + + dst_hold(&aca->aca_rt->dst); + ip6_del_rt(aca->aca_rt); + + aca_put(aca); + + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + /* * check if the interface has this anycast address * called with rcu_read_lock() -- cgit v1.2.2 From f92ee61982d6da15a9e49664ecd6405a15a2ee56 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 16 Sep 2014 10:08:40 +0200 Subject: xfrm: Generate blackhole routes only from route lookup functions Currently we genarate a blackhole route route whenever we have matching policies but can not resolve the states. Here we assume that dst_output() is called to kill the balckholed packets. Unfortunately this assumption is not true in all cases, so it is possible that these packets leave the system unwanted. We fix this by generating blackhole routes only from the route lookup functions, here we can guarantee a call to dst_output() afterwards. Fixes: 2774c131b1d ("xfrm: Handle blackhole route creation via afinfo.") Reported-by: Konstantinos Kolelis Signed-off-by: Steffen Klassert --- net/ipv6/ip6_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 315a55d66079..0a3448b2888f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1009,7 +1009,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (final_dst) fl6->daddr = *final_dst; - return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); @@ -1041,7 +1041,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (final_dst) fl6->daddr = *final_dst; - return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); -- cgit v1.2.2 From 5a4ee9a9a066b1600509d968e1e9eab37c8501d8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 24 Sep 2014 11:03:00 +0200 Subject: ip6gre: add a rtnl link alias for ip6gretap With this alias, we don't need to load manually the module before adding an ip6gretap interface with iproute2. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 5f19dfbc4c6a..d172ec4ec9d3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1724,4 +1724,5 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); +MODULE_ALIAS_RTNL_LINK("ip6gretap"); MODULE_ALIAS_NETDEV("ip6gre0"); -- cgit v1.2.2 From 2b0bb01b6edb3e13c7f71e43bf3a173a795b7b66 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 22 Sep 2014 10:07:24 +0200 Subject: ip6_tunnel: Return an error when adding an existing tunnel. ip6_tnl_locate() should not return an existing tunnel if create is true. Otherwise it is possible to add the same tunnel multiple times without getting an error. So return NULL if the tunnel that should be created already exists. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f9de5a695072..69a84b464009 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -364,8 +364,12 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) + ipv6_addr_equal(remote, &t->parms.raddr)) { + if (create) + return NULL; + return t; + } } if (!create) return NULL; -- cgit v1.2.2 From d814b847be7b47575a1cc2194760d3461e1c43c8 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 22 Sep 2014 10:07:25 +0200 Subject: ip6_vti: Return an error when adding an existing tunnel. vti6_locate() should not return an existing tunnel if create is true. Otherwise it is possible to add the same tunnel multiple times without getting an error. So return NULL if the tunnel that should be created already exists. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 7f52fd9fa7b0..5833a2244467 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -253,8 +253,12 @@ static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) + ipv6_addr_equal(remote, &t->parms.raddr)) { + if (create) + return NULL; + return t; + } } if (!create) return NULL; -- cgit v1.2.2 From cd0a0bd9b8e157b19aa38eeac30c60f1a0d010bd Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 22 Sep 2014 10:07:26 +0200 Subject: ip6_gre: Return an error when adding an existing tunnel. ip6gre_tunnel_locate() should not return an existing tunnel if create is true. Otherwise it is possible to add the same tunnel multiple times without getting an error. So return NULL if the tunnel that should be created already exists. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d172ec4ec9d3..f304471477dc 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -314,6 +314,8 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE); + if (t && create) + return NULL; if (t || !create) return t; -- cgit v1.2.2 From 705f1c869d577c8055736dd02501f26a2507dd5b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sun, 28 Sep 2014 00:46:06 +0200 Subject: ipv6: remove rt6i_genid Eric Dumazet noticed that all no-nonexthop or no-gateway routes which are already marked DST_HOST (e.g. input routes routes) will always be invalidated during sk_dst_check. Thus per-socket dst caching absolutely had no effect and early demuxing had no effect. Thus this patch removes rt6i_genid: fn_sernum already gets modified during add operations, so we only must ensure we mutate fn_sernum during ipv6 address remove operations. This is a fairly cost extensive operations, but address removal should not happen that often. Also our mtu update functions do the same and we heard no complains so far. xfrm policy changes also cause a call into fib6_flush_trees. Also plug a hole in rt6_info (no cacheline changes). I verified via tracing that this change has effect. Cc: Eric Dumazet Cc: YOSHIFUJI Hideaki Cc: Vlad Yasevich Cc: Nicolas Dichtel Cc: Martin Lau Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 ++- net/ipv6/addrconf_core.c | 7 +++++++ net/ipv6/ip6_fib.c | 20 ++++++++++++++++++++ net/ipv6/route.c | 4 ---- 4 files changed, 29 insertions(+), 5 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3342ee64f2e3..3e118dfddd02 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4780,10 +4780,11 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (ip6_del_rt(ifp->rt)) dst_free(&ifp->rt->dst); + + rt_genid_bump_ipv6(net); break; } atomic_inc(&net->ipv6.dev_addr_genid); - rt_genid_bump_ipv6(net); } static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index e6960457f625..98cc4cd570e2 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -8,6 +8,13 @@ #include #include +/* if ipv6 module registers this function is used by xfrm to force all + * sockets to relookup their nodes - this is fairly expensive, be + * careful + */ +void (*__fib6_flush_trees)(struct net *); +EXPORT_SYMBOL(__fib6_flush_trees); + #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) static inline unsigned int ipv6_addr_scope2type(unsigned int scope) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 76b7f5ee8f4c..97b9fa8de377 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1605,6 +1605,24 @@ static void fib6_prune_clones(struct net *net, struct fib6_node *fn) fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL); } +static int fib6_update_sernum(struct rt6_info *rt, void *arg) +{ + __u32 sernum = *(__u32 *)arg; + + if (rt->rt6i_node && + rt->rt6i_node->fn_sernum != sernum) + rt->rt6i_node->fn_sernum = sernum; + + return 0; +} + +static void fib6_flush_trees(struct net *net) +{ + __u32 new_sernum = fib6_new_sernum(); + + fib6_clean_all(net, fib6_update_sernum, &new_sernum); +} + /* * Garbage collection */ @@ -1788,6 +1806,8 @@ int __init fib6_init(void) NULL); if (ret) goto out_unregister_subsys; + + __fib6_flush_trees = fib6_flush_trees; out: return ret; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f23fbd28a501..bafde82324c5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -314,7 +314,6 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); - rt->rt6i_genid = rt_genid_ipv6(net); INIT_LIST_HEAD(&rt->rt6i_siblings); } return rt; @@ -1098,9 +1097,6 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev))) - return NULL; - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) return NULL; -- cgit v1.2.2