Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c | 21
-rw-r--r--  net/ipv4/arp.c | 150
-rw-r--r--  net/ipv4/cipso_ipv4.c | 42
-rw-r--r--  net/ipv4/devinet.c | 91
-rw-r--r--  net/ipv4/esp4.c | 2
-rw-r--r--  net/ipv4/fib_frontend.c | 137
-rw-r--r--  net/ipv4/fib_lookup.h | 4
-rw-r--r--  net/ipv4/fib_rules.c | 39
-rw-r--r--  net/ipv4/fib_semantics.c | 43
-rw-r--r--  net/ipv4/fib_trie.c | 1767
-rw-r--r--  net/ipv4/fou.c | 233
-rw-r--r--  net/ipv4/geneve.c | 14
-rw-r--r--  net/ipv4/gre_offload.c | 4
-rw-r--r--  net/ipv4/icmp.c | 6
-rw-r--r--  net/ipv4/igmp.c | 72
-rw-r--r--  net/ipv4/inet_connection_sock.c | 232
-rw-r--r--  net/ipv4/inet_diag.c | 480
-rw-r--r--  net/ipv4/inet_fragment.c | 4
-rw-r--r--  net/ipv4/inet_hashtables.c | 72
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 277
-rw-r--r--  net/ipv4/ip_forward.c | 12
-rw-r--r--  net/ipv4/ip_fragment.c | 25
-rw-r--r--  net/ipv4/ip_gre.c | 14
-rw-r--r--  net/ipv4/ip_input.c | 17
-rw-r--r--  net/ipv4/ip_options.c | 2
-rw-r--r--  net/ipv4/ip_output.c | 92
-rw-r--r--  net/ipv4/ip_sockglue.c | 67
-rw-r--r--  net/ipv4/ip_tunnel.c | 21
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 3
-rw-r--r--  net/ipv4/ip_vti.c | 12
-rw-r--r--  net/ipv4/ipcomp.c | 2
-rw-r--r--  net/ipv4/ipconfig.c | 6
-rw-r--r--  net/ipv4/ipip.c | 12
-rw-r--r--  net/ipv4/ipmr.c | 88
-rw-r--r--  net/ipv4/netfilter.c | 4
-rw-r--r--  net/ipv4/netfilter/Kconfig | 38
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 11
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c | 7
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 19
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 17
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 17
-rw-r--r--  net/ipv4/netfilter/ipt_SYNPROXY.c | 6
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 8
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 19
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c | 29
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c | 7
-rw-r--r--  net/ipv4/netfilter/iptable_security.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 28
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_log_arp.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_log_ipv4.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 33
-rw-r--r--  net/ipv4/netfilter/nf_reject_ipv4.c | 29
-rw-r--r--  net/ipv4/netfilter/nf_tables_arp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_tables_ipv4.c | 12
-rw-r--r--  net/ipv4/netfilter/nft_chain_nat_ipv4.c | 29
-rw-r--r--  net/ipv4/netfilter/nft_chain_route_ipv4.c | 6
-rw-r--r--  net/ipv4/netfilter/nft_masq_ipv4.c | 9
-rw-r--r--  net/ipv4/netfilter/nft_redir_ipv4.c | 11
-rw-r--r--  net/ipv4/netfilter/nft_reject_ipv4.c | 9
-rw-r--r--  net/ipv4/ping.c | 30
-rw-r--r--  net/ipv4/proc.c | 2
-rw-r--r--  net/ipv4/raw.c | 20
-rw-r--r--  net/ipv4/route.c | 45
-rw-r--r--  net/ipv4/syncookies.c | 24
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 20
-rw-r--r--  net/ipv4/tcp.c | 81
-rw-r--r--  net/ipv4/tcp_cong.c | 8
-rw-r--r--  net/ipv4/tcp_cubic.c | 6
-rw-r--r--  net/ipv4/tcp_dctcp.c | 5
-rw-r--r--  net/ipv4/tcp_diag.c | 6
-rw-r--r--  net/ipv4/tcp_fastopen.c | 20
-rw-r--r--  net/ipv4/tcp_illinois.c | 6
-rw-r--r--  net/ipv4/tcp_input.c | 217
-rw-r--r--  net/ipv4/tcp_ipv4.c | 226
-rw-r--r--  net/ipv4/tcp_metrics.c | 208
-rw-r--r--  net/ipv4/tcp_minisocks.c | 66
-rw-r--r--  net/ipv4/tcp_offload.c | 4
-rw-r--r--  net/ipv4/tcp_output.c | 241
-rw-r--r--  net/ipv4/tcp_timer.c | 21
-rw-r--r--  net/ipv4/tcp_vegas.c | 5
-rw-r--r--  net/ipv4/tcp_vegas.h | 2
-rw-r--r--  net/ipv4/tcp_westwood.c | 6
-rw-r--r--  net/ipv4/udp.c | 44
-rw-r--r--  net/ipv4/udp_diag.c | 24
-rw-r--r--  net/ipv4/udp_impl.h | 4
-rw-r--r--  net/ipv4/udp_offload.c | 4
-rw-r--r--  net/ipv4/udp_tunnel.c | 4
-rw-r--r--  net/ipv4/xfrm4_input.c | 7
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 2
-rw-r--r--  net/ipv4/xfrm4_output.c | 14
-rw-r--r--  net/ipv4/xfrm4_policy.c | 3
93 files changed, 3105 insertions, 2711 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d2e49baaff63..8b47a4d79d04 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -217,7 +217,7 @@ int inet_listen(struct socket *sock, int backlog)
 	 * shutdown() (rather than close()).
 	 */
 	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
-	    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+	    !inet_csk(sk)->icsk_accept_queue.fastopenq) {
 		if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
 			err = fastopen_init_queue(sk, backlog);
 		else if ((sysctl_tcp_fastopen &
@@ -314,11 +314,11 @@ lookup_protocol:
 	answer_flags = answer->flags;
 	rcu_read_unlock();
 
-	WARN_ON(answer_prot->slab == NULL);
+	WARN_ON(!answer_prot->slab);
 
 	err = -ENOBUFS;
 	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
-	if (sk == NULL)
+	if (!sk)
 		goto out;
 
 	err = 0;
@@ -716,8 +716,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 }
 EXPORT_SYMBOL(inet_getname);
 
-int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
-		 size_t size)
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 {
 	struct sock *sk = sock->sk;
 
@@ -728,7 +727,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	    inet_autobind(sk))
 		return -EAGAIN;
 
-	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+	return sk->sk_prot->sendmsg(sk, msg, size);
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
@@ -750,8 +749,8 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 }
 EXPORT_SYMBOL(inet_sendpage);
 
-int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
-		 size_t size, int flags)
+int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+		 int flags)
 {
 	struct sock *sk = sock->sk;
 	int addr_len = 0;
@@ -759,7 +758,7 @@ int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 
 	sock_rps_record_flow(sk);
 
-	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
 				   flags & ~MSG_DONTWAIT, &addr_len);
 	if (err >= 0)
 		msg->msg_namelen = addr_len;
@@ -1270,7 +1269,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		if (udpfrag) {
 			iph->id = htons(id);
 			iph->frag_off = htons(offset >> 3);
-			if (skb->next != NULL)
+			if (skb->next)
 				iph->frag_off |= htons(IP_MF);
 			offset += skb->len - nhoff - ihl;
 		} else {
@@ -1675,7 +1674,7 @@ static int __init inet_init(void)
 	struct list_head *r;
 	int rc = -EINVAL;
 
-	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
+	sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
 
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
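
The recurring `ptr == NULL` → `!ptr` churn above (and in every file below) is purely stylistic; it is the form scripts/checkpatch.pl suggests. A minimal stand-alone sketch showing why the conversion is safe to apply mechanically — demo() is hypothetical and not part of the patch:

	#include <stdio.h>

	/* demo() is illustrative only: both tests are the same condition,
	 * so the rewrite cannot change behavior. */
	static int demo(const char *p)
	{
		return (p == NULL) == (!p);	/* always true */
	}

	int main(void)
	{
		printf("%d %d\n", demo(NULL), demo("x"));	/* prints: 1 1 */
		return 0;
	}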
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 205e1472aa78..933a92820d26 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -122,6 +122,7 @@
  *	Interface to generic neighbour cache.
  */
 static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static bool arp_key_eq(const struct neighbour *n, const void *pkey);
 static int arp_constructor(struct neighbour *neigh);
 static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
 static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -149,18 +150,12 @@ static const struct neigh_ops arp_direct_ops = {
 	.connected_output = neigh_direct_output,
 };
 
-static const struct neigh_ops arp_broken_ops = {
-	.family = AF_INET,
-	.solicit = arp_solicit,
-	.error_report = arp_error_report,
-	.output = neigh_compat_output,
-	.connected_output = neigh_compat_output,
-};
-
 struct neigh_table arp_tbl = {
 	.family		= AF_INET,
 	.key_len	= 4,
+	.protocol	= cpu_to_be16(ETH_P_IP),
 	.hash		= arp_hash,
+	.key_eq		= arp_key_eq,
 	.constructor	= arp_constructor,
 	.proxy_redo	= parp_redo,
 	.id		= "arp_cache",
@@ -216,7 +211,12 @@ static u32 arp_hash(const void *pkey,
 		    const struct net_device *dev,
 		    __u32 *hash_rnd)
 {
-	return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd);
+	return arp_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+	return neigh_key_eq32(neigh, pkey);
 }
 
 static int arp_constructor(struct neighbour *neigh)
@@ -228,7 +228,7 @@ static int arp_constructor(struct neighbour *neigh)
 
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dev);
-	if (in_dev == NULL) {
+	if (!in_dev) {
 		rcu_read_unlock();
 		return -EINVAL;
 	}
@@ -260,35 +260,6 @@ static int arp_constructor(struct neighbour *neigh)
 		   in old paradigm.
 		 */
 
-#if 1
-		/* So... these "amateur" devices are hopeless.
-		   The only thing, that I can say now:
-		   It is very sad that we need to keep ugly obsolete
-		   code to make them happy.
-
-		   They should be moved to more reasonable state, now
-		   they use rebuild_header INSTEAD OF hard_start_xmit!!!
-		   Besides that, they are sort of out of date
-		   (a lot of redundant clones/copies, useless in 2.1),
-		   I wonder why people believe that they work.
-		 */
-		switch (dev->type) {
-		default:
-			break;
-		case ARPHRD_ROSE:
-#if IS_ENABLED(CONFIG_AX25)
-		case ARPHRD_AX25:
-#if IS_ENABLED(CONFIG_NETROM)
-		case ARPHRD_NETROM:
-#endif
-			neigh->ops = &arp_broken_ops;
-			neigh->output = neigh->ops->output;
-			return 0;
-#else
-			break;
-#endif
-		}
-#endif
 		if (neigh->type == RTN_MULTICAST) {
 			neigh->nud_state = NUD_NOARP;
 			arp_mc_map(addr, neigh->ha, dev, 1);
@@ -433,71 +404,6 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 	return flag;
 }
 
-/* OBSOLETE FUNCTIONS */
-
-/*
- *	Find an arp mapping in the cache. If not found, post a request.
- *
- *	It is very UGLY routine: it DOES NOT use skb->dst->neighbour,
- *	even if it exists. It is supposed that skb->dev was mangled
- *	by a virtual device (eql, shaper). Nobody but broken devices
- *	is allowed to use this function, it is scheduled to be removed. --ANK
- */
-
-static int arp_set_predefined(int addr_hint, unsigned char *haddr,
-			      __be32 paddr, struct net_device *dev)
-{
-	switch (addr_hint) {
-	case RTN_LOCAL:
-		pr_debug("arp called for own IP address\n");
-		memcpy(haddr, dev->dev_addr, dev->addr_len);
-		return 1;
-	case RTN_MULTICAST:
-		arp_mc_map(paddr, haddr, dev, 1);
-		return 1;
-	case RTN_BROADCAST:
-		memcpy(haddr, dev->broadcast, dev->addr_len);
-		return 1;
-	}
-	return 0;
-}
-
-
-int arp_find(unsigned char *haddr, struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-	__be32 paddr;
-	struct neighbour *n;
-
-	if (!skb_dst(skb)) {
-		pr_debug("arp_find is called with dst==NULL\n");
-		kfree_skb(skb);
-		return 1;
-	}
-
-	paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
-	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
-			       paddr, dev))
-		return 0;
-
-	n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
-
-	if (n) {
-		n->used = jiffies;
-		if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
-			neigh_ha_snapshot(haddr, n, dev);
-			neigh_release(n);
-			return 0;
-		}
-		neigh_release(n);
-	} else
-		kfree_skb(skb);
-	return 1;
-}
-EXPORT_SYMBOL(arp_find);
-
-/* END OF OBSOLETE FUNCTIONS */
-
 /*
  * Check if we can use proxy ARP for this path
  */
@@ -569,7 +475,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev,
  */
 
 /*
- *	Create an arp packet. If (dest_hw == NULL), we create a broadcast
+ *	Create an arp packet. If dest_hw is not set, we create a broadcast
  *	message.
  */
 struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
@@ -589,7 +495,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	 */
 
 	skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
-	if (skb == NULL)
+	if (!skb)
 		return NULL;
 
 	skb_reserve(skb, hlen);
@@ -597,9 +503,9 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
 	skb->dev = dev;
 	skb->protocol = htons(ETH_P_ARP);
-	if (src_hw == NULL)
+	if (!src_hw)
 		src_hw = dev->dev_addr;
-	if (dest_hw == NULL)
+	if (!dest_hw)
 		dest_hw = dev->broadcast;
 
 	/*
@@ -663,7 +569,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 		break;
 #endif
 	default:
-		if (target_hw != NULL)
+		if (target_hw)
 			memcpy(arp_ptr, target_hw, dev->addr_len);
 		else
 			memset(arp_ptr, 0, dev->addr_len);
@@ -685,7 +591,8 @@ EXPORT_SYMBOL(arp_create);
 void arp_xmit(struct sk_buff *skb)
 {
 	/* Send it off, maybe filter it using firewalling first. */
-	NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+	NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb,
+		NULL, skb->dev, dev_queue_xmit_sk);
 }
 EXPORT_SYMBOL(arp_xmit);
 
@@ -708,7 +615,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 
 	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
 			 dest_hw, src_hw, target_hw);
-	if (skb == NULL)
+	if (!skb)
 		return;
 
 	arp_xmit(skb);
@@ -719,7 +626,7 @@ EXPORT_SYMBOL(arp_send);
  *	Process an arp request.
  */
 
-static int arp_process(struct sk_buff *skb)
+static int arp_process(struct sock *sk, struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -738,7 +645,7 @@ static int arp_process(struct sk_buff *skb)
 	 *	is ARP'able.
 	 */
 
-	if (in_dev == NULL)
+	if (!in_dev)
 		goto out;
 
 	arp = arp_hdr(skb);
@@ -902,7 +809,7 @@ static int arp_process(struct sk_buff *skb)
 	is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
 		  inet_addr_type(net, sip) == RTN_UNICAST;
 
-	if (n == NULL &&
+	if (!n &&
 	    ((arp->ar_op == htons(ARPOP_REPLY) &&
 	      inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
 		n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
@@ -940,7 +847,7 @@ out:
 
 static void parp_redo(struct sk_buff *skb)
 {
-	arp_process(skb);
+	arp_process(NULL, skb);
}
 
 
@@ -973,7 +880,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
 
-	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb,
+		       dev, NULL, arp_process);
 
 consumeskb:
 	consume_skb(skb);
@@ -994,7 +902,7 @@ out_of_mem:
 
 static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
 {
-	if (dev == NULL) {
+	if (!dev) {
 		IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
 		return 0;
 	}
@@ -1020,7 +928,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 			return -ENODEV;
 	}
 	if (mask) {
-		if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL)
+		if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1))
 			return -ENOBUFS;
 		return 0;
 	}
@@ -1041,7 +949,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
-	if (dev == NULL) {
+	if (!dev) {
 		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
 
 		if (IS_ERR(rt))
@@ -1161,7 +1069,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 		return arp_req_delete_public(net, r, dev);
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
-	if (dev == NULL) {
+	if (!dev) {
 		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
@@ -1210,7 +1118,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	if (r.arp_dev[0]) {
 		err = -ENODEV;
 		dev = __dev_get_by_name(net, r.arp_dev);
-		if (dev == NULL)
+		if (!dev)
 			goto out;
 
 		/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
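
The NF_HOOK() and arp_process() changes above track the netfilter API of this kernel generation: the originating socket is threaded through the hook and into the okfn continuation, and it is NULL for ARP traffic, as parp_redo()'s new arp_process(NULL, skb) call shows. A sketch of a continuation under that convention — my_okfn is hypothetical and only illustrates the assumed two-argument shape:

	#include <linux/netdevice.h>
	#include <linux/skbuff.h>
	#include <net/sock.h>

	/* Hypothetical okfn: the socket comes first and may be NULL,
	 * since ARP packets have no originating socket. */
	static int my_okfn(struct sock *sk, struct sk_buff *skb)
	{
		(void)sk;
		return dev_queue_xmit(skb);
	}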
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index e361ea6f3fc8..bdb2a07ec363 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -255,7 +255,7 @@ static int __init cipso_v4_cache_init(void)
 	cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS,
 				 sizeof(struct cipso_v4_map_cache_bkt),
 				 GFP_KERNEL);
-	if (cipso_v4_cache == NULL)
+	if (!cipso_v4_cache)
 		return -ENOMEM;
 
 	for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
@@ -339,7 +339,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
 			secattr->cache = entry->lsm_data;
 			secattr->flags |= NETLBL_SECATTR_CACHE;
 			secattr->type = NETLBL_NLTYPE_CIPSOV4;
-			if (prev_entry == NULL) {
+			if (!prev_entry) {
 				spin_unlock_bh(&cipso_v4_cache[bkt].lock);
 				return 0;
 			}
@@ -393,10 +393,10 @@ int cipso_v4_cache_add(const unsigned char *cipso_ptr,
 	cipso_ptr_len = cipso_ptr[1];
 
 	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
-	if (entry == NULL)
+	if (!entry)
 		return -ENOMEM;
 	entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC);
-	if (entry->key == NULL) {
+	if (!entry->key) {
 		ret_val = -ENOMEM;
 		goto cache_add_failure;
 	}
@@ -502,7 +502,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
 	atomic_set(&doi_def->refcount, 1);
 
 	spin_lock(&cipso_v4_doi_list_lock);
-	if (cipso_v4_doi_search(doi_def->doi) != NULL) {
+	if (cipso_v4_doi_search(doi_def->doi)) {
 		spin_unlock(&cipso_v4_doi_list_lock);
 		ret_val = -EEXIST;
 		goto doi_add_return;
@@ -513,7 +513,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
 
 doi_add_return:
 	audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
-	if (audit_buf != NULL) {
+	if (audit_buf) {
 		const char *type_str;
 		switch (doi_type) {
 		case CIPSO_V4_MAP_TRANS:
@@ -547,7 +547,7 @@ doi_add_return:
 */
 void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
 {
-	if (doi_def == NULL)
+	if (!doi_def)
 		return;
 
 	switch (doi_def->type) {
@@ -598,7 +598,7 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
 
 	spin_lock(&cipso_v4_doi_list_lock);
 	doi_def = cipso_v4_doi_search(doi);
-	if (doi_def == NULL) {
+	if (!doi_def) {
 		spin_unlock(&cipso_v4_doi_list_lock);
 		ret_val = -ENOENT;
 		goto doi_remove_return;
@@ -617,7 +617,7 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
 
 doi_remove_return:
 	audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
-	if (audit_buf != NULL) {
+	if (audit_buf) {
 		audit_log_format(audit_buf,
 				 " cipso_doi=%u res=%u",
 				 doi, ret_val == 0 ? 1 : 0);
@@ -644,7 +644,7 @@ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
 
 	rcu_read_lock();
 	doi_def = cipso_v4_doi_search(doi);
-	if (doi_def == NULL)
+	if (!doi_def)
 		goto doi_getdef_return;
 	if (!atomic_inc_not_zero(&doi_def->refcount))
 		doi_def = NULL;
@@ -664,7 +664,7 @@ doi_getdef_return:
 */
 void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
 {
-	if (doi_def == NULL)
+	if (!doi_def)
 		return;
 
 	if (!atomic_dec_and_test(&doi_def->refcount))
@@ -1642,7 +1642,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
 
 	rcu_read_lock();
 	doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
-	if (doi_def == NULL) {
+	if (!doi_def) {
 		err_offset = 2;
 		goto validate_return_locked;
 	}
@@ -1736,7 +1736,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
 			 * not the loopback device drop the packet. Further,
 			 * there is no legitimate reason for setting this from
 			 * userspace so reject it if skb is NULL. */
-			if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
+			if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) {
 				err_offset = opt_iter;
 				goto validate_return_locked;
 			}
@@ -1897,7 +1897,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
 	 * defined yet but it is not a problem as the only users of these
 	 * "lite" PF_INET sockets are functions which do an accept() call
 	 * afterwards so we will label the socket as part of the accept(). */
-	if (sk == NULL)
+	if (!sk)
 		return 0;
 
 	/* We allocate the maximum CIPSO option size here so we are probably
@@ -1905,7 +1905,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
 	 * on and after all we are only talking about 40 bytes. */
 	buf_len = CIPSO_V4_OPT_LEN_MAX;
 	buf = kmalloc(buf_len, GFP_ATOMIC);
-	if (buf == NULL) {
+	if (!buf) {
 		ret_val = -ENOMEM;
 		goto socket_setattr_failure;
 	}
@@ -1921,7 +1921,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
 	 * set the IPOPT_CIPSO option. */
 	opt_len = (buf_len + 3) & ~3;
 	opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
-	if (opt == NULL) {
+	if (!opt) {
 		ret_val = -ENOMEM;
 		goto socket_setattr_failure;
 	}
@@ -1981,7 +1981,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
 	 * on and after all we are only talking about 40 bytes. */
 	buf_len = CIPSO_V4_OPT_LEN_MAX;
 	buf = kmalloc(buf_len, GFP_ATOMIC);
-	if (buf == NULL) {
+	if (!buf) {
 		ret_val = -ENOMEM;
 		goto req_setattr_failure;
 	}
@@ -1997,7 +1997,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
 	 * set the IPOPT_CIPSO option. */
 	opt_len = (buf_len + 3) & ~3;
 	opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
-	if (opt == NULL) {
+	if (!opt) {
 		ret_val = -ENOMEM;
 		goto req_setattr_failure;
 	}
@@ -2102,7 +2102,7 @@ void cipso_v4_sock_delattr(struct sock *sk)
 
 	sk_inet = inet_sk(sk);
 	opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
-	if (opt == NULL || opt->opt.cipso == 0)
+	if (!opt || opt->opt.cipso == 0)
 		return;
 
 	hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
@@ -2128,7 +2128,7 @@ void cipso_v4_req_delattr(struct request_sock *req)
 
 	req_inet = inet_rsk(req);
 	opt = req_inet->opt;
-	if (opt == NULL || opt->opt.cipso == 0)
+	if (!opt || opt->opt.cipso == 0)
 		return;
 
 	cipso_v4_delopt(&req_inet->opt);
@@ -2157,7 +2157,7 @@ int cipso_v4_getattr(const unsigned char *cipso,
 	doi = get_unaligned_be32(&cipso[2]);
 	rcu_read_lock();
 	doi_def = cipso_v4_doi_search(doi);
-	if (doi_def == NULL)
+	if (!doi_def)
 		goto getattr_return;
 	/* XXX - This code assumes only one tag per CIPSO option which isn't
 	 * really a good assumption to make but since we only support the MAC
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 3a8985c94581..419d23c53ec7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -107,7 +107,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 
 static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
 
-static u32 inet_addr_hash(struct net *net, __be32 addr)
+static u32 inet_addr_hash(const struct net *net, __be32 addr)
 {
 	u32 val = (__force u32) addr ^ net_hash_mix(net);
 
@@ -548,6 +548,26 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
 	return NULL;
 }
 
+static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
+{
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = ifa->ifa_address,
+		.imr_ifindex = ifa->ifa_dev->dev->ifindex,
+	};
+	int ret;
+
+	ASSERT_RTNL();
+
+	lock_sock(sk);
+	if (join)
+		ret = ip_mc_join_group(sk, &mreq);
+	else
+		ret = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+
+	return ret;
+}
+
 static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
@@ -565,7 +585,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	ifm = nlmsg_data(nlh);
 	in_dev = inetdev_by_index(net, ifm->ifa_index);
-	if (in_dev == NULL) {
+	if (!in_dev) {
 		err = -ENODEV;
 		goto errout;
 	}
@@ -573,7 +593,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
 	for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
 	     ifap = &ifa->ifa_next) {
 		if (tb[IFA_LOCAL] &&
-		    ifa->ifa_local != nla_get_be32(tb[IFA_LOCAL]))
+		    ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
 			continue;
 
 		if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
@@ -581,9 +601,11 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 		if (tb[IFA_ADDRESS] &&
 		    (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
-		    !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
+		    !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa)))
 			continue;
 
+		if (ipv4_is_multicast(ifa->ifa_address))
+			ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa);
 		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
 		return 0;
 	}
@@ -733,21 +755,21 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
 
 	ifm = nlmsg_data(nlh);
 	err = -EINVAL;
-	if (ifm->ifa_prefixlen > 32 || tb[IFA_LOCAL] == NULL)
+	if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL])
 		goto errout;
 
 	dev = __dev_get_by_index(net, ifm->ifa_index);
 	err = -ENODEV;
-	if (dev == NULL)
+	if (!dev)
 		goto errout;
 
 	in_dev = __in_dev_get_rtnl(dev);
 	err = -ENOBUFS;
-	if (in_dev == NULL)
+	if (!in_dev)
 		goto errout;
 
 	ifa = inet_alloc_ifa();
-	if (ifa == NULL)
+	if (!ifa)
 		/*
 		 * A potential indev allocation can be left alive, it stays
 		 * assigned to its device and is destroy with it.
@@ -758,7 +780,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
 	neigh_parms_data_state_setall(in_dev->arp_parms);
 	in_dev_hold(in_dev);
 
-	if (tb[IFA_ADDRESS] == NULL)
+	if (!tb[IFA_ADDRESS])
 		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
 
 	INIT_HLIST_NODE(&ifa->hash);
@@ -769,11 +791,11 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
 	ifa->ifa_scope = ifm->ifa_scope;
 	ifa->ifa_dev = in_dev;
 
-	ifa->ifa_local = nla_get_be32(tb[IFA_LOCAL]);
-	ifa->ifa_address = nla_get_be32(tb[IFA_ADDRESS]);
+	ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
+	ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);
 
 	if (tb[IFA_BROADCAST])
-		ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]);
+		ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);
 
 	if (tb[IFA_LABEL])
 		nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
@@ -838,6 +860,15 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
 		 * userspace already relies on not having to provide this.
 		 */
 		set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+		if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
+			int ret = ip_mc_config(net->ipv4.mc_autojoin_sk,
+					       true, ifa);
+
+			if (ret < 0) {
+				inet_free_ifa(ifa);
+				return ret;
+			}
+		}
 		return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
 	} else {
 		inet_free_ifa(ifa);
@@ -1259,7 +1290,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
 	__be32 addr = 0;
 	struct net_device *dev;
 
-	if (in_dev != NULL)
+	if (in_dev)
 		return confirm_addr_indev(in_dev, dst, local, scope);
 
 	rcu_read_lock();
@@ -1309,7 +1340,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
 			if (named++ == 0)
 				goto skip;
 			dot = strchr(old, ':');
-			if (dot == NULL) {
+			if (!dot) {
 				sprintf(old, ":%d", named);
 				dot = old;
 			}
@@ -1478,7 +1509,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 	u32 preferred, valid;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
-	if (nlh == NULL)
+	if (!nlh)
 		return -EMSGSIZE;
 
 	ifm = nlmsg_data(nlh);
@@ -1510,11 +1541,11 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 		valid = INFINITY_LIFE_TIME;
 	}
 	if ((ifa->ifa_address &&
-	     nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+	     nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) ||
 	    (ifa->ifa_local &&
-	     nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) ||
+	     nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) ||
 	    (ifa->ifa_broadcast &&
-	     nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+	     nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
 	    (ifa->ifa_label[0] &&
 	     nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
 	    nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
@@ -1597,7 +1628,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 
 	net = dev_net(ifa->ifa_dev->dev);
 	skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
-	if (skb == NULL)
+	if (!skb)
 		goto errout;
 
 	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
@@ -1634,7 +1665,7 @@ static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
 		return -ENODATA;
 
 	nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
-	if (nla == NULL)
+	if (!nla)
 		return -EMSGSIZE;
 
 	for (i = 0; i < IPV4_DEVCONF_MAX; i++)
@@ -1723,7 +1754,7 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
-	if (nlh == NULL)
+	if (!nlh)
 		return -EMSGSIZE;
 
 	ncm = nlmsg_data(nlh);
@@ -1765,7 +1796,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
 	int err = -ENOBUFS;
 
 	skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
-	if (skb == NULL)
+	if (!skb)
 		goto errout;
 
 	err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
@@ -1822,10 +1853,10 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
 		break;
 	default:
 		dev = __dev_get_by_index(net, ifindex);
-		if (dev == NULL)
+		if (!dev)
 			goto errout;
 		in_dev = __in_dev_get_rtnl(dev);
-		if (in_dev == NULL)
+		if (!in_dev)
 			goto errout;
 		devconf = &in_dev->cnf;
 		break;
@@ -1833,7 +1864,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
 
 	err = -ENOBUFS;
 	skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
-	if (skb == NULL)
+	if (!skb)
 		goto errout;
 
 	err = inet_netconf_fill_devconf(skb, ifindex, devconf,
@@ -2184,7 +2215,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
 {
 	struct devinet_sysctl_table *t = cnf->sysctl;
 
-	if (t == NULL)
+	if (!t)
 		return;
 
 	cnf->sysctl = NULL;
@@ -2245,16 +2276,16 @@ static __net_init int devinet_init_net(struct net *net)
 
 	if (!net_eq(net, &init_net)) {
 		all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
-		if (all == NULL)
+		if (!all)
 			goto err_alloc_all;
 
 		dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
-		if (dflt == NULL)
+		if (!dflt)
 			goto err_alloc_dflt;
 
 #ifdef CONFIG_SYSCTL
 		tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
-		if (tbl == NULL)
+		if (!tbl)
 			goto err_alloc_ctl;
 
 		tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
@@ -2274,7 +2305,7 @@ static __net_init int devinet_init_net(struct net *net)
 
 	err = -ENOMEM;
 	forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
-	if (forw_hdr == NULL)
+	if (!forw_hdr)
 		goto err_reg_ctl;
 	net->ipv4.forw_hdr = forw_hdr;
 #endif
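
The new ip_mc_config() helper performs, from inside the kernel, the same group join/leave that userspace requests with IP_ADD_MEMBERSHIP/IP_DROP_MEMBERSHIP; that is what backs the IFA_F_MCAUTOJOIN handling added to inet_rtm_newaddr() and inet_rtm_deladdr(). For comparison, a minimal user-space sketch of the equivalent join (the group address and interface index are assumptions, not values from the patch):

	#include <string.h>
	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <sys/socket.h>

	/* Join 239.1.1.1 on a fixed interface; mirrors what
	 * ip_mc_config(sk, true, ifa) does via ip_mc_join_group(). */
	static int join_group(int fd)
	{
		struct ip_mreqn mreq;

		memset(&mreq, 0, sizeof(mreq));
		inet_pton(AF_INET, "239.1.1.1", &mreq.imr_multiaddr);
		mreq.imr_ifindex = 2;	/* assumed ifindex */

		return setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
				  &mreq, sizeof(mreq));
	}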
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 60173d4d3a0e..421a80b09b62 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -553,7 +553,7 @@ static int esp_init_authenc(struct xfrm_state *x)
 	int err;
 
 	err = -EINVAL;
-	if (x->ealg == NULL)
+	if (!x->ealg)
 		goto error;
 
 	err = -ENAMETOOLONG;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 57be71dd6a9e..872494e6e6eb 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -52,12 +52,12 @@ static int __net_init fib4_rules_init(struct net *net)
 {
 	struct fib_table *local_table, *main_table;
 
-	local_table = fib_trie_table(RT_TABLE_LOCAL);
-	if (local_table == NULL)
+	main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
+	if (!main_table)
 		return -ENOMEM;
 
-	main_table = fib_trie_table(RT_TABLE_MAIN);
-	if (main_table == NULL)
+	local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
+	if (!local_table)
 		goto fail;
 
 	hlist_add_head_rcu(&local_table->tb_hlist,
@@ -67,14 +67,14 @@ static int __net_init fib4_rules_init(struct net *net)
 	return 0;
 
 fail:
-	fib_free_table(local_table);
+	fib_free_table(main_table);
 	return -ENOMEM;
 }
 #else
 
 struct fib_table *fib_new_table(struct net *net, u32 id)
 {
-	struct fib_table *tb;
+	struct fib_table *tb, *alias = NULL;
 	unsigned int h;
 
 	if (id == 0)
@@ -83,23 +83,23 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	if (tb)
 		return tb;
 
-	tb = fib_trie_table(id);
+	if (id == RT_TABLE_LOCAL)
+		alias = fib_new_table(net, RT_TABLE_MAIN);
+
+	tb = fib_trie_table(id, alias);
 	if (!tb)
 		return NULL;
 
 	switch (id) {
 	case RT_TABLE_LOCAL:
-		net->ipv4.fib_local = tb;
+		rcu_assign_pointer(net->ipv4.fib_local, tb);
 		break;
-
 	case RT_TABLE_MAIN:
-		net->ipv4.fib_main = tb;
+		rcu_assign_pointer(net->ipv4.fib_main, tb);
 		break;
-
 	case RT_TABLE_DEFAULT:
-		net->ipv4.fib_default = tb;
+		rcu_assign_pointer(net->ipv4.fib_default, tb);
 		break;
-
 	default:
 		break;
 	}
@@ -129,16 +129,62 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 }
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
+static void fib_replace_table(struct net *net, struct fib_table *old,
+			      struct fib_table *new)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	switch (new->tb_id) {
+	case RT_TABLE_LOCAL:
+		rcu_assign_pointer(net->ipv4.fib_local, new);
+		break;
+	case RT_TABLE_MAIN:
+		rcu_assign_pointer(net->ipv4.fib_main, new);
+		break;
+	case RT_TABLE_DEFAULT:
+		rcu_assign_pointer(net->ipv4.fib_default, new);
+		break;
+	default:
+		break;
+	}
+
+#endif
+	/* replace the old table in the hlist */
+	hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
+}
+
+int fib_unmerge(struct net *net)
+{
+	struct fib_table *old, *new;
+
+	/* attempt to fetch local table if it has been allocated */
+	old = fib_get_table(net, RT_TABLE_LOCAL);
+	if (!old)
+		return 0;
+
+	new = fib_trie_unmerge(old);
+	if (!new)
+		return -ENOMEM;
+
+	/* replace merged table with clean table */
+	if (new != old) {
+		fib_replace_table(net, old, new);
+		fib_free_table(old);
+	}
+
+	return 0;
+}
+
 static void fib_flush(struct net *net)
 {
 	int flushed = 0;
-	struct fib_table *tb;
-	struct hlist_head *head;
 	unsigned int h;
 
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
-		head = &net->ipv4.fib_table_hash[h];
-		hlist_for_each_entry(tb, head, tb_hlist)
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct hlist_node *tmp;
+		struct fib_table *tb;
+
+		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
 			flushed += fib_table_flush(tb);
 	}
 
@@ -146,6 +192,19 @@ static void fib_flush(struct net *net)
 		rt_cache_flush(net);
 }
 
+void fib_flush_external(struct net *net)
+{
+	struct fib_table *tb;
+	struct hlist_head *head;
+	unsigned int h;
+
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		head = &net->ipv4.fib_table_hash[h];
+		hlist_for_each_entry(tb, head, tb_hlist)
+			fib_table_flush_external(tb);
+	}
+}
+
 /*
  * Find address type as if only "dev" was present in the system. If
  * on_dev is NULL then all interfaces are taken into consideration.
@@ -427,7 +486,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 		for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
 			if (strcmp(ifa->ifa_label, devname) == 0)
 				break;
-		if (ifa == NULL)
+		if (!ifa)
 			return -ENODEV;
 		cfg->fc_prefsrc = ifa->ifa_local;
 	}
@@ -455,7 +514,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 		int len = 0;
 
 		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
-		if (mx == NULL)
+		if (!mx)
 			return -ENOMEM;
 
 		if (rt->rt_flags & RTF_MTU)
@@ -617,7 +676,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 		goto errout;
 
 	tb = fib_get_table(net, cfg.fc_table);
-	if (tb == NULL) {
+	if (!tb) {
 		err = -ESRCH;
 		goto errout;
 	}
@@ -639,7 +698,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 		goto errout;
 
 	tb = fib_new_table(net, cfg.fc_table);
-	if (tb == NULL) {
+	if (!tb) {
 		err = -ENOBUFS;
 		goto errout;
 	}
@@ -665,10 +724,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
+	rcu_read_lock();
+
 	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
 		e = 0;
 		head = &net->ipv4.fib_table_hash[h];
-		hlist_for_each_entry(tb, head, tb_hlist) {
+		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
 			if (e < s_e)
 				goto next;
 			if (dumped)
@@ -682,6 +743,8 @@ next:
 		}
 	}
 out:
+	rcu_read_unlock();
+
 	cb->args[1] = e;
 	cb->args[0] = h;
 
@@ -716,7 +779,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
 	else
 		tb = fib_new_table(net, RT_TABLE_LOCAL);
 
-	if (tb == NULL)
+	if (!tb)
 		return;
 
 	cfg.fc_table = tb->tb_id;
@@ -743,7 +806,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 
 	if (ifa->ifa_flags & IFA_F_SECONDARY) {
 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
-		if (prim == NULL) {
+		if (!prim) {
 			pr_warn("%s: bug: prim == NULL\n", __func__);
 			return;
 		}
@@ -797,7 +860,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 
 	if (ifa->ifa_flags & IFA_F_SECONDARY) {
 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
-		if (prim == NULL) {
+		if (!prim) {
 			pr_warn("%s: bug: prim == NULL\n", __func__);
 			return;
 		}
@@ -967,7 +1030,7 @@ static void nl_fib_input(struct sk_buff *skb)
 		return;
 
 	skb = netlink_skb_clone(skb, GFP_KERNEL);
-	if (skb == NULL)
+	if (!skb)
 		return;
 	nlh = nlmsg_hdr(skb);
 
@@ -988,7 +1051,7 @@ static int __net_init nl_fib_lookup_init(struct net *net)
 	};
 
 	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
-	if (sk == NULL)
+	if (!sk)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
 	return 0;
@@ -1026,7 +1089,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 	case NETDEV_DOWN:
 		fib_del_ifaddr(ifa, NULL);
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		if (ifa->ifa_dev->ifa_list == NULL) {
+		if (!ifa->ifa_dev->ifa_list) {
 			/* Last address was deleted from this interface.
 			 * Disable IP.
 			 */
@@ -1094,7 +1157,7 @@ static int __net_init ip_fib_net_init(struct net *net)
 	size = max_t(size_t, size, L1_CACHE_BYTES);
 
 	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
-	if (net->ipv4.fib_table_hash == NULL)
+	if (!net->ipv4.fib_table_hash)
 		return -ENOMEM;
 
 	err = fib4_rules_init(net);
@@ -1111,23 +1174,27 @@ static void ip_fib_net_exit(struct net *net)
 {
 	unsigned int i;
 
+	rtnl_lock();
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-	fib4_rules_exit(net);
+	RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
+	RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+	RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
 #endif
-
-	rtnl_lock();
 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
-		struct fib_table *tb;
-		struct hlist_head *head;
+		struct hlist_head *head = &net->ipv4.fib_table_hash[i];
 		struct hlist_node *tmp;
+		struct fib_table *tb;
 
-		head = &net->ipv4.fib_table_hash[i];
 		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
 			hlist_del(&tb->tb_hlist);
 			fib_table_flush(tb);
 			fib_free_table(tb);
 		}
 	}
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	fib4_rules_exit(net);
+#endif
 	rtnl_unlock();
 	kfree(net->ipv4.fib_table_hash);
 }
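
fib_flush() above now walks each hash chain with hlist_for_each_entry_safe(), whose extra cursor caches the next node before the loop body runs, so the body may unlink or free the current entry without derailing the walk. A generic stand-alone sketch of the same idiom (free_all and struct node are illustrative, not from the patch):

	#include <stdlib.h>

	struct node {
		struct node *next;
	};

	/* Cache n->next before freeing n -- the same guarantee the
	 * _safe list iterators provide in the hunks above. */
	static void free_all(struct node *head)
	{
		struct node *n = head;

		while (n) {
			struct node *tmp = n->next;	/* saved cursor */

			free(n);
			n = tmp;
		}
	}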
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 825981b1049a..c6211ed60b03 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -6,11 +6,13 @@
 #include <net/ip_fib.h>
 
 struct fib_alias {
-	struct list_head	fa_list;
+	struct hlist_node	fa_list;
 	struct fib_info		*fa_info;
 	u8			fa_tos;
 	u8			fa_type;
 	u8			fa_state;
+	u8			fa_slen;
+	u32			tb_id;
 	struct rcu_head		rcu;
 };
 
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index d3db718be51d..56151982f74e 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -153,7 +153,7 @@ static struct fib_table *fib_empty_table(struct net *net)
 	u32 id;
 
 	for (id = 1; id <= RT_TABLE_MAX; id++)
-		if (fib_get_table(net, id) == NULL)
+		if (!fib_get_table(net, id))
 			return fib_new_table(net, id);
 	return NULL;
 }
@@ -174,12 +174,17 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	if (frh->tos & ~IPTOS_TOS_MASK)
 		goto errout;
 
+	/* split local/main if they are not already split */
+	err = fib_unmerge(net);
+	if (err)
+		goto errout;
+
 	if (rule->table == RT_TABLE_UNSPEC) {
 		if (rule->action == FR_ACT_TO_TBL) {
 			struct fib_table *table;
 
 			table = fib_empty_table(net);
-			if (table == NULL) {
+			if (!table) {
 				err = -ENOBUFS;
 				goto errout;
 			}
@@ -189,10 +194,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	}
 
 	if (frh->src_len)
-		rule4->src = nla_get_be32(tb[FRA_SRC]);
+		rule4->src = nla_get_in_addr(tb[FRA_SRC]);
 
 	if (frh->dst_len)
-		rule4->dst = nla_get_be32(tb[FRA_DST]);
+		rule4->dst = nla_get_in_addr(tb[FRA_DST]);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	if (tb[FRA_FLOW]) {
@@ -209,21 +214,31 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	rule4->tos = frh->tos;
 
 	net->ipv4.fib_has_custom_rules = true;
+	fib_flush_external(rule->fr_net);
+
 	err = 0;
errout:
 	return err;
 }
 
-static void fib4_rule_delete(struct fib_rule *rule)
+static int fib4_rule_delete(struct fib_rule *rule)
 {
 	struct net *net = rule->fr_net;
-#ifdef CONFIG_IP_ROUTE_CLASSID
+	int err;
-	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-	if (rule4->tclassid)
+	/* split local/main if they are not already split */
+	err = fib_unmerge(net);
+	if (err)
+		goto errout;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (((struct fib4_rule *)rule)->tclassid)
 		net->ipv4.fib_num_tclassid_users--;
 #endif
 	net->ipv4.fib_has_custom_rules = true;
+	fib_flush_external(rule->fr_net);
+errout:
+	return err;
 }
 
 static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
@@ -245,10 +260,10 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
 		return 0;
 #endif
 
-	if (frh->src_len && (rule4->src != nla_get_be32(tb[FRA_SRC])))
+	if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC])))
 		return 0;
 
-	if (frh->dst_len && (rule4->dst != nla_get_be32(tb[FRA_DST])))
+	if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST])))
 		return 0;
 
 	return 1;
@@ -264,9 +279,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 	frh->tos = rule4->tos;
 
 	if ((rule4->dst_len &&
-	     nla_put_be32(skb, FRA_DST, rule4->dst)) ||
+	     nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
 	    (rule4->src_len &&
-	     nla_put_be32(skb, FRA_SRC, rule4->src)))
+	     nla_put_in_addr(skb, FRA_SRC, rule4->src)))
 		goto nla_put_failure;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	if (rule4->tclassid &&
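The nla_get_be32()/nla_put_be32() accessors give way to nla_get_in_addr()/nla_put_in_addr() here and throughout this series. For IPv4 the payload is unchanged, a single big-endian 32-bit word; only the declared intent improves. A userspace sketch of that equivalence, using *_sketch stand-ins (hypothetical names, not the kernel API):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* netlink attribute payloads are not guaranteed to be aligned */
static uint32_t nla_get_be32_sketch(const void *payload)
{
	uint32_t v;

	memcpy(&v, payload, sizeof(v));
	return v;
}

static uint32_t nla_get_in_addr_sketch(const void *payload)
{
	return nla_get_be32_sketch(payload); /* same bits, clearer intent */
}

int main(void)
{
	struct in_addr a;

	inet_pton(AF_INET, "192.0.2.1", &a);
	printf("equal: %d\n",
	       nla_get_be32_sketch(&a) == nla_get_in_addr_sketch(&a));
	return 0;
}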
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1e2090ea663e..8d695b6659c7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -213,7 +213,6 @@ static void free_fib_info_rcu(struct rcu_head *head)
 		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
 	} endfor_nexthops(fi);
 
-	release_net(fi->fib_net);
 	if (fi->fib_metrics != (u32 *) dst_default_metrics)
 		kfree(fi->fib_metrics);
 	kfree(fi);
@@ -391,7 +390,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 	int err = -ENOBUFS;
 
 	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
-	if (skb == NULL)
+	if (!skb)
 		goto errout;
 
 	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
@@ -469,7 +468,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
-			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
+			nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
 			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
@@ -504,7 +503,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 	}
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (cfg->fc_mp == NULL)
+	if (!cfg->fc_mp)
 		return 0;
 
 	rtnh = cfg->fc_mp;
@@ -524,7 +523,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
-			if (nla && nla_get_be32(nla) != nh->nh_gw)
+			if (nla && nla_get_in_addr(nla) != nh->nh_gw)
 				return 1;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
@@ -647,7 +646,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 		rcu_read_lock();
 		err = -ENODEV;
 		in_dev = inetdev_by_index(net, nh->nh_oif);
-		if (in_dev == NULL)
+		if (!in_dev)
 			goto out;
 		err = -ENETDOWN;
 		if (!(in_dev->dev->flags & IFF_UP))
@@ -804,7 +803,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	}
 
 	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
-	if (fi == NULL)
+	if (!fi)
 		goto failure;
 	fib_info_cnt++;
 	if (cfg->fc_mx) {
@@ -814,7 +813,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	} else
 		fi->fib_metrics = (u32 *) dst_default_metrics;
 
-	fi->fib_net = hold_net(net);
+	fi->fib_net = net;
 	fi->fib_protocol = cfg->fc_protocol;
 	fi->fib_scope = cfg->fc_scope;
 	fi->fib_flags = cfg->fc_flags;
@@ -922,7 +921,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 		nh->nh_scope = RT_SCOPE_NOWHERE;
 		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
 		err = -ENODEV;
-		if (nh->nh_dev == NULL)
+		if (!nh->nh_dev)
 			goto failure;
 	} else {
 		change_nexthops(fi) {
@@ -996,7 +995,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	struct rtmsg *rtm;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
-	if (nlh == NULL)
+	if (!nlh)
 		return -EMSGSIZE;
 
 	rtm = nlmsg_data(nlh);
@@ -1016,7 +1015,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	rtm->rtm_protocol = fi->fib_protocol;
 
 	if (rtm->rtm_dst_len &&
-	    nla_put_be32(skb, RTA_DST, dst))
+	    nla_put_in_addr(skb, RTA_DST, dst))
 		goto nla_put_failure;
 	if (fi->fib_priority &&
 	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
@@ -1025,11 +1024,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		goto nla_put_failure;
 
 	if (fi->fib_prefsrc &&
-	    nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc))
+	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
 		goto nla_put_failure;
 	if (fi->fib_nhs == 1) {
 		if (fi->fib_nh->nh_gw &&
-		    nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
 			goto nla_put_failure;
 		if (fi->fib_nh->nh_oif &&
 		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
@@ -1046,12 +1045,12 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		struct nlattr *mp;
 
 		mp = nla_nest_start(skb, RTA_MULTIPATH);
-		if (mp == NULL)
+		if (!mp)
 			goto nla_put_failure;
 
 		for_nexthops(fi) {
 			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
-			if (rtnh == NULL)
+			if (!rtnh)
 				goto nla_put_failure;
 
 			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
@@ -1059,7 +1058,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			rtnh->rtnh_ifindex = nh->nh_oif;
 
 			if (nh->nh_gw &&
-			    nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw))
+			    nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
 				goto nla_put_failure;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			if (nh->nh_tclassid &&
@@ -1094,7 +1093,7 @@ int fib_sync_down_addr(struct net *net, __be32 local)
 	struct hlist_head *head = &fib_info_laddrhash[hash];
 	struct fib_info *fi;
 
-	if (fib_info_laddrhash == NULL || local == 0)
+	if (!fib_info_laddrhash || local == 0)
 		return 0;
 
 	hlist_for_each_entry(fi, head, fib_lhash) {
@@ -1163,12 +1162,12 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 void fib_select_default(struct fib_result *res)
 {
 	struct fib_info *fi = NULL, *last_resort = NULL;
-	struct list_head *fa_head = res->fa_head;
+	struct hlist_head *fa_head = res->fa_head;
 	struct fib_table *tb = res->table;
 	int order = -1, last_idx = -1;
 	struct fib_alias *fa;
 
-	list_for_each_entry_rcu(fa, fa_head, fa_list) {
+	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
 
 		if (next_fi->fib_scope != res->scope ||
@@ -1183,7 +1182,7 @@ void fib_select_default(struct fib_result *res)
 
 		fib_alias_accessed(fa);
 
-		if (fi == NULL) {
+		if (!fi) {
 			if (next_fi != res->fi)
 				break;
 		} else if (!fib_detect_death(fi, order, &last_resort,
@@ -1196,7 +1195,7 @@ void fib_select_default(struct fib_result *res)
 		order++;
 	}
 
-	if (order <= 0 || fi == NULL) {
+	if (order <= 0 || !fi) {
 		tb->tb_default = -1;
 		goto out;
 	}
@@ -1252,7 +1251,7 @@ int fib_sync_up(struct net_device *dev)
 			alive++;
 			continue;
 		}
-		if (nexthop_nh->nh_dev == NULL ||
+		if (!nexthop_nh->nh_dev ||
 		    !(nexthop_nh->nh_dev->flags & IFF_UP))
 			continue;
 		if (nexthop_nh->nh_dev != dev ||
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3daf0224ff2e..e13fcc602da2 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -79,6 +79,7 @@
 #include <net/tcp.h>
 #include <net/sock.h>
 #include <net/ip_fib.h>
+#include <net/switchdev.h>
 #include "fib_lookup.h"
 
 #define MAX_STAT_DEPTH 32
@@ -88,38 +89,35 @@
 
 typedef unsigned int t_key;
 
-#define IS_TNODE(n) ((n)->bits)
+#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
-#define IS_LEAF(n) (!(n)->bits)
+#define IS_TNODE(n)	((n)->bits)
+#define IS_LEAF(n)	(!(n)->bits)
 
-#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
+struct key_vector {
-
-struct tnode {
 	t_key key;
-	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
+	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
 	unsigned char slen;
-	struct tnode __rcu *parent;
-	struct rcu_head rcu;
 	union {
-		/* The fields in this struct are valid if bits > 0 (TNODE) */
+		/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
-		struct {
+		struct hlist_head leaf;
-			t_key empty_children; /* KEYLENGTH bits needed */
+		/* This array is valid if (pos | bits) > 0 (TNODE) */
-			t_key full_children;  /* KEYLENGTH bits needed */
+		struct key_vector __rcu *tnode[0];
-			struct tnode __rcu *child[0];
-		};
-		/* This list pointer if valid if bits == 0 (LEAF) */
-		struct hlist_head list;
 	};
 };
 
-struct leaf_info {
+struct tnode {
-	struct hlist_node hlist;
-	int plen;
-	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
-	struct list_head falh;
 	struct rcu_head rcu;
+	t_key empty_children;		/* KEYLENGTH bits needed */
+	t_key full_children;		/* KEYLENGTH bits needed */
+	struct key_vector __rcu *parent;
+	struct key_vector kv[1];
+#define tn_bits kv[0].bits
 };
 
+#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
+#define LEAF_SIZE	TNODE_SIZE(1)
+
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 struct trie_use_stats {
 	unsigned int gets;
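The split above separates what lookups touch (struct key_vector) from bookkeeping only updates need (struct tnode). TNODE_SIZE(n) measures a tnode whose embedded key_vector ends in an n-entry child array, and a leaf only needs the one-pointer union, hence LEAF_SIZE == TNODE_SIZE(1). A compilable userspace sketch of the same layout arithmetic; the __rcu annotations and hlist type are simplified, and zero-length arrays are the GNU extension the kernel itself relies on:

#include <stddef.h>
#include <stdio.h>

typedef unsigned int t_key;

struct key_vector {
	t_key key;
	unsigned char pos;
	unsigned char bits;
	unsigned char slen;
	union {
		void *leaf;                  /* stands in for hlist_head */
		struct key_vector *tnode[0]; /* child array when bits > 0 */
	};
};

struct tnode {
	void *rcu_next;                      /* stands in for rcu_head */
	t_key empty_children;
	t_key full_children;
	struct key_vector *parent;
	struct key_vector kv[1];
};

#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
#define LEAF_SIZE	TNODE_SIZE(1)

int main(void)
{
	printf("TNODE_SIZE(0) = %zu\n", TNODE_SIZE(0));
	printf("LEAF_SIZE     = %zu\n", LEAF_SIZE);
	printf("4-child tnode = %zu\n", TNODE_SIZE(4));
	return 0;
}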
@@ -142,13 +140,13 @@ struct trie_stat {
 };
 
 struct trie {
-	struct tnode __rcu *trie;
+	struct key_vector kv[1];
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	struct trie_use_stats __percpu *stats;
 #endif
 };
 
-static void resize(struct trie *t, struct tnode *tn);
+static struct key_vector *resize(struct trie *t, struct key_vector *tn);
 static size_t tnode_free_size;
 
 /*
@@ -161,41 +159,46 @@ static const int sync_pages = 128;
 static struct kmem_cache *fn_alias_kmem __read_mostly;
 static struct kmem_cache *trie_leaf_kmem __read_mostly;
 
+static inline struct tnode *tn_info(struct key_vector *kv)
+{
+	return container_of(kv, struct tnode, kv[0]);
+}
+
 /* caller must hold RTNL */
-#define node_parent(n) rtnl_dereference((n)->parent)
+#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
+#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
 
 /* caller must hold RCU read lock or RTNL */
-#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
+#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
+#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
 
 /* wrapper for rcu_assign_pointer */
-static inline void node_set_parent(struct tnode *n, struct tnode *tp)
+static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
 {
 	if (n)
-		rcu_assign_pointer(n->parent, tp);
+		rcu_assign_pointer(tn_info(n)->parent, tp);
 }
 
-#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
 
 /* This provides us with the number of children in this node, in the case of a
  * leaf this will return 0 meaning none of the children are accessible.
  */
-static inline unsigned long tnode_child_length(const struct tnode *tn)
+static inline unsigned long child_length(const struct key_vector *tn)
 {
 	return (1ul << tn->bits) & ~(1ul);
 }
 
-/* caller must hold RTNL */
+#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
-static inline struct tnode *tnode_get_child(const struct tnode *tn,
-					    unsigned long i)
-{
-	return rtnl_dereference(tn->child[i]);
-}
 
-/* caller must hold RCU read lock or RTNL */
+static inline unsigned long get_index(t_key key, struct key_vector *kv)
-static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
-						unsigned long i)
 {
-	return rcu_dereference_rtnl(tn->child[i]);
+	unsigned long index = key ^ kv->key;
+
+	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
+		return 0;
+
+	return index >> kv->pos;
 }
 
 /* To understand this stuff, an understanding of keys and all their bits is
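The reworked get_index() exists because the trie root is now a real key_vector acting as a sentinel with pos == KEYLENGTH, and shifting a 32-bit unsigned long by 32 is undefined behavior; the guard pins the root's lone child to slot 0 and compiles away entirely on 64-bit builds. A self-contained userspace sketch (node parameters are illustrative):

#include <limits.h>
#include <stdio.h>

#define KEYLENGTH 32
#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

typedef unsigned int t_key;

struct key_vector { t_key key; unsigned char pos; };

static unsigned long get_index(t_key key, const struct key_vector *kv)
{
	unsigned long index = key ^ kv->key;

	/* avoid shifting a 32-bit long by 32 bits on 32-bit builds */
	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
		return 0;

	return index >> kv->pos;
}

int main(void)
{
	struct key_vector root = { .key = 0,          .pos = KEYLENGTH };
	struct key_vector node = { .key = 0xc0000000, .pos = 20 };

	printf("root: %lu\n", get_index(0xc0a80101, &root)); /* always 0 */
	printf("node: %lu\n", get_index(0xc0a80101, &node)); /* 10 */
	return 0;
}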
@@ -274,106 +277,104 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
 }
 
 #define TNODE_KMALLOC_MAX \
-	ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
+	ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+#define TNODE_VMALLOC_MAX \
+	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
 
 static void __node_free_rcu(struct rcu_head *head)
 {
 	struct tnode *n = container_of(head, struct tnode, rcu);
 
-	if (IS_LEAF(n))
+	if (!n->tn_bits)
 		kmem_cache_free(trie_leaf_kmem, n);
-	else if (n->bits <= TNODE_KMALLOC_MAX)
+	else if (n->tn_bits <= TNODE_KMALLOC_MAX)
 		kfree(n);
 	else
 		vfree(n);
 }
 
-#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
+#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
 
-static inline void free_leaf_info(struct leaf_info *leaf)
+static struct tnode *tnode_alloc(int bits)
 {
-	kfree_rcu(leaf, rcu);
+	size_t size;
-}
+
+	/* verify bits is within bounds */
+	if (bits > TNODE_VMALLOC_MAX)
+		return NULL;
+
+	/* determine size and verify it is non-zero and didn't overflow */
+	size = TNODE_SIZE(1ul << bits);
 
-static struct tnode *tnode_alloc(size_t size)
-{
 	if (size <= PAGE_SIZE)
 		return kzalloc(size, GFP_KERNEL);
 	else
 		return vzalloc(size);
 }
 
-static inline void empty_child_inc(struct tnode *n)
+static inline void empty_child_inc(struct key_vector *n)
 {
-	++n->empty_children ? : ++n->full_children;
+	++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
 }
 
-static inline void empty_child_dec(struct tnode *n)
+static inline void empty_child_dec(struct key_vector *n)
 {
-	n->empty_children-- ? : n->full_children--;
+	tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
 }
 
-static struct tnode *leaf_new(t_key key)
+static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
 {
-	struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+	struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
-	if (l) {
+	struct key_vector *l = kv->kv;
-		l->parent = NULL;
-		/* set key and pos to reflect full key value
-		 * any trailing zeros in the key should be ignored
-		 * as the nodes are searched
-		 */
-		l->key = key;
-		l->slen = 0;
-		l->pos = 0;
-		/* set bits to 0 indicating we are not a tnode */
-		l->bits = 0;
 
-		INIT_HLIST_HEAD(&l->list);
+	if (!kv)
-	}
+		return NULL;
-	return l;
-}
 
-static struct leaf_info *leaf_info_new(int plen)
+	/* initialize key vector */
-{
+	l->key = key;
-	struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
+	l->pos = 0;
-	if (li) {
+	l->bits = 0;
-		li->plen = plen;
+	l->slen = fa->fa_slen;
-		li->mask_plen = ntohl(inet_make_mask(plen));
+
-		INIT_LIST_HEAD(&li->falh);
+	/* link leaf to fib alias */
-	}
+	INIT_HLIST_HEAD(&l->leaf);
-	return li;
+	hlist_add_head(&fa->fa_list, &l->leaf);
+
+	return l;
 }
 
-static struct tnode *tnode_new(t_key key, int pos, int bits)
+static struct key_vector *tnode_new(t_key key, int pos, int bits)
 {
-	size_t sz = offsetof(struct tnode, child[1ul << bits]);
+	struct tnode *tnode = tnode_alloc(bits);
-	struct tnode *tn = tnode_alloc(sz);
 	unsigned int shift = pos + bits;
+	struct key_vector *tn = tnode->kv;
 
 	/* verify bits and pos their msb bits clear and values are valid */
 	BUG_ON(!bits || (shift > KEYLENGTH));
 
-	if (tn) {
+	pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
-		tn->parent = NULL;
+		 sizeof(struct key_vector *) << bits);
-		tn->slen = pos;
+
-		tn->pos = pos;
+	if (!tnode)
-		tn->bits = bits;
+		return NULL;
-		tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
+
-		if (bits == KEYLENGTH)
+	if (bits == KEYLENGTH)
-			tn->full_children = 1;
+		tnode->full_children = 1;
-		else
+	else
-			tn->empty_children = 1ul << bits;
+		tnode->empty_children = 1ul << bits;
-	}
+
+	tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
+	tn->pos = pos;
+	tn->bits = bits;
+	tn->slen = pos;
 
-	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
-		 sizeof(struct tnode *) << bits);
 	return tn;
 }
 
 /* Check whether a tnode 'n' is "full", i.e. it is an internal node
  * and no bits are skipped. See discussion in dyntree paper p. 6
  */
-static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
+static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
 {
 	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
 }
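tnode_alloc() now takes bits rather than a precomputed size, so the TNODE_VMALLOC_MAX bound above can reject any bits value whose child array would overflow size_t before TNODE_SIZE() is even evaluated. A userspace sketch of the bound's arithmetic; the header size is a stand-in, not the real TNODE_SIZE(0):

#include <stdint.h>
#include <stdio.h>

#define TNODE_HDR 40ull            /* illustrative TNODE_SIZE(0) */
#define PTR_SIZE  sizeof(void *)

static unsigned int ilog2_u64(unsigned long long v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	/* largest bits for a kmalloc'd (one page) and vmalloc'd node */
	printf("kmalloc max bits: %u\n",
	       ilog2_u64((4096 - TNODE_HDR) / PTR_SIZE));
	printf("vmalloc max bits: %u\n",
	       ilog2_u64((SIZE_MAX - TNODE_HDR) / PTR_SIZE));
	return 0;
}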
@@ -381,17 +382,18 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
 /* Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */
-static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
+static void put_child(struct key_vector *tn, unsigned long i,
+		      struct key_vector *n)
 {
-	struct tnode *chi = tnode_get_child(tn, i);
+	struct key_vector *chi = get_child(tn, i);
 	int isfull, wasfull;
 
-	BUG_ON(i >= tnode_child_length(tn));
+	BUG_ON(i >= child_length(tn));
 
 	/* update emptyChildren, overflow into fullChildren */
-	if (n == NULL && chi != NULL)
+	if (!n && chi)
 		empty_child_inc(tn);
-	if (n != NULL && chi == NULL)
+	if (n && !chi)
 		empty_child_dec(tn);
 
 	/* update fullChildren */
@@ -399,23 +401,23 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
 	isfull = tnode_full(tn, n);
 
 	if (wasfull && !isfull)
-		tn->full_children--;
+		tn_info(tn)->full_children--;
 	else if (!wasfull && isfull)
-		tn->full_children++;
+		tn_info(tn)->full_children++;
 
 	if (n && (tn->slen < n->slen))
 		tn->slen = n->slen;
 
-	rcu_assign_pointer(tn->child[i], n);
+	rcu_assign_pointer(tn->tnode[i], n);
 }
 
-static void update_children(struct tnode *tn)
+static void update_children(struct key_vector *tn)
 {
 	unsigned long i;
 
 	/* update all of the child parent pointers */
-	for (i = tnode_child_length(tn); i;) {
+	for (i = child_length(tn); i;) {
-		struct tnode *inode = tnode_get_child(tn, --i);
+		struct key_vector *inode = get_child(tn, --i);
 
 		if (!inode)
 			continue;
@@ -431,36 +433,37 @@ static void update_children(struct tnode *tn)
 	}
 }
 
-static inline void put_child_root(struct tnode *tp, struct trie *t,
+static inline void put_child_root(struct key_vector *tp, t_key key,
-				  t_key key, struct tnode *n)
+				  struct key_vector *n)
 {
-	if (tp)
+	if (IS_TRIE(tp))
-		put_child(tp, get_index(key, tp), n);
+		rcu_assign_pointer(tp->tnode[0], n);
 	else
-		rcu_assign_pointer(t->trie, n);
+		put_child(tp, get_index(key, tp), n);
 }
 
-static inline void tnode_free_init(struct tnode *tn)
+static inline void tnode_free_init(struct key_vector *tn)
 {
-	tn->rcu.next = NULL;
+	tn_info(tn)->rcu.next = NULL;
 }
 
-static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
+static inline void tnode_free_append(struct key_vector *tn,
+				     struct key_vector *n)
 {
-	n->rcu.next = tn->rcu.next;
+	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
-	tn->rcu.next = &n->rcu;
+	tn_info(tn)->rcu.next = &tn_info(n)->rcu;
 }
 
-static void tnode_free(struct tnode *tn)
+static void tnode_free(struct key_vector *tn)
 {
-	struct callback_head *head = &tn->rcu;
+	struct callback_head *head = &tn_info(tn)->rcu;
 
 	while (head) {
 		head = head->next;
-		tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
+		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
 		node_free(tn);
 
-		tn = container_of(head, struct tnode, rcu);
+		tn = container_of(head, struct tnode, rcu)->kv;
 	}
 
 	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
@@ -469,14 +472,16 @@ static void tnode_free(struct tnode *tn)
 	}
 }
 
-static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
+static struct key_vector *replace(struct trie *t,
+				  struct key_vector *oldtnode,
+				  struct key_vector *tn)
 {
-	struct tnode *tp = node_parent(oldtnode);
+	struct key_vector *tp = node_parent(oldtnode);
 	unsigned long i;
 
 	/* setup the parent pointer out of and back into this node */
 	NODE_INIT_PARENT(tn, tp);
-	put_child_root(tp, t, tn->key, tn);
+	put_child_root(tp, tn->key, tn);
 
 	/* update all of the child parent pointers */
 	update_children(tn);
@@ -485,18 +490,21 @@ static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
 	tnode_free(oldtnode);
 
 	/* resize children now that oldtnode is freed */
-	for (i = tnode_child_length(tn); i;) {
+	for (i = child_length(tn); i;) {
-		struct tnode *inode = tnode_get_child(tn, --i);
+		struct key_vector *inode = get_child(tn, --i);
 
 		/* resize child node */
 		if (tnode_full(tn, inode))
-			resize(t, inode);
+			tn = resize(t, inode);
 	}
+
+	return tp;
 }
 
-static int inflate(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *inflate(struct trie *t,
+				  struct key_vector *oldtnode)
 {
-	struct tnode *tn;
+	struct key_vector *tn;
 	unsigned long i;
 	t_key m;
 
@@ -504,7 +512,7 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
 	if (!tn)
-		return -ENOMEM;
+		goto notnode;
 
 	/* prepare oldtnode to be freed */
 	tnode_free_init(oldtnode);
@@ -514,13 +522,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 	 * point to existing tnodes and the links between our allocated
 	 * nodes.
 	 */
-	for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
+	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
-		struct tnode *inode = tnode_get_child(oldtnode, --i);
+		struct key_vector *inode = get_child(oldtnode, --i);
-		struct tnode *node0, *node1;
+		struct key_vector *node0, *node1;
 		unsigned long j, k;
 
 		/* An empty child */
-		if (inode == NULL)
+		if (!inode)
 			continue;
 
 		/* A leaf or an internal node with skipped bits */
@@ -534,8 +542,8 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 
 		/* An internal node with two children */
 		if (inode->bits == 1) {
-			put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
+			put_child(tn, 2 * i + 1, get_child(inode, 1));
-			put_child(tn, 2 * i, tnode_get_child(inode, 0));
+			put_child(tn, 2 * i, get_child(inode, 0));
 			continue;
 		}
 
@@ -564,11 +572,11 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 		tnode_free_append(tn, node0);
 
 		/* populate child pointers in new nodes */
-		for (k = tnode_child_length(inode), j = k / 2; j;) {
+		for (k = child_length(inode), j = k / 2; j;) {
-			put_child(node1, --j, tnode_get_child(inode, --k));
+			put_child(node1, --j, get_child(inode, --k));
-			put_child(node0, j, tnode_get_child(inode, j));
+			put_child(node0, j, get_child(inode, j));
-			put_child(node1, --j, tnode_get_child(inode, --k));
+			put_child(node1, --j, get_child(inode, --k));
-			put_child(node0, j, tnode_get_child(inode, j));
+			put_child(node0, j, get_child(inode, j));
 		}
 
 		/* link new nodes to parent */
@@ -581,25 +589,25 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
 	}
 
 	/* setup the parent pointers into and out of this node */
-	replace(t, oldtnode, tn);
+	return replace(t, oldtnode, tn);
-
-	return 0;
 nomem:
 	/* all pointers should be clean so we are done */
 	tnode_free(tn);
-	return -ENOMEM;
+notnode:
+	return NULL;
 }
 
-static int halve(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *halve(struct trie *t,
+				struct key_vector *oldtnode)
 {
-	struct tnode *tn;
+	struct key_vector *tn;
 	unsigned long i;
 
 	pr_debug("In halve\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
 	if (!tn)
-		return -ENOMEM;
+		goto notnode;
 
 	/* prepare oldtnode to be freed */
 	tnode_free_init(oldtnode);
@@ -609,10 +617,10 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 	 * point to existing tnodes and the links between our allocated
 	 * nodes.
 	 */
-	for (i = tnode_child_length(oldtnode); i;) {
+	for (i = child_length(oldtnode); i;) {
-		struct tnode *node1 = tnode_get_child(oldtnode, --i);
+		struct key_vector *node1 = get_child(oldtnode, --i);
-		struct tnode *node0 = tnode_get_child(oldtnode, --i);
+		struct key_vector *node0 = get_child(oldtnode, --i);
-		struct tnode *inode;
+		struct key_vector *inode;
 
 		/* At least one of the children is empty */
 		if (!node1 || !node0) {
@@ -622,10 +630,8 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 
 		/* Two nonempty children */
 		inode = tnode_new(node0->key, oldtnode->pos, 1);
-		if (!inode) {
+		if (!inode)
-			tnode_free(tn);
+			goto nomem;
-			return -ENOMEM;
-		}
 		tnode_free_append(tn, inode);
 
 		/* initialize pointers out of node */
@@ -638,30 +644,36 @@ static int halve(struct trie *t, struct tnode *oldtnode)
 	}
 
 	/* setup the parent pointers into and out of this node */
-	replace(t, oldtnode, tn);
+	return replace(t, oldtnode, tn);
-
+nomem:
-	return 0;
+	/* all pointers should be clean so we are done */
+	tnode_free(tn);
+notnode:
+	return NULL;
 }
 
-static void collapse(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *collapse(struct trie *t,
+				   struct key_vector *oldtnode)
 {
-	struct tnode *n, *tp;
+	struct key_vector *n, *tp;
 	unsigned long i;
 
 	/* scan the tnode looking for that one child that might still exist */
-	for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
+	for (n = NULL, i = child_length(oldtnode); !n && i;)
-		n = tnode_get_child(oldtnode, --i);
+		n = get_child(oldtnode, --i);
 
 	/* compress one level */
 	tp = node_parent(oldtnode);
-	put_child_root(tp, t, oldtnode->key, n);
+	put_child_root(tp, oldtnode->key, n);
 	node_set_parent(n, tp);
 
 	/* drop dead node */
 	node_free(oldtnode);
+
+	return tp;
 }
 
-static unsigned char update_suffix(struct tnode *tn)
+static unsigned char update_suffix(struct key_vector *tn)
 {
 	unsigned char slen = tn->pos;
 	unsigned long stride, i;
@@ -671,8 +683,8 @@ static unsigned char update_suffix(struct tnode *tn)
 	 * why we start with a stride of 2 since a stride of 1 would
 	 * represent the nodes with suffix length equal to tn->pos
 	 */
-	for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
+	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
-		struct tnode *n = tnode_get_child(tn, i);
+		struct key_vector *n = get_child(tn, i);
 
 		if (!n || (n->slen <= slen))
 			continue;
@@ -704,12 +716,12 @@ static unsigned char update_suffix(struct tnode *tn)
  *
  * 'high' in this instance is the variable 'inflate_threshold'. It
  * is expressed as a percentage, so we multiply it with
- * tnode_child_length() and instead of multiplying by 2 (since the
+ * child_length() and instead of multiplying by 2 (since the
  * child array will be doubled by inflate()) and multiplying
  * the left-hand side by 100 (to handle the percentage thing) we
  * multiply the left-hand side by 50.
 *
- * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * The left-hand side may look a bit weird: child_length(tn)
  * - tn->empty_children is of course the number of non-null children
  * in the current node. tn->full_children is the number of "full"
  * children, that is non-null tnodes with a skip value of 0.
@@ -719,10 +731,10 @@ static unsigned char update_suffix(struct tnode *tn)
 * A clearer way to write this would be:
 *
 * to_be_doubled = tn->full_children;
- * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ * not_to_be_doubled = child_length(tn) - tn->empty_children -
 *     tn->full_children;
 *
- * new_child_length = tnode_child_length(tn) * 2;
+ * new_child_length = child_length(tn) * 2;
 *
 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 *      new_child_length;
@@ -739,57 +751,57 @@ static unsigned char update_suffix(struct tnode *tn)
 *      inflate_threshold * new_child_length
 *
 * expand not_to_be_doubled and to_be_doubled, and shorten:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * 100 * (child_length(tn) - tn->empty_children +
 *    tn->full_children) >= inflate_threshold * new_child_length
 *
 * expand new_child_length:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * 100 * (child_length(tn) - tn->empty_children +
 *    tn->full_children) >=
- *      inflate_threshold * tnode_child_length(tn) * 2
+ *      inflate_threshold * child_length(tn) * 2
 *
 * shorten again:
- * 50 * (tn->full_children + tnode_child_length(tn) -
+ * 50 * (tn->full_children + child_length(tn) -
 *    tn->empty_children) >= inflate_threshold *
- *    tnode_child_length(tn)
+ *    child_length(tn)
 *
 */
-static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
+static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
 {
-	unsigned long used = tnode_child_length(tn);
+	unsigned long used = child_length(tn);
 	unsigned long threshold = used;
 
 	/* Keep root node larger */
-	threshold *= tp ? inflate_threshold : inflate_threshold_root;
+	threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
-	used -= tn->empty_children;
+	used -= tn_info(tn)->empty_children;
-	used += tn->full_children;
+	used += tn_info(tn)->full_children;
 
 	/* if bits == KEYLENGTH then pos = 0, and will fail below */
 
 	return (used > 1) && tn->pos && ((50 * used) >= threshold);
 }
 
-static bool should_halve(const struct tnode *tp, const struct tnode *tn)
+static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
 {
-	unsigned long used = tnode_child_length(tn);
+	unsigned long used = child_length(tn);
 	unsigned long threshold = used;
 
 	/* Keep root node larger */
-	threshold *= tp ? halve_threshold : halve_threshold_root;
+	threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
-	used -= tn->empty_children;
+	used -= tn_info(tn)->empty_children;
 
 	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
 
 	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
 }
 
-static bool should_collapse(const struct tnode *tn)
+static inline bool should_collapse(struct key_vector *tn)
 {
-	unsigned long used = tnode_child_length(tn);
+	unsigned long used = child_length(tn);
 
-	used -= tn->empty_children;
+	used -= tn_info(tn)->empty_children;
 
 	/* account for bits == KEYLENGTH case */
-	if ((tn->bits == KEYLENGTH) && tn->full_children)
+	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
 		used -= KEY_MAX;
 
 	/* One child or none, time to drop us from the trie */
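The long derivation above ends in a division-free test: inflate when 50 * (children - empty_children + full_children) >= inflate_threshold * children. A userspace sketch with concrete numbers; 50 mirrors the kernel's default non-root inflate_threshold, and the tn->pos skipped-bits condition is omitted for brevity:

#include <stdbool.h>
#include <stdio.h>

static bool should_inflate(unsigned long children, unsigned long empty,
			   unsigned long full, unsigned long threshold_pct)
{
	unsigned long used = children;
	unsigned long threshold = used * threshold_pct;

	used -= empty;   /* non-NULL children */
	used += full;    /* full children count double after doubling */

	return (used > 1) && ((50 * used) >= threshold);
}

int main(void)
{
	/* 16-slot node, 4 empty, 6 full: 50*18 >= 800 -> inflate */
	printf("%d\n", should_inflate(16, 4, 6, 50));
	/* 16-slot node, 12 empty, 0 full: 50*4 < 800 -> leave alone */
	printf("%d\n", should_inflate(16, 12, 0, 50));
	return 0;
}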
@@ -797,10 +809,13 @@ static bool should_collapse(const struct tnode *tn)
 }
 
 #define MAX_WORK 10
-static void resize(struct trie *t, struct tnode *tn)
+static struct key_vector *resize(struct trie *t, struct key_vector *tn)
 {
-	struct tnode *tp = node_parent(tn);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
-	struct tnode __rcu **cptr;
+	struct trie_use_stats __percpu *stats = t->stats;
+#endif
+	struct key_vector *tp = node_parent(tn);
+	unsigned long cindex = get_index(tn->key, tp);
 	int max_work = MAX_WORK;
 
 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
@@ -810,183 +825,128 @@ static void resize(struct trie *t, struct tnode *tn)
 	 * doing it ourselves. This way we can let RCU fully do its
 	 * thing without us interfering
 	 */
-	cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
+	BUG_ON(tn != get_child(tp, cindex));
-	BUG_ON(tn != rtnl_dereference(*cptr));
 
 	/* Double as long as the resulting node has a number of
 	 * nonempty nodes that are above the threshold.
 	 */
 	while (should_inflate(tp, tn) && max_work) {
-		if (inflate(t, tn)) {
+		tp = inflate(t, tn);
+		if (!tp) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			this_cpu_inc(t->stats->resize_node_skipped);
+			this_cpu_inc(stats->resize_node_skipped);
 #endif
 			break;
 		}
 
 		max_work--;
-		tn = rtnl_dereference(*cptr);
+		tn = get_child(tp, cindex);
 	}
 
+	/* update parent in case inflate failed */
+	tp = node_parent(tn);
+
 	/* Return if at least one inflate is run */
 	if (max_work != MAX_WORK)
-		return;
+		return tp;
 
 	/* Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
 	while (should_halve(tp, tn) && max_work) {
-		if (halve(t, tn)) {
+		tp = halve(t, tn);
+		if (!tp) {
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			this_cpu_inc(t->stats->resize_node_skipped);
+			this_cpu_inc(stats->resize_node_skipped);
 #endif
 			break;
 		}
 
 		max_work--;
-		tn = rtnl_dereference(*cptr);
+		tn = get_child(tp, cindex);
 	}
 
 	/* Only one child remains */
-	if (should_collapse(tn)) {
+	if (should_collapse(tn))
-		collapse(t, tn);
+		return collapse(t, tn);
-		return;
+
-	}
+	/* update parent in case halve failed */
+	tp = node_parent(tn);
 
 	/* Return if at least one deflate was run */
 	if (max_work != MAX_WORK)
-		return;
+		return tp;
 
 	/* push the suffix length to the parent node */
 	if (tn->slen > tn->pos) {
 		unsigned char slen = update_suffix(tn);
 
-		if (tp && (slen > tp->slen))
+		if (slen > tp->slen)
 			tp->slen = slen;
 	}
-}
-
-/* readside must use rcu_read_lock currently dump routines
- via get_fa_head and dump */
-
-static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
-{
-	struct hlist_head *head = &l->list;
-	struct leaf_info *li;
-
-	hlist_for_each_entry_rcu(li, head, hlist)
-		if (li->plen == plen)
-			return li;
-
-	return NULL;
-}
-
-static inline struct list_head *get_fa_head(struct tnode *l, int plen)
-{
-	struct leaf_info *li = find_leaf_info(l, plen);
-
-	if (!li)
-		return NULL;
 
-	return &li->falh;
+	return tp;
 }
 
-static void leaf_pull_suffix(struct tnode *l)
+static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
 {
-	struct tnode *tp = node_parent(l);
+	while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
-
-	while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
 		if (update_suffix(tp) > l->slen)
 			break;
 		tp = node_parent(tp);
 	}
 }
 
-static void leaf_push_suffix(struct tnode *l)
+static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
 {
-	struct tnode *tn = node_parent(l);
-
 	/* if this is a new leaf then tn will be NULL and we can sort
 	 * out parent suffix lengths as a part of trie_rebalance
 	 */
-	while (tn && (tn->slen < l->slen)) {
+	while (tn->slen < l->slen) {
 		tn->slen = l->slen;
 		tn = node_parent(tn);
 	}
 }
 
-static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
+/* rcu_read_lock needs to be hold by caller from readside */
-{
+static struct key_vector *fib_find_node(struct trie *t,
-	/* record the location of the previous list_info entry */
+					struct key_vector **tp, u32 key)
-	struct hlist_node **pprev = old->hlist.pprev;
-	struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
-
-	/* remove the leaf info from the list */
-	hlist_del_rcu(&old->hlist);
-
-	/* only access li if it is pointing at the last valid hlist_node */
-	if (hlist_empty(&l->list) || (*pprev))
-		return;
-
-	/* update the trie with the latest suffix length */
-	l->slen = KEYLENGTH - li->plen;
-	leaf_pull_suffix(l);
-}
-
-static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
 {
-	struct hlist_head *head = &l->list;
+	struct key_vector *pn, *n = t->kv;
-	struct leaf_info *li = NULL, *last = NULL;
+	unsigned long index = 0;
 
-	if (hlist_empty(head)) {
+	do {
-		hlist_add_head_rcu(&new->hlist, head);
+		pn = n;
-	} else {
+		n = get_child_rcu(n, index);
-		hlist_for_each_entry(li, head, hlist) {
-			if (new->plen > li->plen)
-				break;
-
-			last = li;
-		}
-		if (last)
-			hlist_add_behind_rcu(&new->hlist, &last->hlist);
-		else
-			hlist_add_before_rcu(&new->hlist, &li->hlist);
-	}
-
-	/* if we added to the tail node then we need to update slen */
-	if (l->slen < (KEYLENGTH - new->plen)) {
-		l->slen = KEYLENGTH - new->plen;
-		leaf_push_suffix(l);
-	}
-}
 
-/* rcu_read_lock needs to be hold by caller from readside */
+		if (!n)
-static struct tnode *fib_find_node(struct trie *t, u32 key)
+			break;
-{
-	struct tnode *n = rcu_dereference_rtnl(t->trie);
 
-	while (n) {
+		index = get_cindex(key, n);
-		unsigned long index = get_index(key, n);
 
 		/* This bit of code is a bit tricky but it combines multiple
 		 * checks into a single check. The prefix consists of the
 		 * prefix plus zeros for the bits in the cindex. The index
 		 * is the difference between the key and this value. From
 		 * this we can actually derive several pieces of data.
-		 * if (index & (~0ul << bits))
+		 * if (index >= (1ul << bits))
 		 *   we have a mismatch in skip bits and failed
 		 * else
 		 *   we know the value is cindex
+		 *
+		 * This check is safe even if bits == KEYLENGTH due to the
+		 * fact that we can only allocate a node with 32 bits if a
+		 * long is greater than 32 bits.
 		 */
-		if (index & (~0ul << n->bits))
+		if (index >= (1ul << n->bits)) {
-			return NULL;
+			n = NULL;
-
-		/* we have found a leaf. Prefixes have already been compared */
-		if (IS_LEAF(n))
 			break;
+		}
 
-		n = tnode_get_child_rcu(n, index);
-	}
+		/* keep searching until we find a perfect match leaf or NULL */
+	} while (IS_TNODE(n));
+
+	*tp = pn;
 
 	return n;
 }
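The rewritten check folds two questions, "do the skipped prefix bits match?" and "which child slot?", into one unsigned comparison: any mismatch above the node's bits field pushes index past 1ul << bits. A self-contained userspace sketch with illustrative node parameters:

#include <stdio.h>

typedef unsigned int t_key;

struct node { t_key key; unsigned char pos, bits; };

static void probe(const struct node *n, t_key key)
{
	unsigned long index = ((unsigned long)(key ^ n->key)) >> n->pos;

	if (index >= (1ul << n->bits))
		printf("key 0x%08x: prefix mismatch\n", key);
	else
		printf("key 0x%08x: descend to child %lu\n", key, index);
}

int main(void)
{
	/* node covering 192.168.0.0/16, indexing the next two bits */
	struct node n = { .key = 0xc0a80000, .pos = 14, .bits = 2 };

	probe(&n, 0xc0a84001);	/* 192.168.64.1 -> child 1 */
	probe(&n, 0xc0a90001);	/* 192.169.0.1  -> mismatch */
	return 0;
}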
@@ -994,14 +954,23 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
 /* Return the first fib alias matching TOS with
  * priority less than or equal to PRIO.
  */
-static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
+					u8 tos, u32 prio, u32 tb_id)
 {
 	struct fib_alias *fa;
 
 	if (!fah)
 		return NULL;
 
-	list_for_each_entry(fa, fah, fa_list) {
+	hlist_for_each_entry(fa, fah, fa_list) {
+		if (fa->fa_slen < slen)
+			continue;
+		if (fa->fa_slen != slen)
+			break;
+		if (fa->tb_id > tb_id)
+			continue;
+		if (fa->tb_id != tb_id)
+			break;
 		if (fa->fa_tos > tos)
 			continue;
 		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
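The four new continue/break pairs rely on a leaf ordering that fib_insert_alias() (later in this patch) maintains: ascending fa_slen, so more-specific prefixes come first, then descending tb_id within one suffix length. A userspace sketch of that ordering as a comparator; the table numbers match the usual RT_TABLE_LOCAL (255) and RT_TABLE_MAIN (254) values:

#include <stdint.h>
#include <stdio.h>

struct alias_key { uint8_t fa_slen; uint32_t tb_id; };

/* negative: a precedes b in the leaf's hlist */
static int alias_cmp(const struct alias_key *a, const struct alias_key *b)
{
	if (a->fa_slen != b->fa_slen)
		return a->fa_slen < b->fa_slen ? -1 : 1;
	if (a->tb_id != b->tb_id)
		return a->tb_id > b->tb_id ? -1 : 1;
	return 0;
}

int main(void)
{
	struct alias_key host_main = { .fa_slen = 0, .tb_id = 254 }; /* /32 */
	struct alias_key net_local = { .fa_slen = 8, .tb_id = 255 }; /* /24 */
	struct alias_key net_main  = { .fa_slen = 8, .tb_id = 254 }; /* /24 */

	printf("%d\n", alias_cmp(&host_main, &net_local)); /* -1 */
	printf("%d\n", alias_cmp(&net_local, &net_main));  /* -1 */
	return 0;
}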
@@ -1011,77 +980,23 @@ static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
 	return NULL;
 }
 
-static void trie_rebalance(struct trie *t, struct tnode *tn)
+static void trie_rebalance(struct trie *t, struct key_vector *tn)
 {
-	struct tnode *tp;
+	while (!IS_TRIE(tn))
-
+		tn = resize(t, tn);
-	while ((tp = node_parent(tn)) != NULL) {
-		resize(t, tn);
-		tn = tp;
-	}
-
-	/* Handle last (top) tnode */
-	if (IS_TNODE(tn))
-		resize(t, tn);
 }
 
-/* only used from updater-side */
+static int fib_insert_node(struct trie *t, struct key_vector *tp,
-
+			   struct fib_alias *new, t_key key)
-static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 {
-	struct list_head *fa_head = NULL;
+	struct key_vector *n, *l;
-	struct tnode *l, *n, *tp = NULL;
-	struct leaf_info *li;
-
-	li = leaf_info_new(plen);
-	if (!li)
-		return NULL;
-	fa_head = &li->falh;
 
-	n = rtnl_dereference(t->trie);
+	l = leaf_new(key, new);
-
+	if (!l)
-	/* If we point to NULL, stop. Either the tree is empty and we should
+		goto noleaf;
-	 * just put a new leaf in if, or we have reached an empty child slot,
-	 * and we should just put our new leaf in that.
-	 *
-	 * If we hit a node with a key that does't match then we should stop
-	 * and create a new tnode to replace that node and insert ourselves
-	 * and the other node into the new tnode.
-	 */
-	while (n) {
-		unsigned long index = get_index(key, n);
-
-		/* This bit of code is a bit tricky but it combines multiple
-		 * checks into a single check. The prefix consists of the
-		 * prefix plus zeros for the "bits" in the prefix. The index
-		 * is the difference between the key and this value. From
-		 * this we can actually derive several pieces of data.
-		 * if !(index >> bits)
-		 *   we know the value is child index
-		 * else
-		 *   we have a mismatch in skip bits and failed
-		 */
-		if (index >> n->bits)
-			break;
-
-		/* we have found a leaf. Prefixes have already been compared */
-		if (IS_LEAF(n)) {
-			/* Case 1: n is a leaf, and prefixes match*/
-			insert_leaf_info(n, li);
-			return fa_head;
-		}
-
-		tp = n;
-		n = tnode_get_child_rcu(n, index);
-	}
-
-	l = leaf_new(key);
-	if (!l) {
-		free_leaf_info(li);
-		return NULL;
-	}
 
-	insert_leaf_info(l, li);
+	/* retrieve child from parent node */
+	n = get_child(tp, get_index(key, tp));
 
 	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
 	 *
@@ -1090,21 +1005,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 	 * leaves us in position for handling as case 3
 	 */
 	if (n) {
-		struct tnode *tn;
+		struct key_vector *tn;
 
 		tn = tnode_new(key, __fls(key ^ n->key), 1);
-		if (!tn) {
+		if (!tn)
-			free_leaf_info(li);
+			goto notnode;
-			node_free(l);
-			return NULL;
-		}
 
 		/* initialize routes out of node */
 		NODE_INIT_PARENT(tn, tp);
 		put_child(tn, get_index(key, tn) ^ 1, n);
 
 		/* start adding routes into the node */
-		put_child_root(tp, t, key, tn);
+		put_child_root(tp, key, tn);
 		node_set_parent(n, tn);
 
 		/* parent now has a NULL spot where the leaf can go */
@@ -1112,69 +1024,93 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1112 } 1024 }
1113 1025
1114 /* Case 3: n is NULL, and will just insert a new leaf */ 1026 /* Case 3: n is NULL, and will just insert a new leaf */
1115 if (tp) { 1027 NODE_INIT_PARENT(l, tp);
1116 NODE_INIT_PARENT(l, tp); 1028 put_child_root(tp, key, l);
1117 put_child(tp, get_index(key, tp), l); 1029 trie_rebalance(t, tp);
1118 trie_rebalance(t, tp); 1030
1031 return 0;
1032notnode:
1033 node_free(l);
1034noleaf:
1035 return -ENOMEM;
1036}
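
When the descent in fib_insert_node() above lands on a node whose key disagrees with the key being inserted, tnode_new(key, __fls(key ^ n->key), 1) places a one-bit branch point at the most significant bit where the two keys differ. A small sketch of that computation, using __builtin_clz() as a stand-in for the kernel's __fls():

/* Illustrative only: find the branch position for two diverging keys,
 * as fib_insert_node() does with __fls(key ^ n->key). */
#include <stdio.h>
#include <stdint.h>

static unsigned int fls_bit(uint32_t x)    /* index of highest set bit, x != 0 */
{
        return 31 - __builtin_clz(x);
}

int main(void)
{
        uint32_t key  = 0xC0A80100;        /* 192.168.1.0 */
        uint32_t nkey = 0xC0A80200;        /* 192.168.2.0 */
        unsigned int pos = fls_bit(key ^ nkey);

        /* a new tnode with pos=9, bits=1 splits the two keys: bit 9
         * of each key selects its child slot (index 0 or 1) */
        printf("diverge at bit %u: key -> child %u, nkey -> child %u\n",
               pos, (key >> pos) & 1, (nkey >> pos) & 1);
        return 0;
}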
1037
1038static int fib_insert_alias(struct trie *t, struct key_vector *tp,
1039 struct key_vector *l, struct fib_alias *new,
1040 struct fib_alias *fa, t_key key)
1041{
1042 if (!l)
1043 return fib_insert_node(t, tp, new, key);
1044
1045 if (fa) {
1046 hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
1119 } else { 1047 } else {
1120 rcu_assign_pointer(t->trie, l); 1048 struct fib_alias *last;
1049
1050 hlist_for_each_entry(last, &l->leaf, fa_list) {
1051 if (new->fa_slen < last->fa_slen)
1052 break;
1053 if ((new->fa_slen == last->fa_slen) &&
1054 (new->tb_id > last->tb_id))
1055 break;
1056 fa = last;
1057 }
1058
1059 if (fa)
1060 hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
1061 else
1062 hlist_add_head_rcu(&new->fa_list, &l->leaf);
1121 } 1063 }
1122 1064
1123 return fa_head; 1065 /* if we added to the tail node then we need to update slen */
1066 if (l->slen < new->fa_slen) {
1067 l->slen = new->fa_slen;
1068 leaf_push_suffix(tp, l);
1069 }
1070
1071 return 0;
1124} 1072}
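
fib_insert_alias() above keeps each leaf's list ordered so lookups can stop early, and when the new alias lands at the tail with a larger slen it raises the leaf's slen and propagates it upward via leaf_push_suffix(). A sketch of just the position scan, using the same comparisons; the array and names are stand-ins:

/* Sketch of the insert-position scan in fib_insert_alias(): walk the
 * ordered list and remember the last entry the new alias must follow. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct alias { uint8_t slen; uint32_t tb_id; };

static size_t insert_pos(const struct alias *a, size_t n,
                         const struct alias *new)
{
        size_t pos = 0;

        for (size_t i = 0; i < n; i++) {
                if (new->slen < a[i].slen)
                        break;
                if (new->slen == a[i].slen && new->tb_id > a[i].tb_id)
                        break;
                pos = i + 1;               /* insert after a[i] */
        }
        return pos;
}

int main(void)
{
        struct alias list[] = { { 0, 255 }, { 8, 255 }, { 16, 255 } };
        struct alias new = { 8, 254 };     /* /24 route, tb_id 254 */

        /* lands after the slen==8/tb_id==255 entry: index 2 */
        printf("insert at index %zu\n", insert_pos(list, 3, &new));
        return 0;
}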
1125 1073
1126/* 1074/* Caller must hold RTNL. */
1127 * Caller must hold RTNL.
1128 */
1129int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) 1075int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1130{ 1076{
1131 struct trie *t = (struct trie *) tb->tb_data; 1077 struct trie *t = (struct trie *)tb->tb_data;
1132 struct fib_alias *fa, *new_fa; 1078 struct fib_alias *fa, *new_fa;
1133 struct list_head *fa_head = NULL; 1079 struct key_vector *l, *tp;
1134 struct fib_info *fi; 1080 struct fib_info *fi;
1135 int plen = cfg->fc_dst_len; 1081 u8 plen = cfg->fc_dst_len;
1082 u8 slen = KEYLENGTH - plen;
1136 u8 tos = cfg->fc_tos; 1083 u8 tos = cfg->fc_tos;
1137 u32 key, mask; 1084 u32 key;
1138 int err; 1085 int err;
1139 struct tnode *l;
1140 1086
1141 if (plen > 32) 1087 if (plen > KEYLENGTH)
1142 return -EINVAL; 1088 return -EINVAL;
1143 1089
1144 key = ntohl(cfg->fc_dst); 1090 key = ntohl(cfg->fc_dst);
1145 1091
1146 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); 1092 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1147 1093
1148 mask = ntohl(inet_make_mask(plen)); 1094 if ((plen < KEYLENGTH) && (key << plen))
1149
1150 if (key & ~mask)
1151 return -EINVAL; 1095 return -EINVAL;
1152 1096
1153 key = key & mask;
1154
1155 fi = fib_create_info(cfg); 1097 fi = fib_create_info(cfg);
1156 if (IS_ERR(fi)) { 1098 if (IS_ERR(fi)) {
1157 err = PTR_ERR(fi); 1099 err = PTR_ERR(fi);
1158 goto err; 1100 goto err;
1159 } 1101 }
1160 1102
1161 l = fib_find_node(t, key); 1103 l = fib_find_node(t, &tp, key);
1162 fa = NULL; 1104 fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
1163 1105 tb->tb_id) : NULL;
1164 if (l) {
1165 fa_head = get_fa_head(l, plen);
1166 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1167 }
1168 1106
1169 /* Now fa, if non-NULL, points to the first fib alias 1107 /* Now fa, if non-NULL, points to the first fib alias
1170 * with the same keys [prefix,tos,priority], if such key already 1108 * with the same keys [prefix,tos,priority], if such key already
1171 * exists or to the node before which we will insert new one. 1109 * exists or to the node before which we will insert new one.
1172 * 1110 *
1173 * If fa is NULL, we will need to allocate a new one and 1111 * If fa is NULL, we will need to allocate a new one and
1174 * insert to the head of f. 1112 * insert to the tail of the section matching the suffix length
1175 * 1113 * of the new alias.
1176 * If f is NULL, no fib node matched the destination key
1177 * and we need to allocate a new one of those as well.
1178 */ 1114 */
1179 1115
1180 if (fa && fa->fa_tos == tos && 1116 if (fa && fa->fa_tos == tos &&
@@ -1192,9 +1128,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1192 */ 1128 */
1193 fa_match = NULL; 1129 fa_match = NULL;
1194 fa_first = fa; 1130 fa_first = fa;
1195 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); 1131 hlist_for_each_entry_from(fa, fa_list) {
1196 list_for_each_entry_continue(fa, fa_head, fa_list) { 1132 if ((fa->fa_slen != slen) ||
1197 if (fa->fa_tos != tos) 1133 (fa->tb_id != tb->tb_id) ||
1134 (fa->fa_tos != tos))
1198 break; 1135 break;
1199 if (fa->fa_info->fib_priority != fi->fib_priority) 1136 if (fa->fa_info->fib_priority != fi->fib_priority)
1200 break; 1137 break;
@@ -1217,7 +1154,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1217 } 1154 }
1218 err = -ENOBUFS; 1155 err = -ENOBUFS;
1219 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); 1156 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1220 if (new_fa == NULL) 1157 if (!new_fa)
1221 goto out; 1158 goto out;
1222 1159
1223 fi_drop = fa->fa_info; 1160 fi_drop = fa->fa_info;
@@ -1226,8 +1163,21 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1226 new_fa->fa_type = cfg->fc_type; 1163 new_fa->fa_type = cfg->fc_type;
1227 state = fa->fa_state; 1164 state = fa->fa_state;
1228 new_fa->fa_state = state & ~FA_S_ACCESSED; 1165 new_fa->fa_state = state & ~FA_S_ACCESSED;
1166 new_fa->fa_slen = fa->fa_slen;
1167
1168 err = netdev_switch_fib_ipv4_add(key, plen, fi,
1169 new_fa->fa_tos,
1170 cfg->fc_type,
1171 cfg->fc_nlflags,
1172 tb->tb_id);
1173 if (err) {
1174 netdev_switch_fib_ipv4_abort(fi);
1175 kmem_cache_free(fn_alias_kmem, new_fa);
1176 goto out;
1177 }
1178
1179 hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1229 1180
1230 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1231 alias_free_mem_rcu(fa); 1181 alias_free_mem_rcu(fa);
1232 1182
1233 fib_release_info(fi_drop); 1183 fib_release_info(fi_drop);
@@ -1254,37 +1204,42 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1254 1204
1255 err = -ENOBUFS; 1205 err = -ENOBUFS;
1256 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); 1206 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1257 if (new_fa == NULL) 1207 if (!new_fa)
1258 goto out; 1208 goto out;
1259 1209
1260 new_fa->fa_info = fi; 1210 new_fa->fa_info = fi;
1261 new_fa->fa_tos = tos; 1211 new_fa->fa_tos = tos;
1262 new_fa->fa_type = cfg->fc_type; 1212 new_fa->fa_type = cfg->fc_type;
1263 new_fa->fa_state = 0; 1213 new_fa->fa_state = 0;
1264 /* 1214 new_fa->fa_slen = slen;
1265 * Insert new entry to the list. 1215 new_fa->tb_id = tb->tb_id;
1266 */ 1216
1267 1217 /* (Optionally) offload fib entry to switch hardware. */
1268 if (!fa_head) { 1218 err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
1269 fa_head = fib_insert_node(t, key, plen); 1219 cfg->fc_type,
1270 if (unlikely(!fa_head)) { 1220 cfg->fc_nlflags,
1271 err = -ENOMEM; 1221 tb->tb_id);
1272 goto out_free_new_fa; 1222 if (err) {
1273 } 1223 netdev_switch_fib_ipv4_abort(fi);
1224 goto out_free_new_fa;
1274 } 1225 }
1275 1226
1227 /* Insert new entry to the list. */
1228 err = fib_insert_alias(t, tp, l, new_fa, fa, key);
1229 if (err)
1230 goto out_sw_fib_del;
1231
1276 if (!plen) 1232 if (!plen)
1277 tb->tb_num_default++; 1233 tb->tb_num_default++;
1278 1234
1279 list_add_tail_rcu(&new_fa->fa_list,
1280 (fa ? &fa->fa_list : fa_head));
1281
1282 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1235 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1283 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, 1236 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
1284 &cfg->fc_nlinfo, 0); 1237 &cfg->fc_nlinfo, 0);
1285succeeded: 1238succeeded:
1286 return 0; 1239 return 0;
1287 1240
1241out_sw_fib_del:
1242 netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
1288out_free_new_fa: 1243out_free_new_fa:
1289 kmem_cache_free(fn_alias_kmem, new_fa); 1244 kmem_cache_free(fn_alias_kmem, new_fa);
1290out: 1245out:
@@ -1293,7 +1248,7 @@ err:
1293 return err; 1248 return err;
1294} 1249}
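
The rewritten fib_table_insert() above validates the prefix with (plen < KEYLENGTH) && (key << plen) instead of building a netmask: shifting the prefix bits out must leave nothing behind, otherwise host bits were set beyond the prefix. A quick user-space check that the two forms agree (helper names are made up):

/* Demonstrates that the new prefix check in fib_table_insert(),
 *   (plen < 32) && (key << plen),
 * rejects exactly the keys the old mask test rejected. */
#include <stdio.h>
#include <stdint.h>

static int old_check(uint32_t key, int plen)
{
        uint32_t mask = plen ? ~0u << (32 - plen) : 0;

        return (key & ~mask) != 0;          /* nonzero: invalid */
}

static int new_check(uint32_t key, int plen)
{
        return (plen < 32) && ((uint32_t)(key << plen) != 0);
}

int main(void)
{
        /* 192.168.1.1/24 has host bits set: both checks reject it;
         * 192.168.1.0/24 passes both. */
        uint32_t bad = 0xC0A80101, good = 0xC0A80100;

        printf("bad:  old=%d new=%d\n", old_check(bad, 24), new_check(bad, 24));
        printf("good: old=%d new=%d\n", old_check(good, 24), new_check(good, 24));
        return 0;
}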
1295 1250
1296static inline t_key prefix_mismatch(t_key key, struct tnode *n) 1251static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
1297{ 1252{
1298 t_key prefix = n->key; 1253 t_key prefix = n->key;
1299 1254
@@ -1304,16 +1259,20 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1304int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, 1259int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1305 struct fib_result *res, int fib_flags) 1260 struct fib_result *res, int fib_flags)
1306{ 1261{
1307 struct trie *t = (struct trie *)tb->tb_data; 1262 struct trie *t = (struct trie *) tb->tb_data;
1308#ifdef CONFIG_IP_FIB_TRIE_STATS 1263#ifdef CONFIG_IP_FIB_TRIE_STATS
1309 struct trie_use_stats __percpu *stats = t->stats; 1264 struct trie_use_stats __percpu *stats = t->stats;
1310#endif 1265#endif
1311 const t_key key = ntohl(flp->daddr); 1266 const t_key key = ntohl(flp->daddr);
1312 struct tnode *n, *pn; 1267 struct key_vector *n, *pn;
1313 struct leaf_info *li; 1268 struct fib_alias *fa;
1269 unsigned long index;
1314 t_key cindex; 1270 t_key cindex;
1315 1271
1316 n = rcu_dereference(t->trie); 1272 pn = t->kv;
1273 cindex = 0;
1274
1275 n = get_child_rcu(pn, cindex);
1317 if (!n) 1276 if (!n)
1318 return -EAGAIN; 1277 return -EAGAIN;
1319 1278
@@ -1321,24 +1280,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1321 this_cpu_inc(stats->gets); 1280 this_cpu_inc(stats->gets);
1322#endif 1281#endif
1323 1282
1324 pn = n;
1325 cindex = 0;
1326
1327 /* Step 1: Travel to the longest prefix match in the trie */ 1283 /* Step 1: Travel to the longest prefix match in the trie */
1328 for (;;) { 1284 for (;;) {
1329 unsigned long index = get_index(key, n); 1285 index = get_cindex(key, n);
1330 1286
1331 /* This bit of code is a bit tricky but it combines multiple 1287 /* This bit of code is a bit tricky but it combines multiple
1332 * checks into a single check. The prefix consists of the 1288 * checks into a single check. The prefix consists of the
1333 * prefix plus zeros for the "bits" in the prefix. The index 1289 * prefix plus zeros for the "bits" in the prefix. The index
1334 * is the difference between the key and this value. From 1290 * is the difference between the key and this value. From
1335 * this we can actually derive several pieces of data. 1291 * this we can actually derive several pieces of data.
1336 * if (index & (~0ul << bits)) 1292 * if (index >= (1ul << bits))
1337 * we have a mismatch in skip bits and failed 1293 * we have a mismatch in skip bits and failed
1338 * else 1294 * else
1339 * we know the value is cindex 1295 * we know the value is cindex
1296 *
1297 * This check is safe even if bits == KEYLENGTH due to the
1298 * fact that we can only allocate a node with 32 bits if a
1299 * long is greater than 32 bits.
1340 */ 1300 */
1341 if (index & (~0ul << n->bits)) 1301 if (index >= (1ul << n->bits))
1342 break; 1302 break;
1343 1303
1344 /* we have found a leaf. Prefixes have already been compared */ 1304 /* we have found a leaf. Prefixes have already been compared */
@@ -1353,7 +1313,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1353 cindex = index; 1313 cindex = index;
1354 } 1314 }
1355 1315
1356 n = tnode_get_child_rcu(n, index); 1316 n = get_child_rcu(n, index);
1357 if (unlikely(!n)) 1317 if (unlikely(!n))
1358 goto backtrace; 1318 goto backtrace;
1359 } 1319 }
@@ -1361,7 +1321,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1361 /* Step 2: Sort out leaves and begin backtracing for longest prefix */ 1321 /* Step 2: Sort out leaves and begin backtracing for longest prefix */
1362 for (;;) { 1322 for (;;) {
1363 /* record the pointer where our next node pointer is stored */ 1323 /* record the pointer where our next node pointer is stored */
1364 struct tnode __rcu **cptr = n->child; 1324 struct key_vector __rcu **cptr = n->tnode;
1365 1325
1366 /* This test verifies that none of the bits that differ 1326 /* This test verifies that none of the bits that differ
1367 * between the key and the prefix exist in the region of 1327 * between the key and the prefix exist in the region of
@@ -1393,13 +1353,17 @@ backtrace:
1393 while (!cindex) { 1353 while (!cindex) {
1394 t_key pkey = pn->key; 1354 t_key pkey = pn->key;
1395 1355
1396 pn = node_parent_rcu(pn); 1356 /* If we don't have a parent then there is
1397 if (unlikely(!pn)) 1357 * nothing for us to do as we do not have any
1358 * further nodes to parse.
1359 */
1360 if (IS_TRIE(pn))
1398 return -EAGAIN; 1361 return -EAGAIN;
1399#ifdef CONFIG_IP_FIB_TRIE_STATS 1362#ifdef CONFIG_IP_FIB_TRIE_STATS
1400 this_cpu_inc(stats->backtrack); 1363 this_cpu_inc(stats->backtrack);
1401#endif 1364#endif
1402 /* Get Child's index */ 1365 /* Get Child's index */
1366 pn = node_parent_rcu(pn);
1403 cindex = get_index(pkey, pn); 1367 cindex = get_index(pkey, pn);
1404 } 1368 }
1405 1369
@@ -1407,138 +1371,134 @@ backtrace:
1407 cindex &= cindex - 1; 1371 cindex &= cindex - 1;
1408 1372
1409 /* grab pointer for next child node */ 1373 /* grab pointer for next child node */
1410 cptr = &pn->child[cindex]; 1374 cptr = &pn->tnode[cindex];
1411 } 1375 }
1412 } 1376 }
1413 1377
1414found: 1378found:
1379 /* this line carries forward the xor from earlier in the function */
1380 index = key ^ n->key;
1381
1415 /* Step 3: Process the leaf, if that fails fall back to backtracing */ 1382 /* Step 3: Process the leaf, if that fails fall back to backtracing */
1416 hlist_for_each_entry_rcu(li, &n->list, hlist) { 1383 hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
1417 struct fib_alias *fa; 1384 struct fib_info *fi = fa->fa_info;
1385 int nhsel, err;
1418 1386
1419 if ((key ^ n->key) & li->mask_plen) 1387 if ((index >= (1ul << fa->fa_slen)) &&
1388 ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
1420 continue; 1389 continue;
1421 1390 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1422 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 1391 continue;
1423 struct fib_info *fi = fa->fa_info; 1392 if (fi->fib_dead)
1424 int nhsel, err; 1393 continue;
1425 1394 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1426 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) 1395 continue;
1427 continue; 1396 fib_alias_accessed(fa);
1428 if (fi->fib_dead) 1397 err = fib_props[fa->fa_type].error;
1429 continue; 1398 if (unlikely(err < 0)) {
1430 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1431 continue;
1432 fib_alias_accessed(fa);
1433 err = fib_props[fa->fa_type].error;
1434 if (unlikely(err < 0)) {
1435#ifdef CONFIG_IP_FIB_TRIE_STATS 1399#ifdef CONFIG_IP_FIB_TRIE_STATS
1436 this_cpu_inc(stats->semantic_match_passed); 1400 this_cpu_inc(stats->semantic_match_passed);
1437#endif 1401#endif
1438 return err; 1402 return err;
1439 } 1403 }
1440 if (fi->fib_flags & RTNH_F_DEAD) 1404 if (fi->fib_flags & RTNH_F_DEAD)
1405 continue;
1406 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1407 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1408
1409 if (nh->nh_flags & RTNH_F_DEAD)
1441 continue; 1410 continue;
1442 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { 1411 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1443 const struct fib_nh *nh = &fi->fib_nh[nhsel]; 1412 continue;
1444 1413
1445 if (nh->nh_flags & RTNH_F_DEAD) 1414 if (!(fib_flags & FIB_LOOKUP_NOREF))
1446 continue; 1415 atomic_inc(&fi->fib_clntref);
1447 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) 1416
1448 continue; 1417 res->prefixlen = KEYLENGTH - fa->fa_slen;
1449 1418 res->nh_sel = nhsel;
1450 if (!(fib_flags & FIB_LOOKUP_NOREF)) 1419 res->type = fa->fa_type;
1451 atomic_inc(&fi->fib_clntref); 1420 res->scope = fi->fib_scope;
1452 1421 res->fi = fi;
1453 res->prefixlen = li->plen; 1422 res->table = tb;
1454 res->nh_sel = nhsel; 1423 res->fa_head = &n->leaf;
1455 res->type = fa->fa_type;
1456 res->scope = fi->fib_scope;
1457 res->fi = fi;
1458 res->table = tb;
1459 res->fa_head = &li->falh;
1460#ifdef CONFIG_IP_FIB_TRIE_STATS 1424#ifdef CONFIG_IP_FIB_TRIE_STATS
1461 this_cpu_inc(stats->semantic_match_passed); 1425 this_cpu_inc(stats->semantic_match_passed);
1462#endif 1426#endif
1463 return err; 1427 return err;
1464 }
1465 } 1428 }
1466 1429 }
1467#ifdef CONFIG_IP_FIB_TRIE_STATS 1430#ifdef CONFIG_IP_FIB_TRIE_STATS
1468 this_cpu_inc(stats->semantic_match_miss); 1431 this_cpu_inc(stats->semantic_match_miss);
1469#endif 1432#endif
1470 }
1471 goto backtrace; 1433 goto backtrace;
1472} 1434}
1473EXPORT_SYMBOL_GPL(fib_table_lookup); 1435EXPORT_SYMBOL_GPL(fib_table_lookup);
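
Two bit tricks carry the lookup above. While descending, index = (key ^ n->key) >> n->pos must fit in n->bits, or some skipped bit mismatched; at the leaf, a prefix of length plen matches iff key ^ n->key < (1ul << fa_slen), with the extra BITS_PER_LONG clause only guarding the fa_slen == KEYLENGTH shift that would be undefined on 32-bit longs. A worked example with made-up pos/bits values:

/* Worked example of the lookup bit tests in fib_table_lookup().
 * Node layout assumed: pos = lowest bit of the child-index field,
 * bits = width of that field (values below are hypothetical). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t key  = 0xC0A80137;         /* 192.168.1.55 */
        uint32_t nkey = 0xC0A80100;         /* node covering 192.168.1.0 */
        unsigned int pos = 6, bits = 2;     /* children indexed by key[7:6] */

        unsigned long index = (unsigned long)((key ^ nkey) >> pos);

        if (index >= (1ul << bits))
                printf("skip-bit mismatch, backtrack\n");
        else
                printf("descend into child %lu\n", index);

        /* leaf test: does key fall inside 192.168.1.0/24 (slen = 8)? */
        unsigned long xorbits = key ^ nkey;
        unsigned int slen = 8;

        printf("/24 match: %s\n", xorbits < (1ul << slen) ? "yes" : "no");
        return 0;
}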
1474 1436
1475/* 1437static void fib_remove_alias(struct trie *t, struct key_vector *tp,
1476 * Remove the leaf and return parent. 1438 struct key_vector *l, struct fib_alias *old)
1477 */
1478static void trie_leaf_remove(struct trie *t, struct tnode *l)
1479{ 1439{
1480 struct tnode *tp = node_parent(l); 1440 /* record the location of the previous fib_alias entry */
1441 struct hlist_node **pprev = old->fa_list.pprev;
1442 struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
1481 1443
1482 pr_debug("entering trie_leaf_remove(%p)\n", l); 1444 /* remove the fib_alias from the list */
1445 hlist_del_rcu(&old->fa_list);
1483 1446
1484 if (tp) { 1447 /* if we emptied the list this leaf will be freed and we can sort
1485 put_child(tp, get_index(l->key, tp), NULL); 1448 * out parent suffix lengths as a part of trie_rebalance
1449 */
1450 if (hlist_empty(&l->leaf)) {
1451 put_child_root(tp, l->key, NULL);
1452 node_free(l);
1486 trie_rebalance(t, tp); 1453 trie_rebalance(t, tp);
1487 } else { 1454 return;
1488 RCU_INIT_POINTER(t->trie, NULL);
1489 } 1455 }
1490 1456
1491 node_free(l); 1457 /* only access fa if it is pointing at the last valid hlist_node */
1458 if (*pprev)
1459 return;
1460
1461 /* update the trie with the latest suffix length */
1462 l->slen = fa->fa_slen;
1463 leaf_pull_suffix(tp, l);
1492} 1464}
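
fib_remove_alias() above reaches the previous alias through the pprev back-pointer saved before deletion; once hlist_del_rcu() runs, *pprev == NULL proves the removed entry was the tail, so that previous alias now holds the leaf's largest slen (the empty-list case is handled first, so the head pointer is never mis-cast). A user-space replica of the pointer arithmetic, with a hand-rolled hlist:

/* User-space replica of the pprev trick in fib_remove_alias():
 * recover the previous list entry through the pprev back-pointer. */
#include <stdio.h>
#include <stddef.h>

struct hnode { struct hnode *next, **pprev; };
struct alias { int slen; struct hnode node; };

#define node_to_alias(pp) \
        ((struct alias *)((char *)(pp) - offsetof(struct alias, node.next)))

static void hdel(struct hnode *n)
{
        *n->pprev = n->next;
        if (n->next)
                n->next->pprev = n->pprev;
}

int main(void)
{
        struct alias a = { 8 }, b = { 16 };
        struct hnode *head = &a.node;

        a.node.next = &b.node;  a.node.pprev = &head;
        b.node.next = NULL;     b.node.pprev = &a.node.next;

        /* delete the tail entry, then look back through pprev */
        struct hnode **pprev = b.node.pprev;

        hdel(&b.node);
        if (!*pprev)            /* tail removed: pprev points into 'a' */
                printf("new largest slen = %d\n", node_to_alias(pprev)->slen);
        return 0;
}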
1493 1465
1494/* 1466/* Caller must hold RTNL. */
1495 * Caller must hold RTNL.
1496 */
1497int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) 1467int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1498{ 1468{
1499 struct trie *t = (struct trie *) tb->tb_data; 1469 struct trie *t = (struct trie *) tb->tb_data;
1500 u32 key, mask;
1501 int plen = cfg->fc_dst_len;
1502 u8 tos = cfg->fc_tos;
1503 struct fib_alias *fa, *fa_to_delete; 1470 struct fib_alias *fa, *fa_to_delete;
1504 struct list_head *fa_head; 1471 struct key_vector *l, *tp;
1505 struct tnode *l; 1472 u8 plen = cfg->fc_dst_len;
1506 struct leaf_info *li; 1473 u8 slen = KEYLENGTH - plen;
1474 u8 tos = cfg->fc_tos;
1475 u32 key;
1507 1476
1508 if (plen > 32) 1477 if (plen > KEYLENGTH)
1509 return -EINVAL; 1478 return -EINVAL;
1510 1479
1511 key = ntohl(cfg->fc_dst); 1480 key = ntohl(cfg->fc_dst);
1512 mask = ntohl(inet_make_mask(plen));
1513 1481
1514 if (key & ~mask) 1482 if ((plen < KEYLENGTH) && (key << plen))
1515 return -EINVAL; 1483 return -EINVAL;
1516 1484
1517 key = key & mask; 1485 l = fib_find_node(t, &tp, key);
1518 l = fib_find_node(t, key);
1519
1520 if (!l) 1486 if (!l)
1521 return -ESRCH; 1487 return -ESRCH;
1522 1488
1523 li = find_leaf_info(l, plen); 1489 fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
1524
1525 if (!li)
1526 return -ESRCH;
1527
1528 fa_head = &li->falh;
1529 fa = fib_find_alias(fa_head, tos, 0);
1530
1531 if (!fa) 1490 if (!fa)
1532 return -ESRCH; 1491 return -ESRCH;
1533 1492
1534 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); 1493 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1535 1494
1536 fa_to_delete = NULL; 1495 fa_to_delete = NULL;
1537 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); 1496 hlist_for_each_entry_from(fa, fa_list) {
1538 list_for_each_entry_continue(fa, fa_head, fa_list) {
1539 struct fib_info *fi = fa->fa_info; 1497 struct fib_info *fi = fa->fa_info;
1540 1498
1541 if (fa->fa_tos != tos) 1499 if ((fa->fa_slen != slen) ||
1500 (fa->tb_id != tb->tb_id) ||
1501 (fa->fa_tos != tos))
1542 break; 1502 break;
1543 1503
1544 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && 1504 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -1557,240 +1517,397 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1557 if (!fa_to_delete) 1517 if (!fa_to_delete)
1558 return -ESRCH; 1518 return -ESRCH;
1559 1519
1560 fa = fa_to_delete; 1520 netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
1561 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, 1521 cfg->fc_type, tb->tb_id);
1562 &cfg->fc_nlinfo, 0);
1563 1522
1564 list_del_rcu(&fa->fa_list); 1523 rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
1524 &cfg->fc_nlinfo, 0);
1565 1525
1566 if (!plen) 1526 if (!plen)
1567 tb->tb_num_default--; 1527 tb->tb_num_default--;
1568 1528
1569 if (list_empty(fa_head)) { 1529 fib_remove_alias(t, tp, l, fa_to_delete);
1570 remove_leaf_info(l, li);
1571 free_leaf_info(li);
1572 }
1573 1530
1574 if (hlist_empty(&l->list)) 1531 if (fa_to_delete->fa_state & FA_S_ACCESSED)
1575 trie_leaf_remove(t, l);
1576
1577 if (fa->fa_state & FA_S_ACCESSED)
1578 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1532 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1579 1533
1580 fib_release_info(fa->fa_info); 1534 fib_release_info(fa_to_delete->fa_info);
1581 alias_free_mem_rcu(fa); 1535 alias_free_mem_rcu(fa_to_delete);
1582 return 0; 1536 return 0;
1583} 1537}
1584 1538
1585static int trie_flush_list(struct list_head *head) 1539/* Scan for the next leaf starting at the provided key value */
1540static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
1586{ 1541{
1587 struct fib_alias *fa, *fa_node; 1542 struct key_vector *pn, *n = *tn;
1588 int found = 0; 1543 unsigned long cindex;
1589 1544
1590 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1545 /* this loop is meant to try and find the key in the trie */
1591 struct fib_info *fi = fa->fa_info; 1546 do {
1547 /* record parent and next child index */
1548 pn = n;
1549 cindex = key ? get_index(key, pn) : 0;
1592 1550
1593 if (fi && (fi->fib_flags & RTNH_F_DEAD)) { 1551 if (cindex >> pn->bits)
1594 list_del_rcu(&fa->fa_list); 1552 break;
1595 fib_release_info(fa->fa_info); 1553
1596 alias_free_mem_rcu(fa); 1554 /* descend into the next child */
1597 found++; 1555 n = get_child_rcu(pn, cindex++);
1556 if (!n)
1557 break;
1558
1559 /* guarantee forward progress on the keys */
1560 if (IS_LEAF(n) && (n->key >= key))
1561 goto found;
1562 } while (IS_TNODE(n));
1563
1564 /* this loop will search for the next leaf with a greater key */
1565 while (!IS_TRIE(pn)) {
1566 /* if we exhausted the parent node we will need to climb */
1567 if (cindex >= (1ul << pn->bits)) {
1568 t_key pkey = pn->key;
1569
1570 pn = node_parent_rcu(pn);
1571 cindex = get_index(pkey, pn) + 1;
1572 continue;
1598 } 1573 }
1574
1575 /* grab the next available node */
1576 n = get_child_rcu(pn, cindex++);
1577 if (!n)
1578 continue;
1579
1580 /* no need to compare keys since we bumped the index */
1581 if (IS_LEAF(n))
1582 goto found;
1583
1584 /* Restart scanning in the new node */
1585 pn = n;
1586 cindex = 0;
1599 } 1587 }
1600 return found; 1588
1589 *tn = pn;
1590 return NULL; /* Root of trie */
1591found:
1592 /* if we are at the limit for keys just return NULL for the tnode */
1593 *tn = pn;
1594 return n;
1601} 1595}
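
leaf_walk_rcu() above promises the first leaf whose key is greater than or equal to the requested key, which is what lets its callers iterate with key = l->key + 1 and treat a u32 wrap as the end marker. The contract in miniature, with a sorted array standing in for the trie:

/* Stand-in for the leaf_walk_rcu() iteration contract: return the
 * first stored key >= the requested key, resume with key + 1, and
 * stop when the u32 key wraps back to 0. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static const uint32_t leaves[] = { 0x00000000, 0xC0A80100, 0xFFFFFFFF };

static const uint32_t *walk(uint32_t key)
{
        for (size_t i = 0; i < sizeof(leaves) / sizeof(leaves[0]); i++)
                if (leaves[i] >= key)
                        return &leaves[i];
        return NULL;
}

int main(void)
{
        uint32_t key = 0;
        const uint32_t *l;

        while ((l = walk(key)) != NULL) {
                printf("leaf %08x\n", *l);
                key = *l + 1;
                if (key < *l)   /* wrapped past 0xFFFFFFFF: done */
                        break;
        }
        return 0;
}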
1602 1596
1603static int trie_flush_leaf(struct tnode *l) 1597static void fib_trie_free(struct fib_table *tb)
1604{ 1598{
1605 int found = 0; 1599 struct trie *t = (struct trie *)tb->tb_data;
1606 struct hlist_head *lih = &l->list; 1600 struct key_vector *pn = t->kv;
1601 unsigned long cindex = 1;
1607 struct hlist_node *tmp; 1602 struct hlist_node *tmp;
1608 struct leaf_info *li = NULL; 1603 struct fib_alias *fa;
1609 unsigned char plen = KEYLENGTH; 1604
1605 /* walk trie in reverse order and free everything */
1606 for (;;) {
1607 struct key_vector *n;
1608
1609 if (!(cindex--)) {
1610 t_key pkey = pn->key;
1611
1612 if (IS_TRIE(pn))
1613 break;
1614
1615 n = pn;
1616 pn = node_parent(pn);
1610 1617
1611 hlist_for_each_entry_safe(li, tmp, lih, hlist) { 1618 /* drop emptied tnode */
1612 found += trie_flush_list(&li->falh); 1619 put_child_root(pn, n->key, NULL);
1620 node_free(n);
1621
1622 cindex = get_index(pkey, pn);
1613 1623
1614 if (list_empty(&li->falh)) {
1615 hlist_del_rcu(&li->hlist);
1616 free_leaf_info(li);
1617 continue; 1624 continue;
1618 } 1625 }
1619 1626
1620 plen = li->plen; 1627 /* grab the next available node */
1621 } 1628 n = get_child(pn, cindex);
1629 if (!n)
1630 continue;
1622 1631
1623 l->slen = KEYLENGTH - plen; 1632 if (IS_TNODE(n)) {
1633 /* record pn and cindex for leaf walking */
1634 pn = n;
1635 cindex = 1ul << n->bits;
1624 1636
1625 return found; 1637 continue;
1638 }
1639
1640 hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1641 hlist_del_rcu(&fa->fa_list);
1642 alias_free_mem_rcu(fa);
1643 }
1644
1645 put_child_root(pn, n->key, NULL);
1646 node_free(n);
1647 }
1648
1649#ifdef CONFIG_IP_FIB_TRIE_STATS
1650 free_percpu(t->stats);
1651#endif
1652 kfree(tb);
1626} 1653}
1627 1654
1628/* 1655struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
1629 * Scan for the next right leaf starting at node p->child[idx]
1630 * Since we have back pointer, no recursion necessary.
1631 */
1632static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1633{ 1656{
1634 do { 1657 struct trie *ot = (struct trie *)oldtb->tb_data;
1635 unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0; 1658 struct key_vector *l, *tp = ot->kv;
1659 struct fib_table *local_tb;
1660 struct fib_alias *fa;
1661 struct trie *lt;
1662 t_key key = 0;
1636 1663
1637 while (idx < tnode_child_length(p)) { 1664 if (oldtb->tb_data == oldtb->__data)
1638 c = tnode_get_child_rcu(p, idx++); 1665 return oldtb;
1639 if (!c) 1666
1667 local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
1668 if (!local_tb)
1669 return NULL;
1670
1671 lt = (struct trie *)local_tb->tb_data;
1672
1673 while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
1674 struct key_vector *local_l = NULL, *local_tp;
1675
1676 hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
1677 struct fib_alias *new_fa;
1678
1679 if (local_tb->tb_id != fa->tb_id)
1640 continue; 1680 continue;
1641 1681
1642 if (IS_LEAF(c)) 1682 /* clone fa for new local table */
1643 return c; 1683 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1684 if (!new_fa)
1685 goto out;
1686
1687 memcpy(new_fa, fa, sizeof(*fa));
1644 1688
1645 /* Rescan start scanning in new node */ 1689 /* insert clone into table */
1646 p = c; 1690 if (!local_l)
1647 idx = 0; 1691 local_l = fib_find_node(lt, &local_tp, l->key);
1692
1693 if (fib_insert_alias(lt, local_tp, local_l, new_fa,
1694 NULL, l->key))
1695 goto out;
1648 } 1696 }
1649 1697
1650 /* Node empty, walk back up to parent */ 1698 /* stop loop if key wrapped back to 0 */
1651 c = p; 1699 key = l->key + 1;
1652 } while ((p = node_parent_rcu(c)) != NULL); 1700 if (key < l->key)
1701 break;
1702 }
1653 1703
1654 return NULL; /* Root of trie */ 1704 return local_tb;
1705out:
1706 fib_trie_free(local_tb);
1707
1708 return NULL;
1655} 1709}
1656 1710
1657static struct tnode *trie_firstleaf(struct trie *t) 1711/* Caller must hold RTNL */
1712void fib_table_flush_external(struct fib_table *tb)
1658{ 1713{
1659 struct tnode *n = rcu_dereference_rtnl(t->trie); 1714 struct trie *t = (struct trie *)tb->tb_data;
1715 struct key_vector *pn = t->kv;
1716 unsigned long cindex = 1;
1717 struct hlist_node *tmp;
1718 struct fib_alias *fa;
1660 1719
1661 if (!n) 1720 /* walk trie in reverse order */
1662 return NULL; 1721 for (;;) {
1722 unsigned char slen = 0;
1723 struct key_vector *n;
1663 1724
1664 if (IS_LEAF(n)) /* trie is just a leaf */ 1725 if (!(cindex--)) {
1665 return n; 1726 t_key pkey = pn->key;
1666 1727
1667 return leaf_walk_rcu(n, NULL); 1728 /* cannot resize the trie vector */
1668} 1729 if (IS_TRIE(pn))
1730 break;
1669 1731
1670static struct tnode *trie_nextleaf(struct tnode *l) 1732 /* resize completed node */
1671{ 1733 pn = resize(t, pn);
1672 struct tnode *p = node_parent_rcu(l); 1734 cindex = get_index(pkey, pn);
1673 1735
1674 if (!p) 1736 continue;
1675 return NULL; /* trie with just one leaf */ 1737 }
1676 1738
1677 return leaf_walk_rcu(p, l); 1739 /* grab the next available node */
1678} 1740 n = get_child(pn, cindex);
1741 if (!n)
1742 continue;
1679 1743
1680static struct tnode *trie_leafindex(struct trie *t, int index) 1744 if (IS_TNODE(n)) {
1681{ 1745 /* record pn and cindex for leaf walking */
1682 struct tnode *l = trie_firstleaf(t); 1746 pn = n;
1747 cindex = 1ul << n->bits;
1683 1748
1684 while (l && index-- > 0) 1749 continue;
1685 l = trie_nextleaf(l); 1750 }
1686 1751
1687 return l; 1752 hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1688} 1753 struct fib_info *fi = fa->fa_info;
1754
1755 /* if alias was cloned to local then we just
1756 * need to remove the local copy from main
1757 */
1758 if (tb->tb_id != fa->tb_id) {
1759 hlist_del_rcu(&fa->fa_list);
1760 alias_free_mem_rcu(fa);
1761 continue;
1762 }
1689 1763
1764 /* record local slen */
1765 slen = fa->fa_slen;
1690 1766
1691/* 1767 if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL))
1692 * Caller must hold RTNL. 1768 continue;
1693 */ 1769
1770 netdev_switch_fib_ipv4_del(n->key,
1771 KEYLENGTH - fa->fa_slen,
1772 fi, fa->fa_tos,
1773 fa->fa_type, tb->tb_id);
1774 }
1775
1776 /* update leaf slen */
1777 n->slen = slen;
1778
1779 if (hlist_empty(&n->leaf)) {
1780 put_child_root(pn, n->key, NULL);
1781 node_free(n);
1782 } else {
1783 leaf_pull_suffix(pn, n);
1784 }
1785 }
1786}
1787
1788/* Caller must hold RTNL. */
1694int fib_table_flush(struct fib_table *tb) 1789int fib_table_flush(struct fib_table *tb)
1695{ 1790{
1696 struct trie *t = (struct trie *) tb->tb_data; 1791 struct trie *t = (struct trie *)tb->tb_data;
1697 struct tnode *l, *ll = NULL; 1792 struct key_vector *pn = t->kv;
1793 unsigned long cindex = 1;
1794 struct hlist_node *tmp;
1795 struct fib_alias *fa;
1698 int found = 0; 1796 int found = 0;
1699 1797
1700 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { 1798 /* walk trie in reverse order */
1701 found += trie_flush_leaf(l); 1799 for (;;) {
1800 unsigned char slen = 0;
1801 struct key_vector *n;
1802
1803 if (!(cindex--)) {
1804 t_key pkey = pn->key;
1702 1805
1703 if (ll) { 1806 /* cannot resize the trie vector */
1704 if (hlist_empty(&ll->list)) 1807 if (IS_TRIE(pn))
1705 trie_leaf_remove(t, ll); 1808 break;
1706 else 1809
1707 leaf_pull_suffix(ll); 1810 /* resize completed node */
1811 pn = resize(t, pn);
1812 cindex = get_index(pkey, pn);
1813
1814 continue;
1708 } 1815 }
1709 1816
1710 ll = l; 1817 /* grab the next available node */
1711 } 1818 n = get_child(pn, cindex);
1819 if (!n)
1820 continue;
1712 1821
1713 if (ll) { 1822 if (IS_TNODE(n)) {
1714 if (hlist_empty(&ll->list)) 1823 /* record pn and cindex for leaf walking */
1715 trie_leaf_remove(t, ll); 1824 pn = n;
1716 else 1825 cindex = 1ul << n->bits;
1717 leaf_pull_suffix(ll); 1826
1827 continue;
1828 }
1829
1830 hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1831 struct fib_info *fi = fa->fa_info;
1832
1833 if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
1834 slen = fa->fa_slen;
1835 continue;
1836 }
1837
1838 netdev_switch_fib_ipv4_del(n->key,
1839 KEYLENGTH - fa->fa_slen,
1840 fi, fa->fa_tos,
1841 fa->fa_type, tb->tb_id);
1842 hlist_del_rcu(&fa->fa_list);
1843 fib_release_info(fa->fa_info);
1844 alias_free_mem_rcu(fa);
1845 found++;
1846 }
1847
1848 /* update leaf slen */
1849 n->slen = slen;
1850
1851 if (hlist_empty(&n->leaf)) {
1852 put_child_root(pn, n->key, NULL);
1853 node_free(n);
1854 } else {
1855 leaf_pull_suffix(pn, n);
1856 }
1718 } 1857 }
1719 1858
1720 pr_debug("trie_flush found=%d\n", found); 1859 pr_debug("trie_flush found=%d\n", found);
1721 return found; 1860 return found;
1722} 1861}
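
Both flush paths above (and fib_trie_free()) share one non-recursive, bottom-up walk: if (!(cindex--)) means the current node's children are exhausted, so climb to the parent and resume at the slots to the left of the child just finished. The same idiom on a plain pointer tree (layout and names invented for the demo; the root, like the trie's kv sentinel, is never visited for freeing):

/* Sketch of the non-recursive post-order walk used by
 * fib_table_flush(): exhaust a node's children right-to-left,
 * then climb and continue with the slots to the left. */
#include <stdio.h>

struct node {
        struct node *parent;
        struct node *child[2];
        unsigned int nchild;            /* child slots (0 for a leaf) */
        unsigned int idx;               /* slot index in parent       */
        const char *name;
};

int main(void)
{
        struct node leaf_a = { 0 }, leaf_b = { 0 }, mid = { 0 }, root = { 0 };

        leaf_a.name = "a"; leaf_b.name = "b"; mid.name = "mid";
        root.name = "root"; root.nchild = 2;
        root.child[0] = &leaf_a; leaf_a.parent = &root; leaf_a.idx = 0;
        root.child[1] = &mid;    mid.parent = &root;    mid.idx = 1;
        mid.nchild = 1;
        mid.child[0] = &leaf_b;  leaf_b.parent = &mid;  leaf_b.idx = 0;

        struct node *pn = &root;
        unsigned long cindex = pn->nchild;

        for (;;) {
                if (!(cindex--)) {              /* node exhausted: climb */
                        if (!pn->parent)
                                break;          /* back at the root: done */
                        printf("done with %s\n", pn->name);
                        cindex = pn->idx;       /* resume left of pn ...  */
                        pn = pn->parent;        /* ... in the parent      */
                        continue;
                }

                struct node *n = pn->child[cindex];

                if (!n)
                        continue;
                if (n->nchild) {                /* descend into tnode */
                        pn = n;
                        cindex = n->nchild;
                        continue;
                }
                printf("leaf %s\n", n->name);   /* children visited first */
        }
        return 0;
}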
1723 1862
1724void fib_free_table(struct fib_table *tb) 1863static void __trie_free_rcu(struct rcu_head *head)
1725{ 1864{
1865 struct fib_table *tb = container_of(head, struct fib_table, rcu);
1726#ifdef CONFIG_IP_FIB_TRIE_STATS 1866#ifdef CONFIG_IP_FIB_TRIE_STATS
1727 struct trie *t = (struct trie *)tb->tb_data; 1867 struct trie *t = (struct trie *)tb->tb_data;
1728 1868
1729 free_percpu(t->stats); 1869 if (tb->tb_data == tb->__data)
1870 free_percpu(t->stats);
1730#endif /* CONFIG_IP_FIB_TRIE_STATS */ 1871#endif /* CONFIG_IP_FIB_TRIE_STATS */
1731 kfree(tb); 1872 kfree(tb);
1732} 1873}
1733 1874
1734static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, 1875void fib_free_table(struct fib_table *tb)
1735 struct fib_table *tb,
1736 struct sk_buff *skb, struct netlink_callback *cb)
1737{ 1876{
1738 int i, s_i; 1877 call_rcu(&tb->rcu, __trie_free_rcu);
1878}
1879
1880static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
1881 struct sk_buff *skb, struct netlink_callback *cb)
1882{
1883 __be32 xkey = htonl(l->key);
1739 struct fib_alias *fa; 1884 struct fib_alias *fa;
1740 __be32 xkey = htonl(key); 1885 int i, s_i;
1741 1886
1742 s_i = cb->args[5]; 1887 s_i = cb->args[4];
1743 i = 0; 1888 i = 0;
1744 1889
1745 /* rcu_read_lock is hold by caller */ 1890 /* rcu_read_lock is hold by caller */
1746 1891 hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
1747 list_for_each_entry_rcu(fa, fah, fa_list) {
1748 if (i < s_i) { 1892 if (i < s_i) {
1749 i++; 1893 i++;
1750 continue; 1894 continue;
1751 } 1895 }
1752 1896
1897 if (tb->tb_id != fa->tb_id) {
1898 i++;
1899 continue;
1900 }
1901
1753 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, 1902 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1754 cb->nlh->nlmsg_seq, 1903 cb->nlh->nlmsg_seq,
1755 RTM_NEWROUTE, 1904 RTM_NEWROUTE,
1756 tb->tb_id, 1905 tb->tb_id,
1757 fa->fa_type, 1906 fa->fa_type,
1758 xkey, 1907 xkey,
1759 plen, 1908 KEYLENGTH - fa->fa_slen,
1760 fa->fa_tos, 1909 fa->fa_tos,
1761 fa->fa_info, NLM_F_MULTI) < 0) { 1910 fa->fa_info, NLM_F_MULTI) < 0) {
1762 cb->args[5] = i;
1763 return -1;
1764 }
1765 i++;
1766 }
1767 cb->args[5] = i;
1768 return skb->len;
1769}
1770
1771static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1772 struct sk_buff *skb, struct netlink_callback *cb)
1773{
1774 struct leaf_info *li;
1775 int i, s_i;
1776
1777 s_i = cb->args[4];
1778 i = 0;
1779
1780 /* rcu_read_lock is hold by caller */
1781 hlist_for_each_entry_rcu(li, &l->list, hlist) {
1782 if (i < s_i) {
1783 i++;
1784 continue;
1785 }
1786
1787 if (i > s_i)
1788 cb->args[5] = 0;
1789
1790 if (list_empty(&li->falh))
1791 continue;
1792
1793 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1794 cb->args[4] = i; 1911 cb->args[4] = i;
1795 return -1; 1912 return -1;
1796 } 1913 }
@@ -1801,44 +1918,38 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1801 return skb->len; 1918 return skb->len;
1802} 1919}
1803 1920
1921/* rcu_read_lock needs to be held by the caller from the read side */
1804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, 1922int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1805 struct netlink_callback *cb) 1923 struct netlink_callback *cb)
1806{ 1924{
1807 struct tnode *l; 1925 struct trie *t = (struct trie *)tb->tb_data;
1808 struct trie *t = (struct trie *) tb->tb_data; 1926 struct key_vector *l, *tp = t->kv;
1809 t_key key = cb->args[2];
1810 int count = cb->args[3];
1811
1812 rcu_read_lock();
1813 /* Dump starting at last key. 1927 /* Dump starting at last key.
1814 * Note: 0.0.0.0/0 (ie default) is first key. 1928 * Note: 0.0.0.0/0 (ie default) is first key.
1815 */ 1929 */
1816 if (count == 0) 1930 int count = cb->args[2];
1817 l = trie_firstleaf(t); 1931 t_key key = cb->args[3];
1818 else {
1819 /* Normally, continue from last key, but if that is missing
1820 * fallback to using slow rescan
1821 */
1822 l = fib_find_node(t, key);
1823 if (!l)
1824 l = trie_leafindex(t, count);
1825 }
1826 1932
1827 while (l) { 1933 while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
1828 cb->args[2] = l->key;
1829 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { 1934 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1830 cb->args[3] = count; 1935 cb->args[3] = key;
1831 rcu_read_unlock(); 1936 cb->args[2] = count;
1832 return -1; 1937 return -1;
1833 } 1938 }
1834 1939
1835 ++count; 1940 ++count;
1836 l = trie_nextleaf(l); 1941 key = l->key + 1;
1942
1837 memset(&cb->args[4], 0, 1943 memset(&cb->args[4], 0,
1838 sizeof(cb->args) - 4*sizeof(cb->args[0])); 1944 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1945
1946 /* stop loop if key wrapped back to 0 */
1947 if (key < l->key)
1948 break;
1839 } 1949 }
1840 cb->args[3] = count; 1950
1841 rcu_read_unlock(); 1951 cb->args[3] = key;
1952 cb->args[2] = count;
1842 1953
1843 return skb->len; 1954 return skb->len;
1844} 1955}
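
fib_table_dump() above now resumes from a saved key (cb->args[3]) rather than re-walking a leaf count, so a netlink dump interrupted by a full skb restarts with one descent instead of an O(count) rescan. The resume pattern in miniature, with a toy state struct in place of netlink_callback:

/* Sketch of the resumable-dump pattern in fib_table_dump(): a bounded
 * "skb" fills up, the next key is stashed, and a later call resumes
 * exactly where the previous one stopped. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static const uint32_t leaves[] = { 1, 5, 9, 0xFFFFFFFF };

struct cb_state { uint32_t next_key; int done; };

static void dump(struct cb_state *cb, size_t budget)
{
        uint32_t key = cb->next_key;

        for (size_t i = 0; i < sizeof(leaves) / sizeof(leaves[0]); i++) {
                if (leaves[i] < key)
                        continue;
                if (!budget--) {            /* "skb full": save resume key */
                        cb->next_key = leaves[i];
                        return;
                }
                printf("dump %08x\n", leaves[i]);
                key = leaves[i] + 1;
                if (key < leaves[i])        /* key wrapped: all done */
                        break;
        }
        cb->done = 1;
}

int main(void)
{
        struct cb_state cb = { 0, 0 };

        while (!cb.done)
                dump(&cb, 2);               /* two entries per "message" */
        return 0;
}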
@@ -1850,28 +1961,34 @@ void __init fib_trie_init(void)
1850 0, SLAB_PANIC, NULL); 1961 0, SLAB_PANIC, NULL);
1851 1962
1852 trie_leaf_kmem = kmem_cache_create("ip_fib_trie", 1963 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1853 max(sizeof(struct tnode), 1964 LEAF_SIZE,
1854 sizeof(struct leaf_info)),
1855 0, SLAB_PANIC, NULL); 1965 0, SLAB_PANIC, NULL);
1856} 1966}
1857 1967
1858 1968struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
1859struct fib_table *fib_trie_table(u32 id)
1860{ 1969{
1861 struct fib_table *tb; 1970 struct fib_table *tb;
1862 struct trie *t; 1971 struct trie *t;
1972 size_t sz = sizeof(*tb);
1973
1974 if (!alias)
1975 sz += sizeof(struct trie);
1863 1976
1864 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), 1977 tb = kzalloc(sz, GFP_KERNEL);
1865 GFP_KERNEL); 1978 if (!tb)
1866 if (tb == NULL)
1867 return NULL; 1979 return NULL;
1868 1980
1869 tb->tb_id = id; 1981 tb->tb_id = id;
1870 tb->tb_default = -1; 1982 tb->tb_default = -1;
1871 tb->tb_num_default = 0; 1983 tb->tb_num_default = 0;
1984 tb->tb_data = (alias ? alias->__data : tb->__data);
1985
1986 if (alias)
1987 return tb;
1872 1988
1873 t = (struct trie *) tb->tb_data; 1989 t = (struct trie *) tb->tb_data;
1874 RCU_INIT_POINTER(t->trie, NULL); 1990 t->kv[0].pos = KEYLENGTH;
1991 t->kv[0].slen = KEYLENGTH;
1875#ifdef CONFIG_IP_FIB_TRIE_STATS 1992#ifdef CONFIG_IP_FIB_TRIE_STATS
1876 t->stats = alloc_percpu(struct trie_use_stats); 1993 t->stats = alloc_percpu(struct trie_use_stats);
1877 if (!t->stats) { 1994 if (!t->stats) {
@@ -1888,65 +2005,63 @@ struct fib_table *fib_trie_table(u32 id)
1888struct fib_trie_iter { 2005struct fib_trie_iter {
1889 struct seq_net_private p; 2006 struct seq_net_private p;
1890 struct fib_table *tb; 2007 struct fib_table *tb;
1891 struct tnode *tnode; 2008 struct key_vector *tnode;
1892 unsigned int index; 2009 unsigned int index;
1893 unsigned int depth; 2010 unsigned int depth;
1894}; 2011};
1895 2012
1896static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) 2013static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
1897{ 2014{
1898 unsigned long cindex = iter->index; 2015 unsigned long cindex = iter->index;
1899 struct tnode *tn = iter->tnode; 2016 struct key_vector *pn = iter->tnode;
1900 struct tnode *p; 2017 t_key pkey;
1901
1902 /* A single entry routing table */
1903 if (!tn)
1904 return NULL;
1905 2018
1906 pr_debug("get_next iter={node=%p index=%d depth=%d}\n", 2019 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1907 iter->tnode, iter->index, iter->depth); 2020 iter->tnode, iter->index, iter->depth);
1908rescan:
1909 while (cindex < tnode_child_length(tn)) {
1910 struct tnode *n = tnode_get_child_rcu(tn, cindex);
1911 2021
1912 if (n) { 2022 while (!IS_TRIE(pn)) {
2023 while (cindex < child_length(pn)) {
2024 struct key_vector *n = get_child_rcu(pn, cindex++);
2025
2026 if (!n)
2027 continue;
2028
1913 if (IS_LEAF(n)) { 2029 if (IS_LEAF(n)) {
1914 iter->tnode = tn; 2030 iter->tnode = pn;
1915 iter->index = cindex + 1; 2031 iter->index = cindex;
1916 } else { 2032 } else {
1917 /* push down one level */ 2033 /* push down one level */
1918 iter->tnode = n; 2034 iter->tnode = n;
1919 iter->index = 0; 2035 iter->index = 0;
1920 ++iter->depth; 2036 ++iter->depth;
1921 } 2037 }
2038
1922 return n; 2039 return n;
1923 } 2040 }
1924 2041
1925 ++cindex; 2042 /* Current node exhausted, pop back up */
1926 } 2043 pkey = pn->key;
1927 2044 pn = node_parent_rcu(pn);
1928 /* Current node exhausted, pop back up */ 2045 cindex = get_index(pkey, pn) + 1;
1929 p = node_parent_rcu(tn);
1930 if (p) {
1931 cindex = get_index(tn->key, p) + 1;
1932 tn = p;
1933 --iter->depth; 2046 --iter->depth;
1934 goto rescan;
1935 } 2047 }
1936 2048
1937 /* got root? */ 2049 /* record root node so further searches know we are done */
2050 iter->tnode = pn;
2051 iter->index = 0;
2052
1938 return NULL; 2053 return NULL;
1939} 2054}
1940 2055
1941static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, 2056static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
1942 struct trie *t) 2057 struct trie *t)
1943{ 2058{
1944 struct tnode *n; 2059 struct key_vector *n, *pn = t->kv;
1945 2060
1946 if (!t) 2061 if (!t)
1947 return NULL; 2062 return NULL;
1948 2063
1949 n = rcu_dereference(t->trie); 2064 n = rcu_dereference(pn->tnode[0]);
1950 if (!n) 2065 if (!n)
1951 return NULL; 2066 return NULL;
1952 2067
@@ -1955,7 +2070,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1955 iter->index = 0; 2070 iter->index = 0;
1956 iter->depth = 1; 2071 iter->depth = 1;
1957 } else { 2072 } else {
1958 iter->tnode = NULL; 2073 iter->tnode = pn;
1959 iter->index = 0; 2074 iter->index = 0;
1960 iter->depth = 0; 2075 iter->depth = 0;
1961 } 2076 }
@@ -1965,7 +2080,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1965 2080
1966static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2081static void trie_collect_stats(struct trie *t, struct trie_stat *s)
1967{ 2082{
1968 struct tnode *n; 2083 struct key_vector *n;
1969 struct fib_trie_iter iter; 2084 struct fib_trie_iter iter;
1970 2085
1971 memset(s, 0, sizeof(*s)); 2086 memset(s, 0, sizeof(*s));
@@ -1973,20 +2088,20 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
1973 rcu_read_lock(); 2088 rcu_read_lock();
1974 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { 2089 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1975 if (IS_LEAF(n)) { 2090 if (IS_LEAF(n)) {
1976 struct leaf_info *li; 2091 struct fib_alias *fa;
1977 2092
1978 s->leaves++; 2093 s->leaves++;
1979 s->totdepth += iter.depth; 2094 s->totdepth += iter.depth;
1980 if (iter.depth > s->maxdepth) 2095 if (iter.depth > s->maxdepth)
1981 s->maxdepth = iter.depth; 2096 s->maxdepth = iter.depth;
1982 2097
1983 hlist_for_each_entry_rcu(li, &n->list, hlist) 2098 hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
1984 ++s->prefixes; 2099 ++s->prefixes;
1985 } else { 2100 } else {
1986 s->tnodes++; 2101 s->tnodes++;
1987 if (n->bits < MAX_STAT_DEPTH) 2102 if (n->bits < MAX_STAT_DEPTH)
1988 s->nodesizes[n->bits]++; 2103 s->nodesizes[n->bits]++;
1989 s->nullpointers += n->empty_children; 2104 s->nullpointers += tn_info(n)->empty_children;
1990 } 2105 }
1991 } 2106 }
1992 rcu_read_unlock(); 2107 rcu_read_unlock();
@@ -2009,13 +2124,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2009 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); 2124 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2010 2125
2011 seq_printf(seq, "\tLeaves: %u\n", stat->leaves); 2126 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2012 bytes = sizeof(struct tnode) * stat->leaves; 2127 bytes = LEAF_SIZE * stat->leaves;
2013 2128
2014 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); 2129 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2015 bytes += sizeof(struct leaf_info) * stat->prefixes; 2130 bytes += sizeof(struct fib_alias) * stat->prefixes;
2016 2131
2017 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); 2132 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2018 bytes += sizeof(struct tnode) * stat->tnodes; 2133 bytes += TNODE_SIZE(0) * stat->tnodes;
2019 2134
2020 max = MAX_STAT_DEPTH; 2135 max = MAX_STAT_DEPTH;
2021 while (max > 0 && stat->nodesizes[max-1] == 0) 2136 while (max > 0 && stat->nodesizes[max-1] == 0)
@@ -2030,7 +2145,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2030 seq_putc(seq, '\n'); 2145 seq_putc(seq, '\n');
2031 seq_printf(seq, "\tPointers: %u\n", pointers); 2146 seq_printf(seq, "\tPointers: %u\n", pointers);
2032 2147
2033 bytes += sizeof(struct tnode *) * pointers; 2148 bytes += sizeof(struct key_vector *) * pointers;
2034 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2149 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2035 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2150 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2036} 2151}
@@ -2084,7 +2199,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2084 seq_printf(seq, 2199 seq_printf(seq,
2085 "Basic info: size of leaf:" 2200 "Basic info: size of leaf:"
2086 " %Zd bytes, size of tnode: %Zd bytes.\n", 2201 " %Zd bytes, size of tnode: %Zd bytes.\n",
2087 sizeof(struct tnode), sizeof(struct tnode)); 2202 LEAF_SIZE, TNODE_SIZE(0));
2088 2203
2089 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 2204 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2090 struct hlist_head *head = &net->ipv4.fib_table_hash[h]; 2205 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2123,7 +2238,7 @@ static const struct file_operations fib_triestat_fops = {
2123 .release = single_release_net, 2238 .release = single_release_net,
2124}; 2239};
2125 2240
2126static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2241static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2127{ 2242{
2128 struct fib_trie_iter *iter = seq->private; 2243 struct fib_trie_iter *iter = seq->private;
2129 struct net *net = seq_file_net(seq); 2244 struct net *net = seq_file_net(seq);
@@ -2135,7 +2250,7 @@ static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2135 struct fib_table *tb; 2250 struct fib_table *tb;
2136 2251
2137 hlist_for_each_entry_rcu(tb, head, tb_hlist) { 2252 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2138 struct tnode *n; 2253 struct key_vector *n;
2139 2254
2140 for (n = fib_trie_get_first(iter, 2255 for (n = fib_trie_get_first(iter,
2141 (struct trie *) tb->tb_data); 2256 (struct trie *) tb->tb_data);
@@ -2164,7 +2279,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2164 struct fib_table *tb = iter->tb; 2279 struct fib_table *tb = iter->tb;
2165 struct hlist_node *tb_node; 2280 struct hlist_node *tb_node;
2166 unsigned int h; 2281 unsigned int h;
2167 struct tnode *n; 2282 struct key_vector *n;
2168 2283
2169 ++*pos; 2284 ++*pos;
2170 /* next node in same table */ 2285 /* next node in same table */
@@ -2250,9 +2365,9 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2250static int fib_trie_seq_show(struct seq_file *seq, void *v) 2365static int fib_trie_seq_show(struct seq_file *seq, void *v)
2251{ 2366{
2252 const struct fib_trie_iter *iter = seq->private; 2367 const struct fib_trie_iter *iter = seq->private;
2253 struct tnode *n = v; 2368 struct key_vector *n = v;
2254 2369
2255 if (!node_parent_rcu(n)) 2370 if (IS_TRIE(node_parent_rcu(n)))
2256 fib_table_print(seq, iter->tb); 2371 fib_table_print(seq, iter->tb);
2257 2372
2258 if (IS_TNODE(n)) { 2373 if (IS_TNODE(n)) {
@@ -2261,30 +2376,28 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2261 seq_indent(seq, iter->depth-1); 2376 seq_indent(seq, iter->depth-1);
2262 seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", 2377 seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
2263 &prf, KEYLENGTH - n->pos - n->bits, n->bits, 2378 &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2264 n->full_children, n->empty_children); 2379 tn_info(n)->full_children,
2380 tn_info(n)->empty_children);
2265 } else { 2381 } else {
2266 struct leaf_info *li;
2267 __be32 val = htonl(n->key); 2382 __be32 val = htonl(n->key);
2383 struct fib_alias *fa;
2268 2384
2269 seq_indent(seq, iter->depth); 2385 seq_indent(seq, iter->depth);
2270 seq_printf(seq, " |-- %pI4\n", &val); 2386 seq_printf(seq, " |-- %pI4\n", &val);
2271 2387
2272 hlist_for_each_entry_rcu(li, &n->list, hlist) { 2388 hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
2273 struct fib_alias *fa; 2389 char buf1[32], buf2[32];
2274 2390
2275 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2391 seq_indent(seq, iter->depth + 1);
2276 char buf1[32], buf2[32]; 2392 seq_printf(seq, " /%zu %s %s",
2277 2393 KEYLENGTH - fa->fa_slen,
2278 seq_indent(seq, iter->depth+1); 2394 rtn_scope(buf1, sizeof(buf1),
2279 seq_printf(seq, " /%d %s %s", li->plen, 2395 fa->fa_info->fib_scope),
2280 rtn_scope(buf1, sizeof(buf1), 2396 rtn_type(buf2, sizeof(buf2),
2281 fa->fa_info->fib_scope), 2397 fa->fa_type));
2282 rtn_type(buf2, sizeof(buf2), 2398 if (fa->fa_tos)
2283 fa->fa_type)); 2399 seq_printf(seq, " tos=%d", fa->fa_tos);
2284 if (fa->fa_tos) 2400 seq_putc(seq, '\n');
2285 seq_printf(seq, " tos=%d", fa->fa_tos);
2286 seq_putc(seq, '\n');
2287 }
2288 } 2401 }
2289 } 2402 }
2290 2403
@@ -2314,31 +2427,47 @@ static const struct file_operations fib_trie_fops = {
2314 2427
2315struct fib_route_iter { 2428struct fib_route_iter {
2316 struct seq_net_private p; 2429 struct seq_net_private p;
2317 struct trie *main_trie; 2430 struct fib_table *main_tb;
2431 struct key_vector *tnode;
2318 loff_t pos; 2432 loff_t pos;
2319 t_key key; 2433 t_key key;
2320}; 2434};
2321 2435
2322static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) 2436static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
2437 loff_t pos)
2323{ 2438{
2324 struct tnode *l = NULL; 2439 struct fib_table *tb = iter->main_tb;
2325 struct trie *t = iter->main_trie; 2440 struct key_vector *l, **tp = &iter->tnode;
2441 struct trie *t;
2442 t_key key;
2326 2443
2327 /* use cache location of last found key */ 2444 /* use cache location of next-to-find key */
2328 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key))) 2445 if (iter->pos > 0 && pos >= iter->pos) {
2329 pos -= iter->pos; 2446 pos -= iter->pos;
2330 else { 2447 key = iter->key;
2448 } else {
2449 t = (struct trie *)tb->tb_data;
2450 iter->tnode = t->kv;
2331 iter->pos = 0; 2451 iter->pos = 0;
2332 l = trie_firstleaf(t); 2452 key = 0;
2333 } 2453 }
2334 2454
2335 while (l && pos-- > 0) { 2455 while ((l = leaf_walk_rcu(tp, key)) != NULL) {
2456 key = l->key + 1;
2336 iter->pos++; 2457 iter->pos++;
2337 l = trie_nextleaf(l); 2458
2459 if (pos-- <= 0)
2460 break;
2461
2462 l = NULL;
2463
2464 /* handle unlikely case of a key wrap */
2465 if (!key)
2466 break;
2338 } 2467 }
2339 2468
2340 if (l) 2469 if (l)
2341 iter->key = pos; /* remember it */ 2470 iter->key = key; /* remember it */
2342 else 2471 else
2343 iter->pos = 0; /* forget it */ 2472 iter->pos = 0; /* forget it */
2344 2473
@@ -2350,37 +2479,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2350{ 2479{
2351 struct fib_route_iter *iter = seq->private; 2480 struct fib_route_iter *iter = seq->private;
2352 struct fib_table *tb; 2481 struct fib_table *tb;
2482 struct trie *t;
2353 2483
2354 rcu_read_lock(); 2484 rcu_read_lock();
2485
2355 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); 2486 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2356 if (!tb) 2487 if (!tb)
2357 return NULL; 2488 return NULL;
2358 2489
2359 iter->main_trie = (struct trie *) tb->tb_data; 2490 iter->main_tb = tb;
2360 if (*pos == 0) 2491
2361 return SEQ_START_TOKEN; 2492 if (*pos != 0)
2362 else 2493 return fib_route_get_idx(iter, *pos);
2363 return fib_route_get_idx(iter, *pos - 1); 2494
2495 t = (struct trie *)tb->tb_data;
2496 iter->tnode = t->kv;
2497 iter->pos = 0;
2498 iter->key = 0;
2499
2500 return SEQ_START_TOKEN;
2364} 2501}
2365 2502
2366static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2503static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2367{ 2504{
2368 struct fib_route_iter *iter = seq->private; 2505 struct fib_route_iter *iter = seq->private;
2369 struct tnode *l = v; 2506 struct key_vector *l = NULL;
2507 t_key key = iter->key;
2370 2508
2371 ++*pos; 2509 ++*pos;
2372 if (v == SEQ_START_TOKEN) { 2510
2373 iter->pos = 0; 2511 /* only allow key of 0 for start of sequence */
2374 l = trie_firstleaf(iter->main_trie); 2512 if ((v == SEQ_START_TOKEN) || key)
2375 } else { 2513 l = leaf_walk_rcu(&iter->tnode, key);
2514
2515 if (l) {
2516 iter->key = l->key + 1;
2376 iter->pos++; 2517 iter->pos++;
2377 l = trie_nextleaf(l); 2518 } else {
2519 iter->pos = 0;
2378 } 2520 }
2379 2521
2380 if (l)
2381 iter->key = l->key;
2382 else
2383 iter->pos = 0;
2384 return l; 2522 return l;
2385} 2523}
2386 2524
@@ -2412,8 +2550,11 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
2412 */ 2550 */
2413static int fib_route_seq_show(struct seq_file *seq, void *v) 2551static int fib_route_seq_show(struct seq_file *seq, void *v)
2414{ 2552{
2415 struct tnode *l = v; 2553 struct fib_route_iter *iter = seq->private;
2416 struct leaf_info *li; 2554 struct fib_table *tb = iter->main_tb;
2555 struct fib_alias *fa;
2556 struct key_vector *l = v;
2557 __be32 prefix;
2417 2558
2418 if (v == SEQ_START_TOKEN) { 2559 if (v == SEQ_START_TOKEN) {
2419 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " 2560 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2422,45 +2563,43 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
2422 return 0; 2563 return 0;
2423 } 2564 }
2424 2565
2425 hlist_for_each_entry_rcu(li, &l->list, hlist) { 2566 prefix = htonl(l->key);
2426 struct fib_alias *fa;
2427 __be32 mask, prefix;
2428 2567
2429 mask = inet_make_mask(li->plen); 2568 hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2430 prefix = htonl(l->key); 2569 const struct fib_info *fi = fa->fa_info;
2570 __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
2571 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2431 2572
2432 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2573 if ((fa->fa_type == RTN_BROADCAST) ||
2433 const struct fib_info *fi = fa->fa_info; 2574 (fa->fa_type == RTN_MULTICAST))
2434 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); 2575 continue;
2435 2576
2436 if (fa->fa_type == RTN_BROADCAST 2577 if (fa->tb_id != tb->tb_id)
2437 || fa->fa_type == RTN_MULTICAST) 2578 continue;
2438 continue;
2439 2579
2440 seq_setwidth(seq, 127); 2580 seq_setwidth(seq, 127);
2441 2581
2442 if (fi) 2582 if (fi)
2443 seq_printf(seq, 2583 seq_printf(seq,
2444 "%s\t%08X\t%08X\t%04X\t%d\t%u\t" 2584 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2445 "%d\t%08X\t%d\t%u\t%u", 2585 "%d\t%08X\t%d\t%u\t%u",
2446 fi->fib_dev ? fi->fib_dev->name : "*", 2586 fi->fib_dev ? fi->fib_dev->name : "*",
2447 prefix, 2587 prefix,
2448 fi->fib_nh->nh_gw, flags, 0, 0, 2588 fi->fib_nh->nh_gw, flags, 0, 0,
2449 fi->fib_priority, 2589 fi->fib_priority,
2450 mask, 2590 mask,
2451 (fi->fib_advmss ? 2591 (fi->fib_advmss ?
2452 fi->fib_advmss + 40 : 0), 2592 fi->fib_advmss + 40 : 0),
2453 fi->fib_window, 2593 fi->fib_window,
2454 fi->fib_rtt >> 3); 2594 fi->fib_rtt >> 3);
2455 else 2595 else
2456 seq_printf(seq, 2596 seq_printf(seq,
2457 "*\t%08X\t%08X\t%04X\t%d\t%u\t" 2597 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2458 "%d\t%08X\t%d\t%u\t%u", 2598 "%d\t%08X\t%d\t%u\t%u",
2459 prefix, 0, flags, 0, 0, 0, 2599 prefix, 0, flags, 0, 0, 0,
2460 mask, 0, 0, 0); 2600 mask, 0, 0, 0);
2461 2601
2462 seq_pad(seq, '\n'); 2602 seq_pad(seq, '\n');
2463 }
2464 } 2603 }
2465 2604
2466 return 0; 2605 return 0;
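
The rewritten iterator in the hunks above is worth pausing on: fib_route_seq_next() no longer follows a saved leaf pointer with trie_nextleaf(); it records l->key + 1, and the next call restarts leaf_walk_rcu() from that key (key 0 being reserved for the start of the sequence, per the comment in the hunk). Resuming by key rather than by pointer keeps the cursor meaningful even if the trie changed between reads. A minimal userspace sketch of the resume-by-key idea, with purely illustrative names:

#include <stdio.h>
#include <stddef.h>

/* stand-in for the trie: an ordered set of keys */
static const unsigned int keys[] = { 1, 4, 9, 16, 25 };

/* like leaf_walk_rcu(): first entry with key >= from, or NULL when done */
static const unsigned int *walk_from(unsigned int from)
{
        size_t i;

        for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
                if (keys[i] >= from)
                        return &keys[i];
        return NULL;
}

int main(void)
{
        unsigned int next_key = 0;              /* plays the role of iter->key */
        const unsigned int *k;

        while ((k = walk_from(next_key)) != NULL) {
                printf("visit %u\n", *k);
                next_key = *k + 1;              /* resume point, not a pointer */
        }
        return 0;
}
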
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index ff069f6597ac..34968cd5c146 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -16,14 +16,12 @@
16#include <uapi/linux/fou.h> 16#include <uapi/linux/fou.h>
17#include <uapi/linux/genetlink.h> 17#include <uapi/linux/genetlink.h>
18 18
19static DEFINE_SPINLOCK(fou_lock);
20static LIST_HEAD(fou_list);
21
22struct fou { 19struct fou {
23 struct socket *sock; 20 struct socket *sock;
24 u8 protocol; 21 u8 protocol;
25 u8 flags; 22 u8 flags;
26 u16 port; 23 __be16 port;
24 u16 type;
27 struct udp_offload udp_offloads; 25 struct udp_offload udp_offloads;
28 struct list_head list; 26 struct list_head list;
29}; 27};
@@ -37,6 +35,13 @@ struct fou_cfg {
37 struct udp_port_cfg udp_config; 35 struct udp_port_cfg udp_config;
38}; 36};
39 37
38static unsigned int fou_net_id;
39
40struct fou_net {
41 struct list_head fou_list;
42 struct mutex fou_lock;
43};
44
40static inline struct fou *fou_from_sock(struct sock *sk) 45static inline struct fou *fou_from_sock(struct sock *sk)
41{ 46{
42 return sk->sk_user_data; 47 return sk->sk_user_data;
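
The fou_net_id/struct fou_net pair introduced above is the stock pernet pattern: each namespace carries a slot array, and a module retrieves its private state with net_generic(net, fou_net_id), as fou_add_to_port_list() does further down. A rough userspace analogue of that id-keyed lookup (the names and the fixed slot array are illustrative, not the kernel's implementation):

#include <stdio.h>
#include <stdlib.h>

#define MAX_SLOTS 8

struct fake_net {
        void *gen[MAX_SLOTS];                   /* like net->gen->ptr[] */
};

static int next_id;

static int register_pernet_id(void)            /* hands out fou_net_id */
{
        return next_id++;
}

static void *net_generic_slot(struct fake_net *net, int id)
{
        return net->gen[id];
}

int main(void)
{
        struct fake_net net = { { 0 } };
        int fou_net_id = register_pernet_id();

        /* per-namespace "struct fou_net", owned by this net only */
        net.gen[fou_net_id] = malloc(16);
        printf("per-net state: %p\n", net_generic_slot(&net, fou_net_id));
        free(net.gen[fou_net_id]);
        return 0;
}
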
@@ -387,20 +392,21 @@ out_unlock:
387 return err; 392 return err;
388} 393}
389 394
390static int fou_add_to_port_list(struct fou *fou) 395static int fou_add_to_port_list(struct net *net, struct fou *fou)
391{ 396{
397 struct fou_net *fn = net_generic(net, fou_net_id);
392 struct fou *fout; 398 struct fou *fout;
393 399
394 spin_lock(&fou_lock); 400 mutex_lock(&fn->fou_lock);
395 list_for_each_entry(fout, &fou_list, list) { 401 list_for_each_entry(fout, &fn->fou_list, list) {
396 if (fou->port == fout->port) { 402 if (fou->port == fout->port) {
397 spin_unlock(&fou_lock); 403 mutex_unlock(&fn->fou_lock);
398 return -EALREADY; 404 return -EALREADY;
399 } 405 }
400 } 406 }
401 407
402 list_add(&fou->list, &fou_list); 408 list_add(&fou->list, &fn->fou_list);
403 spin_unlock(&fou_lock); 409 mutex_unlock(&fn->fou_lock);
404 410
405 return 0; 411 return 0;
406} 412}
@@ -410,14 +416,10 @@ static void fou_release(struct fou *fou)
410 struct socket *sock = fou->sock; 416 struct socket *sock = fou->sock;
411 struct sock *sk = sock->sk; 417 struct sock *sk = sock->sk;
412 418
413 udp_del_offload(&fou->udp_offloads); 419 if (sk->sk_family == AF_INET)
414 420 udp_del_offload(&fou->udp_offloads);
415 list_del(&fou->list); 421 list_del(&fou->list);
416 422 udp_tunnel_sock_release(sock);
417 /* Remove hooks into tunnel socket */
418 sk->sk_user_data = NULL;
419
420 sock_release(sock);
421 423
422 kfree(fou); 424 kfree(fou);
423} 425}
@@ -447,10 +449,10 @@ static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
447static int fou_create(struct net *net, struct fou_cfg *cfg, 449static int fou_create(struct net *net, struct fou_cfg *cfg,
448 struct socket **sockp) 450 struct socket **sockp)
449{ 451{
450 struct fou *fou = NULL;
451 int err;
452 struct socket *sock = NULL; 452 struct socket *sock = NULL;
453 struct fou *fou = NULL;
453 struct sock *sk; 454 struct sock *sk;
455 int err;
454 456
455 /* Open UDP socket */ 457 /* Open UDP socket */
456 err = udp_sock_create(net, &cfg->udp_config, &sock); 458 err = udp_sock_create(net, &cfg->udp_config, &sock);
@@ -486,6 +488,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
486 goto error; 488 goto error;
487 } 489 }
488 490
491 fou->type = cfg->type;
492
489 udp_sk(sk)->encap_type = 1; 493 udp_sk(sk)->encap_type = 1;
490 udp_encap_enable(); 494 udp_encap_enable();
491 495
@@ -502,7 +506,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
502 goto error; 506 goto error;
503 } 507 }
504 508
505 err = fou_add_to_port_list(fou); 509 err = fou_add_to_port_list(net, fou);
506 if (err) 510 if (err)
507 goto error; 511 goto error;
508 512
@@ -514,27 +518,27 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
514error: 518error:
515 kfree(fou); 519 kfree(fou);
516 if (sock) 520 if (sock)
517 sock_release(sock); 521 udp_tunnel_sock_release(sock);
518 522
519 return err; 523 return err;
520} 524}
521 525
522static int fou_destroy(struct net *net, struct fou_cfg *cfg) 526static int fou_destroy(struct net *net, struct fou_cfg *cfg)
523{ 527{
524 struct fou *fou; 528 struct fou_net *fn = net_generic(net, fou_net_id);
525 u16 port = cfg->udp_config.local_udp_port; 529 __be16 port = cfg->udp_config.local_udp_port;
526 int err = -EINVAL; 530 int err = -EINVAL;
531 struct fou *fou;
527 532
528 spin_lock(&fou_lock); 533 mutex_lock(&fn->fou_lock);
529 list_for_each_entry(fou, &fou_list, list) { 534 list_for_each_entry(fou, &fn->fou_list, list) {
530 if (fou->port == port) { 535 if (fou->port == port) {
531 udp_del_offload(&fou->udp_offloads);
532 fou_release(fou); 536 fou_release(fou);
533 err = 0; 537 err = 0;
534 break; 538 break;
535 } 539 }
536 } 540 }
537 spin_unlock(&fou_lock); 541 mutex_unlock(&fn->fou_lock);
538 542
539 return err; 543 return err;
540} 544}
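
Note that the list lock changes type here, not just scope: the global spinlock becomes a per-net mutex, and fou_release() is now called with it held. Since fou_release() ends in udp_tunnel_sock_release(), which can sleep, a spinlock would no longer be legal around it; that rationale is an inference from the hunk, not stated in it. The remove-and-release-under-mutex shape, sketched in userspace with illustrative names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
        unsigned short port;
        struct entry *next;
};

static struct entry *head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* like fou_destroy(): unlink and release while still holding the lock */
static int destroy_port(unsigned short port)
{
        struct entry **pp, *e;
        int err = -1;

        pthread_mutex_lock(&list_lock);
        for (pp = &head; (e = *pp) != NULL; pp = &e->next) {
                if (e->port == port) {
                        *pp = e->next;
                        free(e);        /* stand-in for a release that may sleep */
                        err = 0;
                        break;
                }
        }
        pthread_mutex_unlock(&list_lock);
        return err;
}

int main(void)
{
        struct entry *e = malloc(sizeof(*e));

        e->port = 5555;
        e->next = NULL;
        head = e;
        printf("destroy: %d\n", destroy_port(5555));
        return 0;
}
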
@@ -573,7 +577,7 @@ static int parse_nl_config(struct genl_info *info,
573 } 577 }
574 578
575 if (info->attrs[FOU_ATTR_PORT]) { 579 if (info->attrs[FOU_ATTR_PORT]) {
576 u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]); 580 __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
577 581
578 cfg->udp_config.local_udp_port = port; 582 cfg->udp_config.local_udp_port = port;
579 } 583 }
@@ -592,6 +596,7 @@ static int parse_nl_config(struct genl_info *info,
592 596
593static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) 597static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
594{ 598{
599 struct net *net = genl_info_net(info);
595 struct fou_cfg cfg; 600 struct fou_cfg cfg;
596 int err; 601 int err;
597 602
@@ -599,16 +604,119 @@ static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
599 if (err) 604 if (err)
600 return err; 605 return err;
601 606
602 return fou_create(&init_net, &cfg, NULL); 607 return fou_create(net, &cfg, NULL);
603} 608}
604 609
605static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) 610static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
606{ 611{
612 struct net *net = genl_info_net(info);
607 struct fou_cfg cfg; 613 struct fou_cfg cfg;
614 int err;
608 615
609 parse_nl_config(info, &cfg); 616 err = parse_nl_config(info, &cfg);
617 if (err)
618 return err;
610 619
611 return fou_destroy(&init_net, &cfg); 620 return fou_destroy(net, &cfg);
621}
622
623static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
624{
625 if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) ||
626 nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
627 nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
628 nla_put_u8(msg, FOU_ATTR_TYPE, fou->type))
629 return -1;
630
631 if (fou->flags & FOU_F_REMCSUM_NOPARTIAL)
632 if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
633 return -1;
634 return 0;
635}
636
637static int fou_dump_info(struct fou *fou, u32 portid, u32 seq,
638 u32 flags, struct sk_buff *skb, u8 cmd)
639{
640 void *hdr;
641
642 hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd);
643 if (!hdr)
644 return -ENOMEM;
645
646 if (fou_fill_info(fou, skb) < 0)
647 goto nla_put_failure;
648
649 genlmsg_end(skb, hdr);
650 return 0;
651
652nla_put_failure:
653 genlmsg_cancel(skb, hdr);
654 return -EMSGSIZE;
655}
656
657static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
658{
659 struct net *net = genl_info_net(info);
660 struct fou_net *fn = net_generic(net, fou_net_id);
661 struct sk_buff *msg;
662 struct fou_cfg cfg;
663 struct fou *fout;
664 __be16 port;
665 int ret;
666
667 ret = parse_nl_config(info, &cfg);
668 if (ret)
669 return ret;
670 port = cfg.udp_config.local_udp_port;
671 if (port == 0)
672 return -EINVAL;
673
674 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
675 if (!msg)
676 return -ENOMEM;
677
678 ret = -ESRCH;
679 mutex_lock(&fn->fou_lock);
680 list_for_each_entry(fout, &fn->fou_list, list) {
681 if (port == fout->port) {
682 ret = fou_dump_info(fout, info->snd_portid,
683 info->snd_seq, 0, msg,
684 info->genlhdr->cmd);
685 break;
686 }
687 }
688 mutex_unlock(&fn->fou_lock);
689 if (ret < 0)
690 goto out_free;
691
692 return genlmsg_reply(msg, info);
693
694out_free:
695 nlmsg_free(msg);
696 return ret;
697}
698
699static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
700{
701 struct net *net = sock_net(skb->sk);
702 struct fou_net *fn = net_generic(net, fou_net_id);
703 struct fou *fout;
704 int idx = 0, ret;
705
706 mutex_lock(&fn->fou_lock);
707 list_for_each_entry(fout, &fn->fou_list, list) {
708 if (idx++ < cb->args[0])
709 continue;
710 ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid,
711 cb->nlh->nlmsg_seq, NLM_F_MULTI,
712 skb, FOU_CMD_GET);
713 if (ret)
714 break;
715 }
716 mutex_unlock(&fn->fou_lock);
717
718 cb->args[0] = idx;
719 return skb->len;
612} 720}
613 721
614static const struct genl_ops fou_nl_ops[] = { 722static const struct genl_ops fou_nl_ops[] = {
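
fou_nl_dump() above has the canonical netlink dump shape: skip the idx entries already delivered, emit until fou_dump_info() reports the skb is full, then park the position in cb->args[0] so the next invocation resumes where this one stopped. The same resumable, chunked producer in a self-contained userspace sketch (names illustrative):

#include <stdio.h>

#define NITEMS 7
#define CHUNK  3                        /* stand-in for "skb is full" */

/* like a dumpit callback: emit items from *pos on, remember where we
 * stopped, and report 0 once everything has been sent */
static int dump(int *pos)
{
        int idx, emitted = 0;

        for (idx = 0; idx < NITEMS; idx++) {
                if (idx < *pos)
                        continue;       /* already sent in a prior call */
                printf("item %d\n", idx);
                if (++emitted == CHUNK) {
                        idx++;
                        break;
                }
        }
        *pos = idx;                     /* like cb->args[0] = idx */
        return emitted;
}

int main(void)
{
        int pos = 0;

        while (dump(&pos) > 0)
                puts("-- next netlink message --");
        return 0;
}
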
@@ -624,6 +732,12 @@ static const struct genl_ops fou_nl_ops[] = {
624 .policy = fou_nl_policy, 732 .policy = fou_nl_policy,
625 .flags = GENL_ADMIN_PERM, 733 .flags = GENL_ADMIN_PERM,
626 }, 734 },
735 {
736 .cmd = FOU_CMD_GET,
737 .doit = fou_nl_cmd_get_port,
738 .dumpit = fou_nl_dump,
739 .policy = fou_nl_policy,
740 },
627}; 741};
628 742
629size_t fou_encap_hlen(struct ip_tunnel_encap *e) 743size_t fou_encap_hlen(struct ip_tunnel_encap *e)
@@ -771,12 +885,12 @@ EXPORT_SYMBOL(gue_build_header);
771 885
772#ifdef CONFIG_NET_FOU_IP_TUNNELS 886#ifdef CONFIG_NET_FOU_IP_TUNNELS
773 887
774static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = { 888static const struct ip_tunnel_encap_ops fou_iptun_ops = {
775 .encap_hlen = fou_encap_hlen, 889 .encap_hlen = fou_encap_hlen,
776 .build_header = fou_build_header, 890 .build_header = fou_build_header,
777}; 891};
778 892
779static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = { 893static const struct ip_tunnel_encap_ops gue_iptun_ops = {
780 .encap_hlen = gue_encap_hlen, 894 .encap_hlen = gue_encap_hlen,
781 .build_header = gue_build_header, 895 .build_header = gue_build_header,
782}; 896};
@@ -820,38 +934,63 @@ static void ip_tunnel_encap_del_fou_ops(void)
820 934
821#endif 935#endif
822 936
937static __net_init int fou_init_net(struct net *net)
938{
939 struct fou_net *fn = net_generic(net, fou_net_id);
940
941 INIT_LIST_HEAD(&fn->fou_list);
942 mutex_init(&fn->fou_lock);
943 return 0;
944}
945
946static __net_exit void fou_exit_net(struct net *net)
947{
948 struct fou_net *fn = net_generic(net, fou_net_id);
949 struct fou *fou, *next;
950
951 /* Close all the FOU sockets */
952 mutex_lock(&fn->fou_lock);
953 list_for_each_entry_safe(fou, next, &fn->fou_list, list)
954 fou_release(fou);
955 mutex_unlock(&fn->fou_lock);
956}
957
958static struct pernet_operations fou_net_ops = {
959 .init = fou_init_net,
960 .exit = fou_exit_net,
961 .id = &fou_net_id,
962 .size = sizeof(struct fou_net),
963};
964
823static int __init fou_init(void) 965static int __init fou_init(void)
824{ 966{
825 int ret; 967 int ret;
826 968
969 ret = register_pernet_device(&fou_net_ops);
970 if (ret)
971 goto exit;
972
827 ret = genl_register_family_with_ops(&fou_nl_family, 973 ret = genl_register_family_with_ops(&fou_nl_family,
828 fou_nl_ops); 974 fou_nl_ops);
829
830 if (ret < 0) 975 if (ret < 0)
831 goto exit; 976 goto unregister;
832 977
833 ret = ip_tunnel_encap_add_fou_ops(); 978 ret = ip_tunnel_encap_add_fou_ops();
834 if (ret < 0) 979 if (ret == 0)
835 genl_unregister_family(&fou_nl_family); 980 return 0;
836 981
982 genl_unregister_family(&fou_nl_family);
983unregister:
984 unregister_pernet_device(&fou_net_ops);
837exit: 985exit:
838 return ret; 986 return ret;
839} 987}
840 988
841static void __exit fou_fini(void) 989static void __exit fou_fini(void)
842{ 990{
843 struct fou *fou, *next;
844
845 ip_tunnel_encap_del_fou_ops(); 991 ip_tunnel_encap_del_fou_ops();
846
847 genl_unregister_family(&fou_nl_family); 992 genl_unregister_family(&fou_nl_family);
848 993 unregister_pernet_device(&fou_net_ops);
849 /* Close all the FOU sockets */
850
851 spin_lock(&fou_lock);
852 list_for_each_entry_safe(fou, next, &fou_list, list)
853 fou_release(fou);
854 spin_unlock(&fou_lock);
855} 994}
856 995
857module_init(fou_init); 996module_init(fou_init);
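
The reworked fou_init() above is a textbook layered bring-up: pernet ops first, then the genetlink family, then the encap ops, with the error path unwinding in exactly the reverse order, and fou_fini() tearing down in that same reverse order. The discipline in a compact, runnable sketch (step names are illustrative):

#include <stdio.h>

static int step(const char *name, int fail)
{
        if (fail) {
                printf("%s failed\n", name);
                return -1;
        }
        printf("%s ok\n", name);
        return 0;
}

static void undo(const char *name)
{
        printf("undo %s\n", name);
}

static int module_init_demo(void)
{
        int ret;

        ret = step("register_pernet", 0);
        if (ret)
                goto exit;
        ret = step("register_genl", 0);
        if (ret)
                goto unregister_pernet;
        ret = step("add_encap_ops", 1); /* simulate the last step failing */
        if (ret == 0)
                return 0;

        undo("register_genl");
unregister_pernet:
        undo("register_pernet");
exit:
        return ret;
}

int main(void)
{
        return module_init_demo() ? 1 : 0;
}
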
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 5a4828ba05ad..8986e63f3bda 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -113,10 +113,6 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
113 int min_headroom; 113 int min_headroom;
114 int err; 114 int err;
115 115
116 skb = udp_tunnel_handle_offloads(skb, csum);
117 if (IS_ERR(skb))
118 return PTR_ERR(skb);
119
120 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len 116 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
121 + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) 117 + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
122 + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); 118 + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
@@ -131,12 +127,16 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
131 if (unlikely(!skb)) 127 if (unlikely(!skb))
132 return -ENOMEM; 128 return -ENOMEM;
133 129
130 skb = udp_tunnel_handle_offloads(skb, csum);
131 if (IS_ERR(skb))
132 return PTR_ERR(skb);
133
134 gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); 134 gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
135 geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); 135 geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
136 136
137 skb_set_inner_protocol(skb, htons(ETH_P_TEB)); 137 skb_set_inner_protocol(skb, htons(ETH_P_TEB));
138 138
139 return udp_tunnel_xmit_skb(rt, skb, src, dst, 139 return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
140 tos, ttl, df, src_port, dst_port, xnet, 140 tos, ttl, df, src_port, dst_port, xnet,
141 !csum); 141 !csum);
142} 142}
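
The error handling around udp_tunnel_handle_offloads() above leans on the kernel's pointer-encoded error convention: the function returns either a valid skb or an ERR_PTR() value that IS_ERR() detects and PTR_ERR() decodes back into a -errno. The convention is small enough to restate completely in userspace:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
        return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
        return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
        /* errnos live in the top 4095 values of the address space */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *get_buffer(int fail)       /* illustrative callee */
{
        static char buf[64];

        return fail ? ERR_PTR(-ENOMEM) : (void *)buf;
}

int main(void)
{
        void *p = get_buffer(1);

        if (IS_ERR(p))
                printf("error: %ld\n", PTR_ERR(p));     /* prints -12 */
        return 0;
}
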
@@ -196,7 +196,7 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
196 196
197 rcu_read_lock(); 197 rcu_read_lock();
198 ptype = gro_find_receive_by_type(type); 198 ptype = gro_find_receive_by_type(type);
199 if (ptype == NULL) { 199 if (!ptype) {
200 flush = 1; 200 flush = 1;
201 goto out_unlock; 201 goto out_unlock;
202 } 202 }
@@ -230,7 +230,7 @@ static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
230 230
231 rcu_read_lock(); 231 rcu_read_lock();
232 ptype = gro_find_complete_by_type(type); 232 ptype = gro_find_complete_by_type(type);
233 if (ptype != NULL) 233 if (ptype)
234 err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); 234 err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
235 235
236 rcu_read_unlock(); 236 rcu_read_unlock();
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 51973ddc05a6..5aa46d4b44ef 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -149,7 +149,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
149 149
150 rcu_read_lock(); 150 rcu_read_lock();
151 ptype = gro_find_receive_by_type(type); 151 ptype = gro_find_receive_by_type(type);
152 if (ptype == NULL) 152 if (!ptype)
153 goto out_unlock; 153 goto out_unlock;
154 154
155 grehlen = GRE_HEADER_SECTION; 155 grehlen = GRE_HEADER_SECTION;
@@ -243,7 +243,7 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
243 243
244 rcu_read_lock(); 244 rcu_read_lock();
245 ptype = gro_find_complete_by_type(type); 245 ptype = gro_find_complete_by_type(type);
246 if (ptype != NULL) 246 if (ptype)
247 err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); 247 err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
248 248
249 rcu_read_unlock(); 249 rcu_read_unlock();
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5e564014a0b7..f5203fba6236 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -399,7 +399,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
399 return; 399 return;
400 400
401 sk = icmp_xmit_lock(net); 401 sk = icmp_xmit_lock(net);
402 if (sk == NULL) 402 if (!sk)
403 return; 403 return;
404 inet = inet_sk(sk); 404 inet = inet_sk(sk);
405 405
@@ -609,7 +609,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
609 skb_in->data, 609 skb_in->data,
610 sizeof(_inner_type), 610 sizeof(_inner_type),
611 &_inner_type); 611 &_inner_type);
612 if (itp == NULL) 612 if (!itp)
613 goto out; 613 goto out;
614 614
615 /* 615 /*
@@ -627,7 +627,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
627 return; 627 return;
628 628
629 sk = icmp_xmit_lock(net); 629 sk = icmp_xmit_lock(net);
630 if (sk == NULL) 630 if (!sk)
631 goto out_free; 631 goto out_free;
632 632
633 /* 633 /*
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 666cf364df86..a3a697f5ffba 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -97,6 +97,7 @@
97#include <net/route.h> 97#include <net/route.h>
98#include <net/sock.h> 98#include <net/sock.h>
99#include <net/checksum.h> 99#include <net/checksum.h>
100#include <net/inet_common.h>
100#include <linux/netfilter_ipv4.h> 101#include <linux/netfilter_ipv4.h>
101#ifdef CONFIG_IP_MROUTE 102#ifdef CONFIG_IP_MROUTE
102#include <linux/mroute.h> 103#include <linux/mroute.h>
@@ -369,7 +370,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
369 pip->saddr = fl4.saddr; 370 pip->saddr = fl4.saddr;
370 pip->protocol = IPPROTO_IGMP; 371 pip->protocol = IPPROTO_IGMP;
371 pip->tot_len = 0; /* filled in later */ 372 pip->tot_len = 0; /* filled in later */
372 ip_select_ident(skb, NULL); 373 ip_select_ident(net, skb, NULL);
373 ((u8 *)&pip[1])[0] = IPOPT_RA; 374 ((u8 *)&pip[1])[0] = IPOPT_RA;
374 ((u8 *)&pip[1])[1] = 4; 375 ((u8 *)&pip[1])[1] = 4;
375 ((u8 *)&pip[1])[2] = 0; 376 ((u8 *)&pip[1])[2] = 0;
@@ -691,7 +692,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
691 hlen = LL_RESERVED_SPACE(dev); 692 hlen = LL_RESERVED_SPACE(dev);
692 tlen = dev->needed_tailroom; 693 tlen = dev->needed_tailroom;
693 skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); 694 skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
694 if (skb == NULL) { 695 if (!skb) {
695 ip_rt_put(rt); 696 ip_rt_put(rt);
696 return -1; 697 return -1;
697 } 698 }
@@ -713,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
713 iph->daddr = dst; 714 iph->daddr = dst;
714 iph->saddr = fl4.saddr; 715 iph->saddr = fl4.saddr;
715 iph->protocol = IPPROTO_IGMP; 716 iph->protocol = IPPROTO_IGMP;
716 ip_select_ident(skb, NULL); 717 ip_select_ident(net, skb, NULL);
717 ((u8 *)&iph[1])[0] = IPOPT_RA; 718 ((u8 *)&iph[1])[0] = IPOPT_RA;
718 ((u8 *)&iph[1])[1] = 4; 719 ((u8 *)&iph[1])[1] = 4;
719 ((u8 *)&iph[1])[2] = 0; 720 ((u8 *)&iph[1])[2] = 0;
@@ -980,7 +981,7 @@ int igmp_rcv(struct sk_buff *skb)
980 int len = skb->len; 981 int len = skb->len;
981 bool dropped = true; 982 bool dropped = true;
982 983
983 if (in_dev == NULL) 984 if (!in_dev)
984 goto drop; 985 goto drop;
985 986
986 if (!pskb_may_pull(skb, sizeof(struct igmphdr))) 987 if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
@@ -1849,30 +1850,28 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
1849 pmc->sfcount[MCAST_EXCLUDE] = 1; 1850 pmc->sfcount[MCAST_EXCLUDE] = 1;
1850} 1851}
1851 1852
1852 1853/* Join a multicast group
1853/*
1854 * Join a multicast group
1855 */ 1854 */
1856int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) 1855
1856int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
1857{ 1857{
1858 int err;
1859 __be32 addr = imr->imr_multiaddr.s_addr; 1858 __be32 addr = imr->imr_multiaddr.s_addr;
1860 struct ip_mc_socklist *iml = NULL, *i; 1859 struct ip_mc_socklist *iml, *i;
1861 struct in_device *in_dev; 1860 struct in_device *in_dev;
1862 struct inet_sock *inet = inet_sk(sk); 1861 struct inet_sock *inet = inet_sk(sk);
1863 struct net *net = sock_net(sk); 1862 struct net *net = sock_net(sk);
1864 int ifindex; 1863 int ifindex;
1865 int count = 0; 1864 int count = 0;
1865 int err;
1866
1867 ASSERT_RTNL();
1866 1868
1867 if (!ipv4_is_multicast(addr)) 1869 if (!ipv4_is_multicast(addr))
1868 return -EINVAL; 1870 return -EINVAL;
1869 1871
1870 rtnl_lock();
1871
1872 in_dev = ip_mc_find_dev(net, imr); 1872 in_dev = ip_mc_find_dev(net, imr);
1873 1873
1874 if (!in_dev) { 1874 if (!in_dev) {
1875 iml = NULL;
1876 err = -ENODEV; 1875 err = -ENODEV;
1877 goto done; 1876 goto done;
1878 } 1877 }
@@ -1889,7 +1888,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1889 if (count >= sysctl_igmp_max_memberships) 1888 if (count >= sysctl_igmp_max_memberships)
1890 goto done; 1889 goto done;
1891 iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); 1890 iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1892 if (iml == NULL) 1891 if (!iml)
1893 goto done; 1892 goto done;
1894 1893
1895 memcpy(&iml->multi, imr, sizeof(*imr)); 1894 memcpy(&iml->multi, imr, sizeof(*imr));
@@ -1900,7 +1899,6 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1900 ip_mc_inc_group(in_dev, addr); 1899 ip_mc_inc_group(in_dev, addr);
1901 err = 0; 1900 err = 0;
1902done: 1901done:
1903 rtnl_unlock();
1904 return err; 1902 return err;
1905} 1903}
1906EXPORT_SYMBOL(ip_mc_join_group); 1904EXPORT_SYMBOL(ip_mc_join_group);
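
The locking change above inverts responsibility: ip_mc_join_group() used to take and drop the RTNL lock itself, and now merely asserts with ASSERT_RTNL() that its caller already holds it, which is why the rtnl_lock()/rtnl_unlock() pairs disappear from the function body. The caller-holds-the-lock contract, sketched in userspace (illustrative names, not the kernel's mechanism):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static int big_lock_held;

#define ASSERT_BIG_LOCK() assert(big_lock_held)         /* like ASSERT_RTNL() */

static void join_group(void)
{
        ASSERT_BIG_LOCK();              /* contract: caller locked already */
        puts("joined");
}

int main(void)
{
        pthread_mutex_lock(&big_lock);  /* like rtnl_lock() in the caller */
        big_lock_held = 1;

        join_group();

        big_lock_held = 0;
        pthread_mutex_unlock(&big_lock);
        return 0;
}
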
@@ -1911,7 +1909,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1911 struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); 1909 struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
1912 int err; 1910 int err;
1913 1911
1914 if (psf == NULL) { 1912 if (!psf) {
1915 /* any-source empty exclude case */ 1913 /* any-source empty exclude case */
1916 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1914 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1917 iml->sfmode, 0, NULL, 0); 1915 iml->sfmode, 0, NULL, 0);
@@ -1925,10 +1923,6 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1925 return err; 1923 return err;
1926} 1924}
1927 1925
1928/*
1929 * Ask a socket to leave a group.
1930 */
1931
1932int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) 1926int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1933{ 1927{
1934 struct inet_sock *inet = inet_sk(sk); 1928 struct inet_sock *inet = inet_sk(sk);
@@ -1940,7 +1934,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1940 u32 ifindex; 1934 u32 ifindex;
1941 int ret = -EADDRNOTAVAIL; 1935 int ret = -EADDRNOTAVAIL;
1942 1936
1943 rtnl_lock(); 1937 ASSERT_RTNL();
1938
1944 in_dev = ip_mc_find_dev(net, imr); 1939 in_dev = ip_mc_find_dev(net, imr);
1945 if (!in_dev) { 1940 if (!in_dev) {
1946 ret = -ENODEV; 1941 ret = -ENODEV;
@@ -1964,14 +1959,13 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1964 *imlp = iml->next_rcu; 1959 *imlp = iml->next_rcu;
1965 1960
1966 ip_mc_dec_group(in_dev, group); 1961 ip_mc_dec_group(in_dev, group);
1967 rtnl_unlock(); 1962
1968 /* decrease mem now to avoid the memleak warning */ 1963 /* decrease mem now to avoid the memleak warning */
1969 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); 1964 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
1970 kfree_rcu(iml, rcu); 1965 kfree_rcu(iml, rcu);
1971 return 0; 1966 return 0;
1972 } 1967 }
1973out: 1968out:
1974 rtnl_unlock();
1975 return ret; 1969 return ret;
1976} 1970}
1977EXPORT_SYMBOL(ip_mc_leave_group); 1971EXPORT_SYMBOL(ip_mc_leave_group);
@@ -1993,7 +1987,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1993 if (!ipv4_is_multicast(addr)) 1987 if (!ipv4_is_multicast(addr))
1994 return -EINVAL; 1988 return -EINVAL;
1995 1989
1996 rtnl_lock(); 1990 ASSERT_RTNL();
1997 1991
1998 imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; 1992 imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
1999 imr.imr_address.s_addr = mreqs->imr_interface; 1993 imr.imr_address.s_addr = mreqs->imr_interface;
@@ -2107,9 +2101,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
2107 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 2101 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
2108 &mreqs->imr_sourceaddr, 1); 2102 &mreqs->imr_sourceaddr, 1);
2109done: 2103done:
2110 rtnl_unlock();
2111 if (leavegroup) 2104 if (leavegroup)
2112 return ip_mc_leave_group(sk, &imr); 2105 err = ip_mc_leave_group(sk, &imr);
2113 return err; 2106 return err;
2114} 2107}
2115 2108
@@ -2131,7 +2124,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2131 msf->imsf_fmode != MCAST_EXCLUDE) 2124 msf->imsf_fmode != MCAST_EXCLUDE)
2132 return -EINVAL; 2125 return -EINVAL;
2133 2126
2134 rtnl_lock(); 2127 ASSERT_RTNL();
2135 2128
2136 imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; 2129 imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
2137 imr.imr_address.s_addr = msf->imsf_interface; 2130 imr.imr_address.s_addr = msf->imsf_interface;
@@ -2193,7 +2186,6 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2193 pmc->sfmode = msf->imsf_fmode; 2186 pmc->sfmode = msf->imsf_fmode;
2194 err = 0; 2187 err = 0;
2195done: 2188done:
2196 rtnl_unlock();
2197 if (leavegroup) 2189 if (leavegroup)
2198 err = ip_mc_leave_group(sk, &imr); 2190 err = ip_mc_leave_group(sk, &imr);
2199 return err; 2191 return err;
@@ -2368,7 +2360,7 @@ void ip_mc_drop_socket(struct sock *sk)
2368 struct ip_mc_socklist *iml; 2360 struct ip_mc_socklist *iml;
2369 struct net *net = sock_net(sk); 2361 struct net *net = sock_net(sk);
2370 2362
2371 if (inet->mc_list == NULL) 2363 if (!inet->mc_list)
2372 return; 2364 return;
2373 2365
2374 rtnl_lock(); 2366 rtnl_lock();
@@ -2378,7 +2370,7 @@ void ip_mc_drop_socket(struct sock *sk)
2378 inet->mc_list = iml->next_rcu; 2370 inet->mc_list = iml->next_rcu;
2379 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); 2371 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
2380 (void) ip_mc_leave_src(sk, iml, in_dev); 2372 (void) ip_mc_leave_src(sk, iml, in_dev);
2381 if (in_dev != NULL) 2373 if (in_dev)
2382 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); 2374 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2383 /* decrease mem now to avoid the memleak warning */ 2375 /* decrease mem now to avoid the memleak warning */
2384 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); 2376 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
@@ -2595,13 +2587,13 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2595 for_each_netdev_rcu(net, state->dev) { 2587 for_each_netdev_rcu(net, state->dev) {
2596 struct in_device *idev; 2588 struct in_device *idev;
2597 idev = __in_dev_get_rcu(state->dev); 2589 idev = __in_dev_get_rcu(state->dev);
2598 if (unlikely(idev == NULL)) 2590 if (unlikely(!idev))
2599 continue; 2591 continue;
2600 im = rcu_dereference(idev->mc_list); 2592 im = rcu_dereference(idev->mc_list);
2601 if (likely(im != NULL)) { 2593 if (likely(im)) {
2602 spin_lock_bh(&im->lock); 2594 spin_lock_bh(&im->lock);
2603 psf = im->sources; 2595 psf = im->sources;
2604 if (likely(psf != NULL)) { 2596 if (likely(psf)) {
2605 state->im = im; 2597 state->im = im;
2606 state->idev = idev; 2598 state->idev = idev;
2607 break; 2599 break;
@@ -2671,7 +2663,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2671 __releases(rcu) 2663 __releases(rcu)
2672{ 2664{
2673 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); 2665 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2674 if (likely(state->im != NULL)) { 2666 if (likely(state->im)) {
2675 spin_unlock_bh(&state->im->lock); 2667 spin_unlock_bh(&state->im->lock);
2676 state->im = NULL; 2668 state->im = NULL;
2677 } 2669 }
@@ -2724,6 +2716,7 @@ static const struct file_operations igmp_mcf_seq_fops = {
2724static int __net_init igmp_net_init(struct net *net) 2716static int __net_init igmp_net_init(struct net *net)
2725{ 2717{
2726 struct proc_dir_entry *pde; 2718 struct proc_dir_entry *pde;
2719 int err;
2727 2720
2728 pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); 2721 pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
2729 if (!pde) 2722 if (!pde)
@@ -2732,8 +2725,18 @@ static int __net_init igmp_net_init(struct net *net)
2732 &igmp_mcf_seq_fops); 2725 &igmp_mcf_seq_fops);
2733 if (!pde) 2726 if (!pde)
2734 goto out_mcfilter; 2727 goto out_mcfilter;
2728 err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
2729 SOCK_DGRAM, 0, net);
2730 if (err < 0) {
2731 pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
2732 err);
2733 goto out_sock;
2734 }
2735
2735 return 0; 2736 return 0;
2736 2737
2738out_sock:
2739 remove_proc_entry("mcfilter", net->proc_net);
2737out_mcfilter: 2740out_mcfilter:
2738 remove_proc_entry("igmp", net->proc_net); 2741 remove_proc_entry("igmp", net->proc_net);
2739out_igmp: 2742out_igmp:
@@ -2744,6 +2747,7 @@ static void __net_exit igmp_net_exit(struct net *net)
2744{ 2747{
2745 remove_proc_entry("mcfilter", net->proc_net); 2748 remove_proc_entry("mcfilter", net->proc_net);
2746 remove_proc_entry("igmp", net->proc_net); 2749 remove_proc_entry("igmp", net->proc_net);
2750 inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
2747} 2751}
2748 2752
2749static struct pernet_operations igmp_net_ops = { 2753static struct pernet_operations igmp_net_ops = {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 14d02ea905b6..8976ca423a07 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -23,6 +23,7 @@
23#include <net/route.h> 23#include <net/route.h>
24#include <net/tcp_states.h> 24#include <net/tcp_states.h>
25#include <net/xfrm.h> 25#include <net/xfrm.h>
26#include <net/tcp.h>
26 27
27#ifdef INET_CSK_DEBUG 28#ifdef INET_CSK_DEBUG
28const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; 29const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -268,6 +269,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
268 release_sock(sk); 269 release_sock(sk);
269 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) 270 if (reqsk_queue_empty(&icsk->icsk_accept_queue))
270 timeo = schedule_timeout(timeo); 271 timeo = schedule_timeout(timeo);
272 sched_annotate_sleep();
271 lock_sock(sk); 273 lock_sock(sk);
272 err = 0; 274 err = 0;
273 if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) 275 if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
@@ -293,8 +295,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
293{ 295{
294 struct inet_connection_sock *icsk = inet_csk(sk); 296 struct inet_connection_sock *icsk = inet_csk(sk);
295 struct request_sock_queue *queue = &icsk->icsk_accept_queue; 297 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
296 struct sock *newsk;
297 struct request_sock *req; 298 struct request_sock *req;
299 struct sock *newsk;
298 int error; 300 int error;
299 301
300 lock_sock(sk); 302 lock_sock(sk);
@@ -323,9 +325,11 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
323 newsk = req->sk; 325 newsk = req->sk;
324 326
325 sk_acceptq_removed(sk); 327 sk_acceptq_removed(sk);
326 if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { 328 if (sk->sk_protocol == IPPROTO_TCP &&
329 tcp_rsk(req)->tfo_listener &&
330 queue->fastopenq) {
327 spin_lock_bh(&queue->fastopenq->lock); 331 spin_lock_bh(&queue->fastopenq->lock);
328 if (tcp_rsk(req)->listener) { 332 if (tcp_rsk(req)->tfo_listener) {
329 /* We are still waiting for the final ACK from 3WHS 333 /* We are still waiting for the final ACK from 3WHS
330 * so can't free req now. Instead, we set req->sk to 334 * so can't free req now. Instead, we set req->sk to
331 * NULL to signify that the child socket is taken 335 * NULL to signify that the child socket is taken
@@ -340,7 +344,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
340out: 344out:
341 release_sock(sk); 345 release_sock(sk);
342 if (req) 346 if (req)
343 __reqsk_free(req); 347 reqsk_put(req);
344 return newsk; 348 return newsk;
345out_err: 349out_err:
346 newsk = NULL; 350 newsk = NULL;
@@ -399,18 +403,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
399 struct flowi4 *fl4, 403 struct flowi4 *fl4,
400 const struct request_sock *req) 404 const struct request_sock *req)
401{ 405{
402 struct rtable *rt;
403 const struct inet_request_sock *ireq = inet_rsk(req); 406 const struct inet_request_sock *ireq = inet_rsk(req);
404 struct ip_options_rcu *opt = inet_rsk(req)->opt; 407 struct net *net = read_pnet(&ireq->ireq_net);
405 struct net *net = sock_net(sk); 408 struct ip_options_rcu *opt = ireq->opt;
406 int flags = inet_sk_flowi_flags(sk); 409 struct rtable *rt;
407 410
408 flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, 411 flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
409 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 412 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
410 sk->sk_protocol, 413 sk->sk_protocol, inet_sk_flowi_flags(sk),
411 flags,
412 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, 414 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
413 ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); 415 ireq->ir_loc_addr, ireq->ir_rmt_port,
416 htons(ireq->ir_num));
414 security_req_classify_flow(req, flowi4_to_flowi(fl4)); 417 security_req_classify_flow(req, flowi4_to_flowi(fl4));
415 rt = ip_route_output_flow(net, fl4, sk); 418 rt = ip_route_output_flow(net, fl4, sk);
416 if (IS_ERR(rt)) 419 if (IS_ERR(rt))
@@ -432,9 +435,9 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
432 const struct request_sock *req) 435 const struct request_sock *req)
433{ 436{
434 const struct inet_request_sock *ireq = inet_rsk(req); 437 const struct inet_request_sock *ireq = inet_rsk(req);
438 struct net *net = read_pnet(&ireq->ireq_net);
435 struct inet_sock *newinet = inet_sk(newsk); 439 struct inet_sock *newinet = inet_sk(newsk);
436 struct ip_options_rcu *opt; 440 struct ip_options_rcu *opt;
437 struct net *net = sock_net(sk);
438 struct flowi4 *fl4; 441 struct flowi4 *fl4;
439 struct rtable *rt; 442 struct rtable *rt;
440 443
@@ -442,11 +445,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
442 445
443 rcu_read_lock(); 446 rcu_read_lock();
444 opt = rcu_dereference(newinet->inet_opt); 447 opt = rcu_dereference(newinet->inet_opt);
445 flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, 448 flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
446 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 449 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
447 sk->sk_protocol, inet_sk_flowi_flags(sk), 450 sk->sk_protocol, inet_sk_flowi_flags(sk),
448 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, 451 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
449 ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); 452 ireq->ir_loc_addr, ireq->ir_rmt_port,
453 htons(ireq->ir_num));
450 security_req_classify_flow(req, flowi4_to_flowi(fl4)); 454 security_req_classify_flow(req, flowi4_to_flowi(fl4));
451 rt = ip_route_output_flow(net, fl4, sk); 455 rt = ip_route_output_flow(net, fl4, sk);
452 if (IS_ERR(rt)) 456 if (IS_ERR(rt))
@@ -474,33 +478,37 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
474#if IS_ENABLED(CONFIG_IPV6) 478#if IS_ENABLED(CONFIG_IPV6)
475#define AF_INET_FAMILY(fam) ((fam) == AF_INET) 479#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
476#else 480#else
477#define AF_INET_FAMILY(fam) 1 481#define AF_INET_FAMILY(fam) true
478#endif 482#endif
479 483
480struct request_sock *inet_csk_search_req(const struct sock *sk, 484/* Note: this is temporary :
481 struct request_sock ***prevp, 485 * req sock will no longer be in listener hash table
482 const __be16 rport, const __be32 raddr, 486*/
487struct request_sock *inet_csk_search_req(struct sock *sk,
488 const __be16 rport,
489 const __be32 raddr,
483 const __be32 laddr) 490 const __be32 laddr)
484{ 491{
485 const struct inet_connection_sock *icsk = inet_csk(sk); 492 struct inet_connection_sock *icsk = inet_csk(sk);
486 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; 493 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
487 struct request_sock *req, **prev; 494 struct request_sock *req;
495 u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
496 lopt->nr_table_entries);
488 497
489 for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, 498 spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
490 lopt->nr_table_entries)]; 499 for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
491 (req = *prev) != NULL;
492 prev = &req->dl_next) {
493 const struct inet_request_sock *ireq = inet_rsk(req); 500 const struct inet_request_sock *ireq = inet_rsk(req);
494 501
495 if (ireq->ir_rmt_port == rport && 502 if (ireq->ir_rmt_port == rport &&
496 ireq->ir_rmt_addr == raddr && 503 ireq->ir_rmt_addr == raddr &&
497 ireq->ir_loc_addr == laddr && 504 ireq->ir_loc_addr == laddr &&
498 AF_INET_FAMILY(req->rsk_ops->family)) { 505 AF_INET_FAMILY(req->rsk_ops->family)) {
506 atomic_inc(&req->rsk_refcnt);
499 WARN_ON(req->sk); 507 WARN_ON(req->sk);
500 *prevp = prev;
501 break; 508 break;
502 } 509 }
503 } 510 }
511 spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
504 512
505 return req; 513 return req;
506} 514}
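
The rewritten inet_csk_search_req() above pairs its lookup with a reference: it searches under syn_wait_lock and, on a match, bumps rsk_refcnt before the lock is dropped, so the request cannot be freed while the caller still holds the pointer. The lookup-plus-reference pattern in miniature (userspace, illustrative names):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        int key;
        int refcnt;
        struct obj *next;
};

static struct obj *table;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *lookup_get(int key)
{
        struct obj *o;

        pthread_mutex_lock(&table_lock);
        for (o = table; o; o = o->next) {
                if (o->key == key) {
                        o->refcnt++;    /* taken before the lock drops */
                        break;
                }
        }
        pthread_mutex_unlock(&table_lock);
        return o;                       /* caller owns a reference, or NULL */
}

static void obj_put(struct obj *o)
{
        if (--o->refcnt == 0)
                free(o);
}

int main(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        o->key = 7;
        o->refcnt = 1;                  /* the table's own reference */
        table = o;

        o = lookup_get(7);
        if (o) {
                printf("found key %d\n", o->key);
                obj_put(o);             /* drop the lookup reference */
        }
        return 0;
}
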
@@ -556,23 +564,58 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
556} 564}
557EXPORT_SYMBOL(inet_rtx_syn_ack); 565EXPORT_SYMBOL(inet_rtx_syn_ack);
558 566
559void inet_csk_reqsk_queue_prune(struct sock *parent, 567/* return true if req was found in the syn_table[] */
560 const unsigned long interval, 568static bool reqsk_queue_unlink(struct request_sock_queue *queue,
561 const unsigned long timeout, 569 struct request_sock *req)
562 const unsigned long max_rto) 570{
571 struct listen_sock *lopt = queue->listen_opt;
572 struct request_sock **prev;
573 bool found = false;
574
575 spin_lock(&queue->syn_wait_lock);
576
577 for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
578 prev = &(*prev)->dl_next) {
579 if (*prev == req) {
580 *prev = req->dl_next;
581 found = true;
582 break;
583 }
584 }
585
586 spin_unlock(&queue->syn_wait_lock);
587 if (del_timer(&req->rsk_timer))
588 reqsk_put(req);
589 return found;
590}
591
592void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
563{ 593{
564 struct inet_connection_sock *icsk = inet_csk(parent); 594 if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
595 reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
596 reqsk_put(req);
597 }
598}
599EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
600
601static void reqsk_timer_handler(unsigned long data)
602{
603 struct request_sock *req = (struct request_sock *)data;
604 struct sock *sk_listener = req->rsk_listener;
605 struct inet_connection_sock *icsk = inet_csk(sk_listener);
565 struct request_sock_queue *queue = &icsk->icsk_accept_queue; 606 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
566 struct listen_sock *lopt = queue->listen_opt; 607 struct listen_sock *lopt = queue->listen_opt;
567 int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; 608 int qlen, expire = 0, resend = 0;
568 int thresh = max_retries; 609 int max_retries, thresh;
569 unsigned long now = jiffies; 610 u8 defer_accept;
570 struct request_sock **reqp, *req;
571 int i, budget;
572 611
573 if (lopt == NULL || lopt->qlen == 0) 612 if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
613 reqsk_put(req);
574 return; 614 return;
615 }
575 616
617 max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
618 thresh = max_retries;
576 /* Normally all the openreqs are young and become mature 619 /* Normally all the openreqs are young and become mature
577 * (i.e. converted to established socket) for first timeout. 620 * (i.e. converted to established socket) for first timeout.
578 * If synack was not acknowledged for 1 second, it means 621 * If synack was not acknowledged for 1 second, it means
@@ -590,67 +633,65 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
590 * embrions; and abort old ones without pity, if old 633 * embrions; and abort old ones without pity, if old
591 * ones are about to clog our table. 634 * ones are about to clog our table.
592 */ 635 */
593 if (lopt->qlen>>(lopt->max_qlen_log-1)) { 636 qlen = listen_sock_qlen(lopt);
594 int young = (lopt->qlen_young<<1); 637 if (qlen >> (lopt->max_qlen_log - 1)) {
638 int young = listen_sock_young(lopt) << 1;
595 639
596 while (thresh > 2) { 640 while (thresh > 2) {
597 if (lopt->qlen < young) 641 if (qlen < young)
598 break; 642 break;
599 thresh--; 643 thresh--;
600 young <<= 1; 644 young <<= 1;
601 } 645 }
602 } 646 }
647 defer_accept = READ_ONCE(queue->rskq_defer_accept);
648 if (defer_accept)
649 max_retries = defer_accept;
650 syn_ack_recalc(req, thresh, max_retries, defer_accept,
651 &expire, &resend);
652 req->rsk_ops->syn_ack_timeout(req);
653 if (!expire &&
654 (!resend ||
655 !inet_rtx_syn_ack(sk_listener, req) ||
656 inet_rsk(req)->acked)) {
657 unsigned long timeo;
658
659 if (req->num_timeout++ == 0)
660 atomic_inc(&lopt->young_dec);
661 timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
662 mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
663 return;
664 }
665 inet_csk_reqsk_queue_drop(sk_listener, req);
666 reqsk_put(req);
667}
603 668
604 if (queue->rskq_defer_accept) 669void reqsk_queue_hash_req(struct request_sock_queue *queue,
605 max_retries = queue->rskq_defer_accept; 670 u32 hash, struct request_sock *req,
606 671 unsigned long timeout)
607 budget = 2 * (lopt->nr_table_entries / (timeout / interval)); 672{
608 i = lopt->clock_hand; 673 struct listen_sock *lopt = queue->listen_opt;
609
610 do {
611 reqp=&lopt->syn_table[i];
612 while ((req = *reqp) != NULL) {
613 if (time_after_eq(now, req->expires)) {
614 int expire = 0, resend = 0;
615
616 syn_ack_recalc(req, thresh, max_retries,
617 queue->rskq_defer_accept,
618 &expire, &resend);
619 req->rsk_ops->syn_ack_timeout(parent, req);
620 if (!expire &&
621 (!resend ||
622 !inet_rtx_syn_ack(parent, req) ||
623 inet_rsk(req)->acked)) {
624 unsigned long timeo;
625
626 if (req->num_timeout++ == 0)
627 lopt->qlen_young--;
628 timeo = min(timeout << req->num_timeout,
629 max_rto);
630 req->expires = now + timeo;
631 reqp = &req->dl_next;
632 continue;
633 }
634
635 /* Drop this request */
636 inet_csk_reqsk_queue_unlink(parent, req, reqp);
637 reqsk_queue_removed(queue, req);
638 reqsk_free(req);
639 continue;
640 }
641 reqp = &req->dl_next;
642 }
643 674
644 i = (i + 1) & (lopt->nr_table_entries - 1); 675 req->num_retrans = 0;
676 req->num_timeout = 0;
677 req->sk = NULL;
645 678
646 } while (--budget > 0); 679 /* before letting lookups find us, make sure all req fields
680 * are committed to memory and refcnt initialized.
681 */
682 smp_wmb();
683 atomic_set(&req->rsk_refcnt, 2);
684 setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
685 req->rsk_hash = hash;
647 686
648 lopt->clock_hand = i; 687 spin_lock(&queue->syn_wait_lock);
688 req->dl_next = lopt->syn_table[hash];
689 lopt->syn_table[hash] = req;
690 spin_unlock(&queue->syn_wait_lock);
649 691
650 if (lopt->qlen) 692 mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
651 inet_csk_reset_keepalive_timer(parent, interval);
652} 693}
653EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); 694EXPORT_SYMBOL(reqsk_queue_hash_req);
654 695
655/** 696/**
656 * inet_csk_clone_lock - clone an inet socket, and lock its clone 697 * inet_csk_clone_lock - clone an inet socket, and lock its clone
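
The hunk above moves SYN-ACK retransmission from one listener-wide prune timer to a timer embedded in each request sock, and ties its lifetime to the refcount: reqsk_queue_hash_req() publishes the request with rsk_refcnt set to 2, and reqsk_queue_unlink() drops one reference only when del_timer() confirms it cancelled a pending timer. Reading that as "the pending timer owns one reference" is an interpretation of the hunk; the rule it implies, in a single-threaded userspace sketch:

#include <stdio.h>
#include <stdlib.h>

struct req {
        int refcnt;
        int timer_pending;
};

static void req_put(struct req *r)
{
        if (--r->refcnt == 0) {
                puts("freeing req");
                free(r);
        }
}

static int fake_del_timer(struct req *r)        /* 1 if it was pending */
{
        int was_pending = r->timer_pending;

        r->timer_pending = 0;
        return was_pending;
}

int main(void)
{
        struct req *r = malloc(sizeof(*r));

        r->refcnt = 2;          /* one for the table, one for the timer */
        r->timer_pending = 1;

        /* teardown, as in reqsk_queue_unlink() + reqsk_put(): */
        if (fake_del_timer(r))
                req_put(r);     /* the timer's reference */
        req_put(r);             /* the table's reference: frees */
        return 0;
}
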
@@ -666,7 +707,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
666{ 707{
667 struct sock *newsk = sk_clone_lock(sk, priority); 708 struct sock *newsk = sk_clone_lock(sk, priority);
668 709
669 if (newsk != NULL) { 710 if (newsk) {
670 struct inet_connection_sock *newicsk = inet_csk(newsk); 711 struct inet_connection_sock *newicsk = inet_csk(newsk);
671 712
672 newsk->sk_state = TCP_SYN_RECV; 713 newsk->sk_state = TCP_SYN_RECV;
@@ -678,6 +719,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
678 newsk->sk_write_space = sk_stream_write_space; 719 newsk->sk_write_space = sk_stream_write_space;
679 720
680 newsk->sk_mark = inet_rsk(req)->ir_mark; 721 newsk->sk_mark = inet_rsk(req)->ir_mark;
722 atomic64_set(&newsk->sk_cookie,
723 atomic64_read(&inet_rsk(req)->ir_cookie));
681 724
682 newicsk->icsk_retransmits = 0; 725 newicsk->icsk_retransmits = 0;
683 newicsk->icsk_backoff = 0; 726 newicsk->icsk_backoff = 0;
@@ -784,8 +827,6 @@ void inet_csk_listen_stop(struct sock *sk)
784 struct request_sock *acc_req; 827 struct request_sock *acc_req;
785 struct request_sock *req; 828 struct request_sock *req;
786 829
787 inet_csk_delete_keepalive_timer(sk);
788
789 /* make all the listen_opt local to us */ 830 /* make all the listen_opt local to us */
790 acc_req = reqsk_queue_yank_acceptq(queue); 831 acc_req = reqsk_queue_yank_acceptq(queue);
791 832
@@ -815,9 +856,9 @@ void inet_csk_listen_stop(struct sock *sk)
815 856
816 percpu_counter_inc(sk->sk_prot->orphan_count); 857 percpu_counter_inc(sk->sk_prot->orphan_count);
817 858
818 if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { 859 if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
819 BUG_ON(tcp_sk(child)->fastopen_rsk != req); 860 BUG_ON(tcp_sk(child)->fastopen_rsk != req);
820 BUG_ON(sk != tcp_rsk(req)->listener); 861 BUG_ON(sk != req->rsk_listener);
821 862
822 /* Paranoid, to prevent race condition if 863 /* Paranoid, to prevent race condition if
823 * an inbound pkt destined for child is 864 * an inbound pkt destined for child is
@@ -826,7 +867,6 @@ void inet_csk_listen_stop(struct sock *sk)
826 * tcp_v4_destroy_sock(). 867 * tcp_v4_destroy_sock().
827 */ 868 */
828 tcp_sk(child)->fastopen_rsk = NULL; 869 tcp_sk(child)->fastopen_rsk = NULL;
829 sock_put(sk);
830 } 870 }
831 inet_csk_destroy_sock(child); 871 inet_csk_destroy_sock(child);
832 872
@@ -835,9 +875,9 @@ void inet_csk_listen_stop(struct sock *sk)
835 sock_put(child); 875 sock_put(child);
836 876
837 sk_acceptq_removed(sk); 877 sk_acceptq_removed(sk);
838 __reqsk_free(req); 878 reqsk_put(req);
839 } 879 }
840 if (queue->fastopenq != NULL) { 880 if (queue->fastopenq) {
841 /* Free all the reqs queued in rskq_rst_head. */ 881 /* Free all the reqs queued in rskq_rst_head. */
842 spin_lock_bh(&queue->fastopenq->lock); 882 spin_lock_bh(&queue->fastopenq->lock);
843 acc_req = queue->fastopenq->rskq_rst_head; 883 acc_req = queue->fastopenq->rskq_rst_head;
@@ -845,7 +885,7 @@ void inet_csk_listen_stop(struct sock *sk)
845 spin_unlock_bh(&queue->fastopenq->lock); 885 spin_unlock_bh(&queue->fastopenq->lock);
846 while ((req = acc_req) != NULL) { 886 while ((req = acc_req) != NULL) {
847 acc_req = req->dl_next; 887 acc_req = req->dl_next;
848 __reqsk_free(req); 888 reqsk_put(req);
849 } 889 }
850 } 890 }
851 WARN_ON(sk->sk_ack_backlog); 891 WARN_ON(sk->sk_ack_backlog);
@@ -869,7 +909,7 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
869{ 909{
870 const struct inet_connection_sock *icsk = inet_csk(sk); 910 const struct inet_connection_sock *icsk = inet_csk(sk);
871 911
872 if (icsk->icsk_af_ops->compat_getsockopt != NULL) 912 if (icsk->icsk_af_ops->compat_getsockopt)
873 return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname, 913 return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
874 optval, optlen); 914 optval, optlen);
875 return icsk->icsk_af_ops->getsockopt(sk, level, optname, 915 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
@@ -882,7 +922,7 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
882{ 922{
883 const struct inet_connection_sock *icsk = inet_csk(sk); 923 const struct inet_connection_sock *icsk = inet_csk(sk);
884 924
885 if (icsk->icsk_af_ops->compat_setsockopt != NULL) 925 if (icsk->icsk_af_ops->compat_setsockopt)
886 return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname, 926 return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
887 optval, optlen); 927 optval, optlen);
888 return icsk->icsk_af_ops->setsockopt(sk, level, optname, 928 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 81751f12645f..bb77ebdae3b3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -38,16 +38,12 @@
38static const struct inet_diag_handler **inet_diag_table; 38static const struct inet_diag_handler **inet_diag_table;
39 39
40struct inet_diag_entry { 40struct inet_diag_entry {
41 __be32 *saddr; 41 const __be32 *saddr;
42 __be32 *daddr; 42 const __be32 *daddr;
43 u16 sport; 43 u16 sport;
44 u16 dport; 44 u16 dport;
45 u16 family; 45 u16 family;
46 u16 userlocks; 46 u16 userlocks;
47#if IS_ENABLED(CONFIG_IPV6)
48 struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
49 struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
50#endif
51}; 47};
52 48
53static DEFINE_MUTEX(inet_diag_table_mutex); 49static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -65,28 +61,66 @@ static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
65 return inet_diag_table[proto]; 61 return inet_diag_table[proto];
66} 62}
67 63
68static inline void inet_diag_unlock_handler( 64static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
69 const struct inet_diag_handler *handler)
70{ 65{
71 mutex_unlock(&inet_diag_table_mutex); 66 mutex_unlock(&inet_diag_table_mutex);
72} 67}
73 68
69static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
70{
71 r->idiag_family = sk->sk_family;
72
73 r->id.idiag_sport = htons(sk->sk_num);
74 r->id.idiag_dport = sk->sk_dport;
75 r->id.idiag_if = sk->sk_bound_dev_if;
76 sock_diag_save_cookie(sk, r->id.idiag_cookie);
77
78#if IS_ENABLED(CONFIG_IPV6)
79 if (sk->sk_family == AF_INET6) {
80 *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
81 *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
82 } else
83#endif
84 {
85 memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
86 memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
87
88 r->id.idiag_src[0] = sk->sk_rcv_saddr;
89 r->id.idiag_dst[0] = sk->sk_daddr;
90 }
91}
92
93static size_t inet_sk_attr_size(void)
94{
95 return nla_total_size(sizeof(struct tcp_info))
96 + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
97 + nla_total_size(1) /* INET_DIAG_TOS */
98 + nla_total_size(1) /* INET_DIAG_TCLASS */
99 + nla_total_size(sizeof(struct inet_diag_meminfo))
100 + nla_total_size(sizeof(struct inet_diag_msg))
101 + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
102 + nla_total_size(TCP_CA_NAME_MAX)
103 + nla_total_size(sizeof(struct tcpvegas_info))
104 + 64;
105}
106
74int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, 107int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
75 struct sk_buff *skb, struct inet_diag_req_v2 *req, 108 struct sk_buff *skb, const struct inet_diag_req_v2 *req,
76 struct user_namespace *user_ns, 109 struct user_namespace *user_ns,
77 u32 portid, u32 seq, u16 nlmsg_flags, 110 u32 portid, u32 seq, u16 nlmsg_flags,
78 const struct nlmsghdr *unlh) 111 const struct nlmsghdr *unlh)
79{ 112{
80 const struct inet_sock *inet = inet_sk(sk); 113 const struct inet_sock *inet = inet_sk(sk);
114 const struct tcp_congestion_ops *ca_ops;
115 const struct inet_diag_handler *handler;
116 int ext = req->idiag_ext;
81 struct inet_diag_msg *r; 117 struct inet_diag_msg *r;
82 struct nlmsghdr *nlh; 118 struct nlmsghdr *nlh;
83 struct nlattr *attr; 119 struct nlattr *attr;
84 void *info = NULL; 120 void *info = NULL;
85 const struct inet_diag_handler *handler;
86 int ext = req->idiag_ext;
87 121
88 handler = inet_diag_table[req->sdiag_protocol]; 122 handler = inet_diag_table[req->sdiag_protocol];
89 BUG_ON(handler == NULL); 123 BUG_ON(!handler);
90 124
91 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), 125 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
92 nlmsg_flags); 126 nlmsg_flags);
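
inet_sk_attr_size() above pre-computes an upper bound for the reply by summing nla_total_size() over every attribute the dump may emit, plus 64 bytes of slack. Each netlink attribute costs a 4-byte header plus its payload, both rounded up to 4-byte alignment, which is all the arithmetic there is (userspace restatement; the attribute sizes below are examples, not the full list from the patch):

#include <stdio.h>

#define NLA_HDRLEN 4
#define NLA_ALIGN(len) (((len) + 3) & ~3)

static int nla_total_size(int payload)
{
        return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
        int size = nla_total_size(1)    /* e.g. INET_DIAG_SHUTDOWN, one u8 */
                 + nla_total_size(1)    /* e.g. INET_DIAG_TOS */
                 + nla_total_size(16)   /* e.g. a TCP_CA_NAME_MAX string */
                 + 64;                  /* slack, as in the patch */

        printf("preallocate %d bytes\n", size);
        return 0;
}
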
@@ -94,25 +128,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
94 return -EMSGSIZE; 128 return -EMSGSIZE;
95 129
96 r = nlmsg_data(nlh); 130 r = nlmsg_data(nlh);
97 BUG_ON(sk->sk_state == TCP_TIME_WAIT); 131 BUG_ON(!sk_fullsock(sk));
98 132
99 r->idiag_family = sk->sk_family; 133 inet_diag_msg_common_fill(r, sk);
100 r->idiag_state = sk->sk_state; 134 r->idiag_state = sk->sk_state;
101 r->idiag_timer = 0; 135 r->idiag_timer = 0;
102 r->idiag_retrans = 0; 136 r->idiag_retrans = 0;
103 137
104 r->id.idiag_if = sk->sk_bound_dev_if;
105 sock_diag_save_cookie(sk, r->id.idiag_cookie);
106
107 r->id.idiag_sport = inet->inet_sport;
108 r->id.idiag_dport = inet->inet_dport;
109
110 memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
111 memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
112
113 r->id.idiag_src[0] = inet->inet_rcv_saddr;
114 r->id.idiag_dst[0] = inet->inet_daddr;
115
116 if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) 138 if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
117 goto errout; 139 goto errout;
118 140
@@ -125,10 +147,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
125 147
126#if IS_ENABLED(CONFIG_IPV6) 148#if IS_ENABLED(CONFIG_IPV6)
127 if (r->idiag_family == AF_INET6) { 149 if (r->idiag_family == AF_INET6) {
128
129 *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
130 *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
131
132 if (ext & (1 << (INET_DIAG_TCLASS - 1))) 150 if (ext & (1 << (INET_DIAG_TCLASS - 1)))
133 if (nla_put_u8(skb, INET_DIAG_TCLASS, 151 if (nla_put_u8(skb, INET_DIAG_TCLASS,
134 inet6_sk(sk)->tclass) < 0) 152 inet6_sk(sk)->tclass) < 0)
@@ -155,7 +173,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
155 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) 173 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
156 goto errout; 174 goto errout;
157 175
158 if (icsk == NULL) { 176 if (!icsk) {
159 handler->idiag_get_info(sk, r, NULL); 177 handler->idiag_get_info(sk, r, NULL);
160 goto out; 178 goto out;
161 } 179 }
@@ -191,16 +209,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
191 info = nla_data(attr); 209 info = nla_data(attr);
192 } 210 }
193 211
194 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) 212 if (ext & (1 << (INET_DIAG_CONG - 1))) {
195 if (nla_put_string(skb, INET_DIAG_CONG, 213 int err = 0;
196 icsk->icsk_ca_ops->name) < 0) 214
215 rcu_read_lock();
216 ca_ops = READ_ONCE(icsk->icsk_ca_ops);
217 if (ca_ops)
218 err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
219 rcu_read_unlock();
220 if (err < 0)
197 goto errout; 221 goto errout;
222 }
198 223
199 handler->idiag_get_info(sk, r, info); 224 handler->idiag_get_info(sk, r, info);
200 225
201 if (sk->sk_state < TCP_TIME_WAIT && 226 if (sk->sk_state < TCP_TIME_WAIT) {
202 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) 227 int err = 0;
203 icsk->icsk_ca_ops->get_info(sk, ext, skb); 228
229 rcu_read_lock();
230 ca_ops = READ_ONCE(icsk->icsk_ca_ops);
231 if (ca_ops && ca_ops->get_info)
232 err = ca_ops->get_info(sk, ext, skb);
233 rcu_read_unlock();
234 if (err < 0)
235 goto errout;
236 }
204 237
205out: 238out:
206 nlmsg_end(skb, nlh); 239 nlmsg_end(skb, nlh);
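
Both congestion-ops paths above follow the same discipline: load icsk_ca_ops once with READ_ONCE() under rcu_read_lock(), then test and call through only that local copy, so a concurrent change of the ops pointer can never be observed halfway through the checks. The single-load half of the pattern in userspace (the RCU grace-period side is elided here; names are illustrative):

#include <stdio.h>

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct ops {
        const char *name;
        int (*get_info)(void);
};

static struct ops *shared_ops;          /* stands in for icsk->icsk_ca_ops */

static int answer(void)
{
        return 42;
}

static void report(void)
{
        struct ops *ops = READ_ONCE(shared_ops);        /* one load only */

        if (ops && ops->get_info)
                printf("%s: %d\n", ops->name, ops->get_info());
}

int main(void)
{
        static struct ops demo_ops = { "demo_cc", answer };

        shared_ops = &demo_ops;
        report();
        return 0;
}
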
@@ -213,23 +246,25 @@ errout:
213EXPORT_SYMBOL_GPL(inet_sk_diag_fill); 246EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
214 247
215static int inet_csk_diag_fill(struct sock *sk, 248static int inet_csk_diag_fill(struct sock *sk,
216 struct sk_buff *skb, struct inet_diag_req_v2 *req, 249 struct sk_buff *skb,
250 const struct inet_diag_req_v2 *req,
217 struct user_namespace *user_ns, 251 struct user_namespace *user_ns,
218 u32 portid, u32 seq, u16 nlmsg_flags, 252 u32 portid, u32 seq, u16 nlmsg_flags,
219 const struct nlmsghdr *unlh) 253 const struct nlmsghdr *unlh)
220{ 254{
221 return inet_sk_diag_fill(sk, inet_csk(sk), 255 return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
222 skb, req, user_ns, portid, seq, nlmsg_flags, unlh); 256 user_ns, portid, seq, nlmsg_flags, unlh);
223} 257}
224 258
225static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, 259static int inet_twsk_diag_fill(struct sock *sk,
226 struct sk_buff *skb, struct inet_diag_req_v2 *req, 260 struct sk_buff *skb,
227 u32 portid, u32 seq, u16 nlmsg_flags, 261 u32 portid, u32 seq, u16 nlmsg_flags,
228 const struct nlmsghdr *unlh) 262 const struct nlmsghdr *unlh)
229{ 263{
230 s32 tmo; 264 struct inet_timewait_sock *tw = inet_twsk(sk);
231 struct inet_diag_msg *r; 265 struct inet_diag_msg *r;
232 struct nlmsghdr *nlh; 266 struct nlmsghdr *nlh;
267 long tmo;
233 268
234 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), 269 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
235 nlmsg_flags); 270 nlmsg_flags);
@@ -239,25 +274,13 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
239 r = nlmsg_data(nlh); 274 r = nlmsg_data(nlh);
240 BUG_ON(tw->tw_state != TCP_TIME_WAIT); 275 BUG_ON(tw->tw_state != TCP_TIME_WAIT);
241 276
242 tmo = tw->tw_ttd - inet_tw_time_stamp(); 277 tmo = tw->tw_timer.expires - jiffies;
243 if (tmo < 0) 278 if (tmo < 0)
244 tmo = 0; 279 tmo = 0;
245 280
246 r->idiag_family = tw->tw_family; 281 inet_diag_msg_common_fill(r, sk);
247 r->idiag_retrans = 0; 282 r->idiag_retrans = 0;
248 283
249 r->id.idiag_if = tw->tw_bound_dev_if;
250 sock_diag_save_cookie(tw, r->id.idiag_cookie);
251
252 r->id.idiag_sport = tw->tw_sport;
253 r->id.idiag_dport = tw->tw_dport;
254
255 memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
256 memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
257
258 r->id.idiag_src[0] = tw->tw_rcv_saddr;
259 r->id.idiag_dst[0] = tw->tw_daddr;
260
261 r->idiag_state = tw->tw_substate; 284 r->idiag_state = tw->tw_substate;
262 r->idiag_timer = 3; 285 r->idiag_timer = 3;
263 r->idiag_expires = jiffies_to_msecs(tmo); 286 r->idiag_expires = jiffies_to_msecs(tmo);
@@ -265,70 +288,98 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
265 r->idiag_wqueue = 0; 288 r->idiag_wqueue = 0;
266 r->idiag_uid = 0; 289 r->idiag_uid = 0;
267 r->idiag_inode = 0; 290 r->idiag_inode = 0;
268#if IS_ENABLED(CONFIG_IPV6) 291
269 if (tw->tw_family == AF_INET6) { 292 nlmsg_end(skb, nlh);
270 *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr; 293 return 0;
271 *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr; 294}
272 } 295
273#endif 296static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
297 u32 portid, u32 seq, u16 nlmsg_flags,
298 const struct nlmsghdr *unlh)
299{
300 struct inet_diag_msg *r;
301 struct nlmsghdr *nlh;
302 long tmo;
303
304 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
305 nlmsg_flags);
306 if (!nlh)
307 return -EMSGSIZE;
308
309 r = nlmsg_data(nlh);
310 inet_diag_msg_common_fill(r, sk);
311 r->idiag_state = TCP_SYN_RECV;
312 r->idiag_timer = 1;
313 r->idiag_retrans = inet_reqsk(sk)->num_retrans;
314
315 BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
316 offsetof(struct sock, sk_cookie));
317
318 tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
319 r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
320 r->idiag_rqueue = 0;
321 r->idiag_wqueue = 0;
322 r->idiag_uid = 0;
323 r->idiag_inode = 0;
274 324
275 nlmsg_end(skb, nlh); 325 nlmsg_end(skb, nlh);
276 return 0; 326 return 0;
277} 327}
278 328
279static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, 329static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
280 struct inet_diag_req_v2 *r, 330 const struct inet_diag_req_v2 *r,
281 struct user_namespace *user_ns, 331 struct user_namespace *user_ns,
282 u32 portid, u32 seq, u16 nlmsg_flags, 332 u32 portid, u32 seq, u16 nlmsg_flags,
283 const struct nlmsghdr *unlh) 333 const struct nlmsghdr *unlh)
284{ 334{
285 if (sk->sk_state == TCP_TIME_WAIT) 335 if (sk->sk_state == TCP_TIME_WAIT)
286 return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq, 336 return inet_twsk_diag_fill(sk, skb, portid, seq,
287 nlmsg_flags, unlh); 337 nlmsg_flags, unlh);
288 338
339 if (sk->sk_state == TCP_NEW_SYN_RECV)
340 return inet_req_diag_fill(sk, skb, portid, seq,
341 nlmsg_flags, unlh);
342
289 return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, 343 return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
290 nlmsg_flags, unlh); 344 nlmsg_flags, unlh);
291} 345}
292 346
293int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, 347int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
294 const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) 348 struct sk_buff *in_skb,
349 const struct nlmsghdr *nlh,
350 const struct inet_diag_req_v2 *req)
295{ 351{
296 int err;
297 struct sock *sk;
298 struct sk_buff *rep;
299 struct net *net = sock_net(in_skb->sk); 352 struct net *net = sock_net(in_skb->sk);
353 struct sk_buff *rep;
354 struct sock *sk;
355 int err;
300 356
301 err = -EINVAL; 357 err = -EINVAL;
302 if (req->sdiag_family == AF_INET) { 358 if (req->sdiag_family == AF_INET)
303 sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], 359 sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
304 req->id.idiag_dport, req->id.idiag_src[0], 360 req->id.idiag_dport, req->id.idiag_src[0],
305 req->id.idiag_sport, req->id.idiag_if); 361 req->id.idiag_sport, req->id.idiag_if);
306 }
307#if IS_ENABLED(CONFIG_IPV6) 362#if IS_ENABLED(CONFIG_IPV6)
308 else if (req->sdiag_family == AF_INET6) { 363 else if (req->sdiag_family == AF_INET6)
309 sk = inet6_lookup(net, hashinfo, 364 sk = inet6_lookup(net, hashinfo,
310 (struct in6_addr *)req->id.idiag_dst, 365 (struct in6_addr *)req->id.idiag_dst,
311 req->id.idiag_dport, 366 req->id.idiag_dport,
312 (struct in6_addr *)req->id.idiag_src, 367 (struct in6_addr *)req->id.idiag_src,
313 req->id.idiag_sport, 368 req->id.idiag_sport,
314 req->id.idiag_if); 369 req->id.idiag_if);
315 }
316#endif 370#endif
317 else { 371 else
318 goto out_nosk; 372 goto out_nosk;
319 }
320 373
321 err = -ENOENT; 374 err = -ENOENT;
322 if (sk == NULL) 375 if (!sk)
323 goto out_nosk; 376 goto out_nosk;
324 377
325 err = sock_diag_check_cookie(sk, req->id.idiag_cookie); 378 err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
326 if (err) 379 if (err)
327 goto out; 380 goto out;
328 381
329 rep = nlmsg_new(sizeof(struct inet_diag_msg) + 382 rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
330 sizeof(struct inet_diag_meminfo) +
331 sizeof(struct tcp_info) + 64, GFP_KERNEL);
332 if (!rep) { 383 if (!rep) {
333 err = -ENOMEM; 384 err = -ENOMEM;
334 goto out; 385 goto out;
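sk_diag_fill() now fans out on sk->sk_state alone, which is what lets the dump path further down treat full, TIME_WAIT and NEW_SYN_RECV sockets uniformly. A reduced stand-alone rendering of that dispatch (socket types collapsed to an enum, the fill functions reduced to stubs):

#include <stdio.h>

enum state { ESTABLISHED, TIME_WAIT, NEW_SYN_RECV };

struct sock { enum state state; };

static int twsk_fill(struct sock *sk) { return printf("timewait fill\n") < 0; }
static int req_fill(struct sock *sk)  { return printf("request fill\n") < 0; }
static int csk_fill(struct sock *sk)  { return printf("full-sock fill\n") < 0; }

/* One entry point; minimal sockets never reach the full-socket path. */
static int sk_diag_fill(struct sock *sk)
{
	if (sk->state == TIME_WAIT)
		return twsk_fill(sk);
	if (sk->state == NEW_SYN_RECV)
		return req_fill(sk);
	return csk_fill(sk);
}

int main(void)
{
	struct sock tw = { TIME_WAIT }, rq = { NEW_SYN_RECV }, es = { ESTABLISHED };

	return sk_diag_fill(&tw) || sk_diag_fill(&rq) || sk_diag_fill(&es);
}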
@@ -359,7 +410,7 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
359 410
360static int inet_diag_get_exact(struct sk_buff *in_skb, 411static int inet_diag_get_exact(struct sk_buff *in_skb,
361 const struct nlmsghdr *nlh, 412 const struct nlmsghdr *nlh,
362 struct inet_diag_req_v2 *req) 413 const struct inet_diag_req_v2 *req)
363{ 414{
364 const struct inet_diag_handler *handler; 415 const struct inet_diag_handler *handler;
365 int err; 416 int err;
@@ -400,9 +451,8 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
400 return 1; 451 return 1;
401} 452}
402 453
403
404static int inet_diag_bc_run(const struct nlattr *_bc, 454static int inet_diag_bc_run(const struct nlattr *_bc,
405 const struct inet_diag_entry *entry) 455 const struct inet_diag_entry *entry)
406{ 456{
407 const void *bc = nla_data(_bc); 457 const void *bc = nla_data(_bc);
408 int len = nla_len(_bc); 458 int len = nla_len(_bc);
@@ -434,10 +484,10 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
434 break; 484 break;
435 case INET_DIAG_BC_S_COND: 485 case INET_DIAG_BC_S_COND:
436 case INET_DIAG_BC_D_COND: { 486 case INET_DIAG_BC_D_COND: {
437 struct inet_diag_hostcond *cond; 487 const struct inet_diag_hostcond *cond;
438 __be32 *addr; 488 const __be32 *addr;
439 489
440 cond = (struct inet_diag_hostcond *)(op + 1); 490 cond = (const struct inet_diag_hostcond *)(op + 1);
441 if (cond->port != -1 && 491 if (cond->port != -1 &&
442 cond->port != (op->code == INET_DIAG_BC_S_COND ? 492 cond->port != (op->code == INET_DIAG_BC_S_COND ?
443 entry->sport : entry->dport)) { 493 entry->sport : entry->dport)) {
@@ -486,29 +536,36 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
486 return len == 0; 536 return len == 0;
487} 537}
488 538
539/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
540 */
541static void entry_fill_addrs(struct inet_diag_entry *entry,
542 const struct sock *sk)
543{
544#if IS_ENABLED(CONFIG_IPV6)
545 if (sk->sk_family == AF_INET6) {
546 entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
547 entry->daddr = sk->sk_v6_daddr.s6_addr32;
548 } else
549#endif
550 {
551 entry->saddr = &sk->sk_rcv_saddr;
552 entry->daddr = &sk->sk_daddr;
553 }
554}
555
489int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) 556int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
490{ 557{
491 struct inet_diag_entry entry;
492 struct inet_sock *inet = inet_sk(sk); 558 struct inet_sock *inet = inet_sk(sk);
559 struct inet_diag_entry entry;
493 560
494 if (bc == NULL) 561 if (!bc)
495 return 1; 562 return 1;
496 563
497 entry.family = sk->sk_family; 564 entry.family = sk->sk_family;
498#if IS_ENABLED(CONFIG_IPV6) 565 entry_fill_addrs(&entry, sk);
499 if (entry.family == AF_INET6) {
500
501 entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
502 entry.daddr = sk->sk_v6_daddr.s6_addr32;
503 } else
504#endif
505 {
506 entry.saddr = &inet->inet_rcv_saddr;
507 entry.daddr = &inet->inet_daddr;
508 }
509 entry.sport = inet->inet_num; 566 entry.sport = inet->inet_num;
510 entry.dport = ntohs(inet->inet_dport); 567 entry.dport = ntohs(inet->inet_dport);
511 entry.userlocks = sk->sk_userlocks; 568 entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
512 569
513 return inet_diag_bc_run(bc, &entry); 570 return inet_diag_bc_run(bc, &entry);
514} 571}
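entry_fill_addrs() can serve all three socket flavours only because the v4 and v6 addresses are reachable at common offsets through struct sock. The same idea in stand-alone form, with a small tagged struct standing in for the socket (field names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct sock_min {
	int      family;           /* 4 or 6 */
	uint32_t rcv_saddr;        /* v4 source, valid for family == 4 */
	uint32_t daddr;            /* v4 destination */
	uint32_t v6_rcv_saddr[4];  /* v6 source, valid for family == 6 */
	uint32_t v6_daddr[4];      /* v6 destination */
};

struct entry { const uint32_t *saddr, *daddr; };

/* Pick the right address arrays once; the bytecode matcher then never
 * needs to know which flavour of socket it is looking at. */
static void entry_fill_addrs(struct entry *e, const struct sock_min *sk)
{
	if (sk->family == 6) {
		e->saddr = sk->v6_rcv_saddr;
		e->daddr = sk->v6_daddr;
	} else {
		e->saddr = &sk->rcv_saddr;
		e->daddr = &sk->daddr;
	}
}

int main(void)
{
	struct sock_min sk = { .family = 4, .rcv_saddr = 0x7f000001 };
	struct entry e;

	entry_fill_addrs(&e, &sk);
	printf("%08x\n", e.saddr[0]);
	return 0;
}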
@@ -535,8 +592,8 @@ static int valid_cc(const void *bc, int len, int cc)
535static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, 592static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
536 int *min_len) 593 int *min_len)
537{ 594{
538 int addr_len;
539 struct inet_diag_hostcond *cond; 595 struct inet_diag_hostcond *cond;
596 int addr_len;
540 597
541 /* Check hostcond space. */ 598 /* Check hostcond space. */
542 *min_len += sizeof(struct inet_diag_hostcond); 599 *min_len += sizeof(struct inet_diag_hostcond);
@@ -570,8 +627,8 @@ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
570} 627}
571 628
572/* Validate a port comparison operator. */ 629/* Validate a port comparison operator. */
573static inline bool valid_port_comparison(const struct inet_diag_bc_op *op, 630static bool valid_port_comparison(const struct inet_diag_bc_op *op,
574 int len, int *min_len) 631 int len, int *min_len)
575{ 632{
576 /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ 633 /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
577 *min_len += sizeof(struct inet_diag_bc_op); 634 *min_len += sizeof(struct inet_diag_bc_op);
@@ -586,10 +643,9 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
586 int len = bytecode_len; 643 int len = bytecode_len;
587 644
588 while (len > 0) { 645 while (len > 0) {
589 const struct inet_diag_bc_op *op = bc;
590 int min_len = sizeof(struct inet_diag_bc_op); 646 int min_len = sizeof(struct inet_diag_bc_op);
647 const struct inet_diag_bc_op *op = bc;
591 648
592//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
593 switch (op->code) { 649 switch (op->code) {
594 case INET_DIAG_BC_S_COND: 650 case INET_DIAG_BC_S_COND:
595 case INET_DIAG_BC_D_COND: 651 case INET_DIAG_BC_D_COND:
@@ -630,7 +686,7 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
630static int inet_csk_diag_dump(struct sock *sk, 686static int inet_csk_diag_dump(struct sock *sk,
631 struct sk_buff *skb, 687 struct sk_buff *skb,
632 struct netlink_callback *cb, 688 struct netlink_callback *cb,
633 struct inet_diag_req_v2 *r, 689 const struct inet_diag_req_v2 *r,
634 const struct nlattr *bc) 690 const struct nlattr *bc)
635{ 691{
636 if (!inet_diag_bc_sk(bc, sk)) 692 if (!inet_diag_bc_sk(bc, sk))
@@ -642,139 +698,42 @@ static int inet_csk_diag_dump(struct sock *sk,
642 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 698 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
643} 699}
644 700
645static int inet_twsk_diag_dump(struct sock *sk, 701static void twsk_build_assert(void)
646 struct sk_buff *skb,
647 struct netlink_callback *cb,
648 struct inet_diag_req_v2 *r,
649 const struct nlattr *bc)
650{ 702{
651 struct inet_timewait_sock *tw = inet_twsk(sk); 703 BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
652 704 offsetof(struct sock, sk_family));
653 if (bc != NULL) {
654 struct inet_diag_entry entry;
655 705
656 entry.family = tw->tw_family; 706 BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
657#if IS_ENABLED(CONFIG_IPV6) 707 offsetof(struct inet_sock, inet_num));
658 if (tw->tw_family == AF_INET6) {
659 entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
660 entry.daddr = tw->tw_v6_daddr.s6_addr32;
661 } else
662#endif
663 {
664 entry.saddr = &tw->tw_rcv_saddr;
665 entry.daddr = &tw->tw_daddr;
666 }
667 entry.sport = tw->tw_num;
668 entry.dport = ntohs(tw->tw_dport);
669 entry.userlocks = 0;
670 708
671 if (!inet_diag_bc_run(bc, &entry)) 709 BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
672 return 0; 710 offsetof(struct inet_sock, inet_dport));
673 }
674 711
675 return inet_twsk_diag_fill(tw, skb, r, 712 BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
676 NETLINK_CB(cb->skb).portid, 713 offsetof(struct inet_sock, inet_rcv_saddr));
677 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
678}
679 714
680/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses 715 BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
681 * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6. 716 offsetof(struct inet_sock, inet_daddr));
682 */
683static inline void inet_diag_req_addrs(const struct sock *sk,
684 const struct request_sock *req,
685 struct inet_diag_entry *entry)
686{
687 struct inet_request_sock *ireq = inet_rsk(req);
688 717
689#if IS_ENABLED(CONFIG_IPV6) 718#if IS_ENABLED(CONFIG_IPV6)
690 if (sk->sk_family == AF_INET6) { 719 BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
691 if (req->rsk_ops->family == AF_INET6) { 720 offsetof(struct sock, sk_v6_rcv_saddr));
692 entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
693 entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
694 } else if (req->rsk_ops->family == AF_INET) {
695 ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
696 &entry->saddr_storage);
697 ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
698 &entry->daddr_storage);
699 entry->saddr = entry->saddr_storage.s6_addr32;
700 entry->daddr = entry->daddr_storage.s6_addr32;
701 }
702 } else
703#endif
704 {
705 entry->saddr = &ireq->ir_loc_addr;
706 entry->daddr = &ireq->ir_rmt_addr;
707 }
708}
709 721
710static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, 722 BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
711 struct request_sock *req, 723 offsetof(struct sock, sk_v6_daddr));
712 struct user_namespace *user_ns,
713 u32 portid, u32 seq,
714 const struct nlmsghdr *unlh)
715{
716 const struct inet_request_sock *ireq = inet_rsk(req);
717 struct inet_sock *inet = inet_sk(sk);
718 struct inet_diag_msg *r;
719 struct nlmsghdr *nlh;
720 long tmo;
721
722 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
723 NLM_F_MULTI);
724 if (!nlh)
725 return -EMSGSIZE;
726
727 r = nlmsg_data(nlh);
728 r->idiag_family = sk->sk_family;
729 r->idiag_state = TCP_SYN_RECV;
730 r->idiag_timer = 1;
731 r->idiag_retrans = req->num_retrans;
732
733 r->id.idiag_if = sk->sk_bound_dev_if;
734 sock_diag_save_cookie(req, r->id.idiag_cookie);
735
736 tmo = req->expires - jiffies;
737 if (tmo < 0)
738 tmo = 0;
739
740 r->id.idiag_sport = inet->inet_sport;
741 r->id.idiag_dport = ireq->ir_rmt_port;
742
743 memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
744 memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
745
746 r->id.idiag_src[0] = ireq->ir_loc_addr;
747 r->id.idiag_dst[0] = ireq->ir_rmt_addr;
748
749 r->idiag_expires = jiffies_to_msecs(tmo);
750 r->idiag_rqueue = 0;
751 r->idiag_wqueue = 0;
752 r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
753 r->idiag_inode = 0;
754#if IS_ENABLED(CONFIG_IPV6)
755 if (r->idiag_family == AF_INET6) {
756 struct inet_diag_entry entry;
757 inet_diag_req_addrs(sk, req, &entry);
758 memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr));
759 memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr));
760 }
761#endif 724#endif
762
763 nlmsg_end(skb, nlh);
764 return 0;
765} 725}
766 726
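twsk_build_assert() is what makes the shortcut above safe: diag now reads timewait fields through struct sock accessors, so the shared members must sit at identical offsets in both structures, and the build must break if they ever drift apart. The same discipline in stand-alone C11, using _Static_assert (the struct layouts here are illustrative minima, not the kernel's):

#include <stddef.h>

struct sock_min          { short family; unsigned short num; };
struct timewait_sock_min { short family; unsigned short num; };

/* Compile-time proof that the aliased fields line up; compilation fails
 * the moment someone reorders one struct without the other. */
_Static_assert(offsetof(struct timewait_sock_min, family) ==
	       offsetof(struct sock_min, family),
	       "tw_family must alias sk_family");
_Static_assert(offsetof(struct timewait_sock_min, num) ==
	       offsetof(struct sock_min, num),
	       "tw_num must alias inet_num");

int main(void) { return 0; }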
767static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, 727static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
768 struct netlink_callback *cb, 728 struct netlink_callback *cb,
769 struct inet_diag_req_v2 *r, 729 const struct inet_diag_req_v2 *r,
770 const struct nlattr *bc) 730 const struct nlattr *bc)
771{ 731{
772 struct inet_diag_entry entry;
773 struct inet_connection_sock *icsk = inet_csk(sk); 732 struct inet_connection_sock *icsk = inet_csk(sk);
774 struct listen_sock *lopt;
775 struct inet_sock *inet = inet_sk(sk); 733 struct inet_sock *inet = inet_sk(sk);
776 int j, s_j; 734 struct inet_diag_entry entry;
777 int reqnum, s_reqnum; 735 int j, s_j, reqnum, s_reqnum;
736 struct listen_sock *lopt;
778 int err = 0; 737 int err = 0;
779 738
780 s_j = cb->args[3]; 739 s_j = cb->args[3];
@@ -785,13 +744,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
785 744
786 entry.family = sk->sk_family; 745 entry.family = sk->sk_family;
787 746
788 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 747 spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
789 748
790 lopt = icsk->icsk_accept_queue.listen_opt; 749 lopt = icsk->icsk_accept_queue.listen_opt;
791 if (!lopt || !lopt->qlen) 750 if (!lopt || !listen_sock_qlen(lopt))
792 goto out; 751 goto out;
793 752
794 if (bc != NULL) { 753 if (bc) {
795 entry.sport = inet->inet_num; 754 entry.sport = inet->inet_num;
796 entry.userlocks = sk->sk_userlocks; 755 entry.userlocks = sk->sk_userlocks;
797 } 756 }
@@ -810,17 +769,18 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
810 continue; 769 continue;
811 770
812 if (bc) { 771 if (bc) {
813 inet_diag_req_addrs(sk, req, &entry); 772 /* Note: entry.sport and entry.userlocks are already set */
773 entry_fill_addrs(&entry, req_to_sk(req));
814 entry.dport = ntohs(ireq->ir_rmt_port); 774 entry.dport = ntohs(ireq->ir_rmt_port);
815 775
816 if (!inet_diag_bc_run(bc, &entry)) 776 if (!inet_diag_bc_run(bc, &entry))
817 continue; 777 continue;
818 } 778 }
819 779
820 err = inet_diag_fill_req(skb, sk, req, 780 err = inet_req_diag_fill(req_to_sk(req), skb,
821 sk_user_ns(NETLINK_CB(cb->skb).sk), 781 NETLINK_CB(cb->skb).portid,
822 NETLINK_CB(cb->skb).portid, 782 cb->nlh->nlmsg_seq,
823 cb->nlh->nlmsg_seq, cb->nlh); 783 NLM_F_MULTI, cb->nlh);
824 if (err < 0) { 784 if (err < 0) {
825 cb->args[3] = j + 1; 785 cb->args[3] = j + 1;
826 cb->args[4] = reqnum; 786 cb->args[4] = reqnum;
@@ -832,17 +792,17 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
832 } 792 }
833 793
834out: 794out:
835 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 795 spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
836 796
837 return err; 797 return err;
838} 798}
839 799
840void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, 800void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
841 struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc) 801 struct netlink_callback *cb,
802 const struct inet_diag_req_v2 *r, struct nlattr *bc)
842{ 803{
843 int i, num;
844 int s_i, s_num;
845 struct net *net = sock_net(skb->sk); 804 struct net *net = sock_net(skb->sk);
805 int i, num, s_i, s_num;
846 806
847 s_i = cb->args[1]; 807 s_i = cb->args[1];
848 s_num = num = cb->args[2]; 808 s_num = num = cb->args[2];
@@ -852,9 +812,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
852 goto skip_listen_ht; 812 goto skip_listen_ht;
853 813
854 for (i = s_i; i < INET_LHTABLE_SIZE; i++) { 814 for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
855 struct sock *sk;
856 struct hlist_nulls_node *node;
857 struct inet_listen_hashbucket *ilb; 815 struct inet_listen_hashbucket *ilb;
816 struct hlist_nulls_node *node;
817 struct sock *sk;
858 818
859 num = 0; 819 num = 0;
860 ilb = &hashinfo->listening_hash[i]; 820 ilb = &hashinfo->listening_hash[i];
@@ -871,7 +831,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
871 } 831 }
872 832
873 if (r->sdiag_family != AF_UNSPEC && 833 if (r->sdiag_family != AF_UNSPEC &&
874 sk->sk_family != r->sdiag_family) 834 sk->sk_family != r->sdiag_family)
875 goto next_listen; 835 goto next_listen;
876 836
877 if (r->id.idiag_sport != inet->inet_sport && 837 if (r->id.idiag_sport != inet->inet_sport &&
@@ -919,8 +879,8 @@ skip_listen_ht:
919 for (i = s_i; i <= hashinfo->ehash_mask; i++) { 879 for (i = s_i; i <= hashinfo->ehash_mask; i++) {
920 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 880 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
921 spinlock_t *lock = inet_ehash_lockp(hashinfo, i); 881 spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
922 struct sock *sk;
923 struct hlist_nulls_node *node; 882 struct hlist_nulls_node *node;
883 struct sock *sk;
924 884
925 num = 0; 885 num = 0;
926 886
@@ -932,8 +892,7 @@ skip_listen_ht:
932 892
933 spin_lock_bh(lock); 893 spin_lock_bh(lock);
934 sk_nulls_for_each(sk, node, &head->chain) { 894 sk_nulls_for_each(sk, node, &head->chain) {
935 int res; 895 int state, res;
936 int state;
937 896
938 if (!net_eq(sock_net(sk), net)) 897 if (!net_eq(sock_net(sk), net))
939 continue; 898 continue;
@@ -952,10 +911,16 @@ skip_listen_ht:
952 if (r->id.idiag_dport != sk->sk_dport && 911 if (r->id.idiag_dport != sk->sk_dport &&
953 r->id.idiag_dport) 912 r->id.idiag_dport)
954 goto next_normal; 913 goto next_normal;
955 if (sk->sk_state == TCP_TIME_WAIT) 914 twsk_build_assert();
956 res = inet_twsk_diag_dump(sk, skb, cb, r, bc); 915
957 else 916 if (!inet_diag_bc_sk(bc, sk))
958 res = inet_csk_diag_dump(sk, skb, cb, r, bc); 917 goto next_normal;
918
919 res = sk_diag_fill(sk, skb, r,
920 sk_user_ns(NETLINK_CB(cb->skb).sk),
921 NETLINK_CB(cb->skb).portid,
922 cb->nlh->nlmsg_seq, NLM_F_MULTI,
923 cb->nlh);
959 if (res < 0) { 924 if (res < 0) {
960 spin_unlock_bh(lock); 925 spin_unlock_bh(lock);
961 goto done; 926 goto done;
@@ -976,7 +941,8 @@ out:
976EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); 941EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
977 942
978static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, 943static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
979 struct inet_diag_req_v2 *r, struct nlattr *bc) 944 const struct inet_diag_req_v2 *r,
945 struct nlattr *bc)
980{ 946{
981 const struct inet_diag_handler *handler; 947 const struct inet_diag_handler *handler;
982 int err = 0; 948 int err = 0;
@@ -993,8 +959,8 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
993 959
994static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) 960static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
995{ 961{
996 struct nlattr *bc = NULL;
997 int hdrlen = sizeof(struct inet_diag_req_v2); 962 int hdrlen = sizeof(struct inet_diag_req_v2);
963 struct nlattr *bc = NULL;
998 964
999 if (nlmsg_attrlen(cb->nlh, hdrlen)) 965 if (nlmsg_attrlen(cb->nlh, hdrlen))
1000 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); 966 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
@@ -1002,7 +968,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
1002 return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); 968 return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
1003} 969}
1004 970
1005static inline int inet_diag_type2proto(int type) 971static int inet_diag_type2proto(int type)
1006{ 972{
1007 switch (type) { 973 switch (type) {
1008 case TCPDIAG_GETSOCK: 974 case TCPDIAG_GETSOCK:
@@ -1014,12 +980,13 @@ static inline int inet_diag_type2proto(int type)
1014 } 980 }
1015} 981}
1016 982
1017static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) 983static int inet_diag_dump_compat(struct sk_buff *skb,
984 struct netlink_callback *cb)
1018{ 985{
1019 struct inet_diag_req *rc = nlmsg_data(cb->nlh); 986 struct inet_diag_req *rc = nlmsg_data(cb->nlh);
987 int hdrlen = sizeof(struct inet_diag_req);
1020 struct inet_diag_req_v2 req; 988 struct inet_diag_req_v2 req;
1021 struct nlattr *bc = NULL; 989 struct nlattr *bc = NULL;
1022 int hdrlen = sizeof(struct inet_diag_req);
1023 990
1024 req.sdiag_family = AF_UNSPEC; /* compatibility */ 991 req.sdiag_family = AF_UNSPEC; /* compatibility */
1025 req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); 992 req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
@@ -1034,7 +1001,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
1034} 1001}
1035 1002
1036static int inet_diag_get_exact_compat(struct sk_buff *in_skb, 1003static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
1037 const struct nlmsghdr *nlh) 1004 const struct nlmsghdr *nlh)
1038{ 1005{
1039 struct inet_diag_req *rc = nlmsg_data(nlh); 1006 struct inet_diag_req *rc = nlmsg_data(nlh);
1040 struct inet_diag_req_v2 req; 1007 struct inet_diag_req_v2 req;
@@ -1063,7 +1030,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
1063 1030
1064 attr = nlmsg_find_attr(nlh, hdrlen, 1031 attr = nlmsg_find_attr(nlh, hdrlen,
1065 INET_DIAG_REQ_BYTECODE); 1032 INET_DIAG_REQ_BYTECODE);
1066 if (attr == NULL || 1033 if (!attr ||
1067 nla_len(attr) < sizeof(struct inet_diag_bc_op) || 1034 nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
1068 inet_diag_bc_audit(nla_data(attr), nla_len(attr))) 1035 inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
1069 return -EINVAL; 1036 return -EINVAL;
@@ -1090,9 +1057,10 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
1090 if (h->nlmsg_flags & NLM_F_DUMP) { 1057 if (h->nlmsg_flags & NLM_F_DUMP) {
1091 if (nlmsg_attrlen(h, hdrlen)) { 1058 if (nlmsg_attrlen(h, hdrlen)) {
1092 struct nlattr *attr; 1059 struct nlattr *attr;
1060
1093 attr = nlmsg_find_attr(h, hdrlen, 1061 attr = nlmsg_find_attr(h, hdrlen,
1094 INET_DIAG_REQ_BYTECODE); 1062 INET_DIAG_REQ_BYTECODE);
1095 if (attr == NULL || 1063 if (!attr ||
1096 nla_len(attr) < sizeof(struct inet_diag_bc_op) || 1064 nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
1097 inet_diag_bc_audit(nla_data(attr), nla_len(attr))) 1065 inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
1098 return -EINVAL; 1066 return -EINVAL;
@@ -1128,7 +1096,7 @@ int inet_diag_register(const struct inet_diag_handler *h)
1128 1096
1129 mutex_lock(&inet_diag_table_mutex); 1097 mutex_lock(&inet_diag_table_mutex);
1130 err = -EEXIST; 1098 err = -EEXIST;
1131 if (inet_diag_table[type] == NULL) { 1099 if (!inet_diag_table[type]) {
1132 inet_diag_table[type] = h; 1100 inet_diag_table[type] = h;
1133 err = 0; 1101 err = 0;
1134 } 1102 }
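For reference, the request side of the interface patched above is reachable from user space via NETLINK_SOCK_DIAG; a minimal dump request against the TCP handler might look like this (error handling trimmed, reply parsing omitted):

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>

int main(void)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);

	if (fd < 0)
		return 1;

	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = sizeof(msg);
	msg.nlh.nlmsg_type = SOCK_DIAG_BY_FAMILY;
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; /* dump all matches */
	msg.req.sdiag_family = AF_INET;
	msg.req.sdiag_protocol = IPPROTO_TCP;
	msg.req.idiag_states = ~0U;            /* all states, incl. TIME_WAIT */
	msg.req.idiag_ext = 1 << (INET_DIAG_INFO - 1); /* ask for tcp_info */

	if (sendto(fd, &msg, sizeof(msg), 0,
		   (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
		return 1;
	/* replies arrive as an NLM_F_MULTI stream of inet_diag_msg records */
	close(fd);
	return 0;
}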
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index e7920352646a..5e346a082e5f 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -385,7 +385,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
385 } 385 }
386 386
387 q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); 387 q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
388 if (q == NULL) 388 if (!q)
389 return NULL; 389 return NULL;
390 390
391 q->net = nf; 391 q->net = nf;
@@ -406,7 +406,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
406 struct inet_frag_queue *q; 406 struct inet_frag_queue *q;
407 407
408 q = inet_frag_alloc(nf, f, arg); 408 q = inet_frag_alloc(nf, f, arg);
409 if (q == NULL) 409 if (!q)
410 return NULL; 410 return NULL;
411 411
412 return inet_frag_intern(nf, q, f, arg); 412 return inet_frag_intern(nf, q, f, arg);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 9111a4e22155..c6fb80bd5826 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -24,9 +24,9 @@
24#include <net/secure_seq.h> 24#include <net/secure_seq.h>
25#include <net/ip.h> 25#include <net/ip.h>
26 26
27static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, 27static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
28 const __u16 lport, const __be32 faddr, 28 const __u16 lport, const __be32 faddr,
29 const __be16 fport) 29 const __be16 fport)
30{ 30{
31 static u32 inet_ehash_secret __read_mostly; 31 static u32 inet_ehash_secret __read_mostly;
32 32
@@ -36,17 +36,21 @@ static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
36 inet_ehash_secret + net_hash_mix(net)); 36 inet_ehash_secret + net_hash_mix(net));
37} 37}
38 38
39 39/* This function handles inet_sock, but also timewait and request sockets
40static unsigned int inet_sk_ehashfn(const struct sock *sk) 40 * for IPv4/IPv6.
41 */
42u32 sk_ehashfn(const struct sock *sk)
41{ 43{
42 const struct inet_sock *inet = inet_sk(sk); 44#if IS_ENABLED(CONFIG_IPV6)
43 const __be32 laddr = inet->inet_rcv_saddr; 45 if (sk->sk_family == AF_INET6 &&
44 const __u16 lport = inet->inet_num; 46 !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
45 const __be32 faddr = inet->inet_daddr; 47 return inet6_ehashfn(sock_net(sk),
46 const __be16 fport = inet->inet_dport; 48 &sk->sk_v6_rcv_saddr, sk->sk_num,
47 struct net *net = sock_net(sk); 49 &sk->sk_v6_daddr, sk->sk_dport);
48 50#endif
49 return inet_ehashfn(net, laddr, lport, faddr, fport); 51 return inet_ehashfn(sock_net(sk),
52 sk->sk_rcv_saddr, sk->sk_num,
53 sk->sk_daddr, sk->sk_dport);
50} 54}
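The point of the unified sk_ehashfn() above is that a single 4-tuple hash now serves full, timewait and request sockets, with v4-mapped v6 peers deliberately routed through the IPv4 path so both representations land in the same chain. A toy stand-in for the hash itself (the kernel uses jhash plus a boot-time secret; this mixer is purely illustrative):

#include <stdint.h>
#include <stdio.h>

static uint32_t mix(uint32_t h, uint32_t v)
{
	h ^= v;
	h *= 0x9e3779b1u;            /* any decent avalanche step will do */
	return h ^ (h >> 16);
}

static uint32_t ehashfn(uint32_t laddr, uint16_t lport,
			uint32_t faddr, uint16_t fport, uint32_t secret)
{
	uint32_t h = secret;         /* per-boot secret keeps chains unpredictable */

	h = mix(h, laddr);
	h = mix(h, ((uint32_t)lport << 16) | fport);
	return mix(h, faddr);
}

int main(void)
{
	/* same tuple -> same bucket, whichever socket flavour computed it */
	printf("%08x\n", ehashfn(0x7f000001, 8080, 0x0a000001, 43210, 42));
	return 0;
}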
51 55
52/* 56/*
@@ -60,8 +64,8 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
60{ 64{
61 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 65 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
62 66
63 if (tb != NULL) { 67 if (tb) {
64 write_pnet(&tb->ib_net, hold_net(net)); 68 write_pnet(&tb->ib_net, net);
65 tb->port = snum; 69 tb->port = snum;
66 tb->fastreuse = 0; 70 tb->fastreuse = 0;
67 tb->fastreuseport = 0; 71 tb->fastreuseport = 0;
@@ -79,7 +83,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
79{ 83{
80 if (hlist_empty(&tb->owners)) { 84 if (hlist_empty(&tb->owners)) {
81 __hlist_del(&tb->node); 85 __hlist_del(&tb->node);
82 release_net(ib_net(tb));
83 kmem_cache_free(cachep, tb); 86 kmem_cache_free(cachep, tb);
84 } 87 }
85} 88}
@@ -263,11 +266,19 @@ void sock_gen_put(struct sock *sk)
263 266
264 if (sk->sk_state == TCP_TIME_WAIT) 267 if (sk->sk_state == TCP_TIME_WAIT)
265 inet_twsk_free(inet_twsk(sk)); 268 inet_twsk_free(inet_twsk(sk));
269 else if (sk->sk_state == TCP_NEW_SYN_RECV)
270 reqsk_free(inet_reqsk(sk));
266 else 271 else
267 sk_free(sk); 272 sk_free(sk);
268} 273}
269EXPORT_SYMBOL_GPL(sock_gen_put); 274EXPORT_SYMBOL_GPL(sock_gen_put);
270 275
276void sock_edemux(struct sk_buff *skb)
277{
278 sock_gen_put(skb->sk);
279}
280EXPORT_SYMBOL(sock_edemux);
281
271struct sock *__inet_lookup_established(struct net *net, 282struct sock *__inet_lookup_established(struct net *net,
272 struct inet_hashinfo *hashinfo, 283 struct inet_hashinfo *hashinfo,
273 const __be32 saddr, const __be16 sport, 284 const __be32 saddr, const __be16 sport,
@@ -377,7 +388,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
377 *twp = tw; 388 *twp = tw;
378 } else if (tw) { 389 } else if (tw) {
379 /* Silly. Should hash-dance instead... */ 390 /* Silly. Should hash-dance instead... */
380 inet_twsk_deschedule(tw, death_row); 391 inet_twsk_deschedule(tw);
381 392
382 inet_twsk_put(tw); 393 inet_twsk_put(tw);
383 } 394 }
@@ -400,13 +411,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
400{ 411{
401 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 412 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
402 struct hlist_nulls_head *list; 413 struct hlist_nulls_head *list;
403 spinlock_t *lock;
404 struct inet_ehash_bucket *head; 414 struct inet_ehash_bucket *head;
415 spinlock_t *lock;
405 int twrefcnt = 0; 416 int twrefcnt = 0;
406 417
407 WARN_ON(!sk_unhashed(sk)); 418 WARN_ON(!sk_unhashed(sk));
408 419
409 sk->sk_hash = inet_sk_ehashfn(sk); 420 sk->sk_hash = sk_ehashfn(sk);
410 head = inet_ehash_bucket(hashinfo, sk->sk_hash); 421 head = inet_ehash_bucket(hashinfo, sk->sk_hash);
411 list = &head->chain; 422 list = &head->chain;
412 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 423 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
@@ -423,15 +434,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
423} 434}
424EXPORT_SYMBOL_GPL(__inet_hash_nolisten); 435EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
425 436
426static void __inet_hash(struct sock *sk) 437int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
427{ 438{
428 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 439 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
429 struct inet_listen_hashbucket *ilb; 440 struct inet_listen_hashbucket *ilb;
430 441
431 if (sk->sk_state != TCP_LISTEN) { 442 if (sk->sk_state != TCP_LISTEN)
432 __inet_hash_nolisten(sk, NULL); 443 return __inet_hash_nolisten(sk, tw);
433 return;
434 }
435 444
436 WARN_ON(!sk_unhashed(sk)); 445 WARN_ON(!sk_unhashed(sk));
437 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 446 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -440,13 +449,15 @@ static void __inet_hash(struct sock *sk)
440 __sk_nulls_add_node_rcu(sk, &ilb->head); 449 __sk_nulls_add_node_rcu(sk, &ilb->head);
441 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 450 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
442 spin_unlock(&ilb->lock); 451 spin_unlock(&ilb->lock);
452 return 0;
443} 453}
454EXPORT_SYMBOL(__inet_hash);
444 455
445void inet_hash(struct sock *sk) 456void inet_hash(struct sock *sk)
446{ 457{
447 if (sk->sk_state != TCP_CLOSE) { 458 if (sk->sk_state != TCP_CLOSE) {
448 local_bh_disable(); 459 local_bh_disable();
449 __inet_hash(sk); 460 __inet_hash(sk, NULL);
450 local_bh_enable(); 461 local_bh_enable();
451 } 462 }
452} 463}
@@ -477,8 +488,7 @@ EXPORT_SYMBOL_GPL(inet_unhash);
477int __inet_hash_connect(struct inet_timewait_death_row *death_row, 488int __inet_hash_connect(struct inet_timewait_death_row *death_row,
478 struct sock *sk, u32 port_offset, 489 struct sock *sk, u32 port_offset,
479 int (*check_established)(struct inet_timewait_death_row *, 490 int (*check_established)(struct inet_timewait_death_row *,
480 struct sock *, __u16, struct inet_timewait_sock **), 491 struct sock *, __u16, struct inet_timewait_sock **))
481 int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
482{ 492{
483 struct inet_hashinfo *hinfo = death_row->hashinfo; 493 struct inet_hashinfo *hinfo = death_row->hashinfo;
484 const unsigned short snum = inet_sk(sk)->inet_num; 494 const unsigned short snum = inet_sk(sk)->inet_num;
@@ -548,14 +558,14 @@ ok:
548 inet_bind_hash(sk, tb, port); 558 inet_bind_hash(sk, tb, port);
549 if (sk_unhashed(sk)) { 559 if (sk_unhashed(sk)) {
550 inet_sk(sk)->inet_sport = htons(port); 560 inet_sk(sk)->inet_sport = htons(port);
551 twrefcnt += hash(sk, tw); 561 twrefcnt += __inet_hash_nolisten(sk, tw);
552 } 562 }
553 if (tw) 563 if (tw)
554 twrefcnt += inet_twsk_bind_unhash(tw, hinfo); 564 twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
555 spin_unlock(&head->lock); 565 spin_unlock(&head->lock);
556 566
557 if (tw) { 567 if (tw) {
558 inet_twsk_deschedule(tw, death_row); 568 inet_twsk_deschedule(tw);
559 while (twrefcnt) { 569 while (twrefcnt) {
560 twrefcnt--; 570 twrefcnt--;
561 inet_twsk_put(tw); 571 inet_twsk_put(tw);
@@ -570,7 +580,7 @@ ok:
570 tb = inet_csk(sk)->icsk_bind_hash; 580 tb = inet_csk(sk)->icsk_bind_hash;
571 spin_lock_bh(&head->lock); 581 spin_lock_bh(&head->lock);
572 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 582 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
573 hash(sk, NULL); 583 __inet_hash_nolisten(sk, NULL);
574 spin_unlock_bh(&head->lock); 584 spin_unlock_bh(&head->lock);
575 return 0; 585 return 0;
576 } else { 586 } else {
@@ -590,7 +600,7 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
590 struct sock *sk) 600 struct sock *sk)
591{ 601{
592 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), 602 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
593 __inet_check_established, __inet_hash_nolisten); 603 __inet_check_established);
594} 604}
595EXPORT_SYMBOL_GPL(inet_hash_connect); 605EXPORT_SYMBOL_GPL(inet_hash_connect);
596 606
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 6d592f8555fb..00ec8d5d7e7e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -67,9 +67,9 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
67} 67}
68 68
69/* Must be called with locally disabled BHs. */ 69/* Must be called with locally disabled BHs. */
70static void __inet_twsk_kill(struct inet_timewait_sock *tw, 70static void inet_twsk_kill(struct inet_timewait_sock *tw)
71 struct inet_hashinfo *hashinfo)
72{ 71{
72 struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
73 struct inet_bind_hashbucket *bhead; 73 struct inet_bind_hashbucket *bhead;
74 int refcnt; 74 int refcnt;
75 /* Unlink from established hashes. */ 75 /* Unlink from established hashes. */
@@ -89,6 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
89 89
90 BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); 90 BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
91 atomic_sub(refcnt, &tw->tw_refcnt); 91 atomic_sub(refcnt, &tw->tw_refcnt);
92 atomic_dec(&tw->tw_dr->tw_count);
93 inet_twsk_put(tw);
92} 94}
93 95
94void inet_twsk_free(struct inet_timewait_sock *tw) 96void inet_twsk_free(struct inet_timewait_sock *tw)
@@ -98,7 +100,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw)
98#ifdef SOCK_REFCNT_DEBUG 100#ifdef SOCK_REFCNT_DEBUG
99 pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); 101 pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
100#endif 102#endif
101 release_net(twsk_net(tw));
102 kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); 103 kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
103 module_put(owner); 104 module_put(owner);
104} 105}
@@ -169,16 +170,34 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
169} 170}
170EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); 171EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
171 172
172struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) 173void tw_timer_handler(unsigned long data)
173{ 174{
174 struct inet_timewait_sock *tw = 175 struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
175 kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, 176
176 GFP_ATOMIC); 177 if (tw->tw_kill)
177 if (tw != NULL) { 178 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
179 else
180 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
181 inet_twsk_kill(tw);
182}
183
184struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
185 struct inet_timewait_death_row *dr,
186 const int state)
187{
188 struct inet_timewait_sock *tw;
189
190 if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
191 return NULL;
192
193 tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
194 GFP_ATOMIC);
195 if (tw) {
178 const struct inet_sock *inet = inet_sk(sk); 196 const struct inet_sock *inet = inet_sk(sk);
179 197
180 kmemcheck_annotate_bitfield(tw, flags); 198 kmemcheck_annotate_bitfield(tw, flags);
181 199
200 tw->tw_dr = dr;
182 /* Give us an identity. */ 201 /* Give us an identity. */
183 tw->tw_daddr = inet->inet_daddr; 202 tw->tw_daddr = inet->inet_daddr;
184 tw->tw_rcv_saddr = inet->inet_rcv_saddr; 203 tw->tw_rcv_saddr = inet->inet_rcv_saddr;
@@ -195,14 +214,16 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
195 tw->tw_ipv6only = 0; 214 tw->tw_ipv6only = 0;
196 tw->tw_transparent = inet->transparent; 215 tw->tw_transparent = inet->transparent;
197 tw->tw_prot = sk->sk_prot_creator; 216 tw->tw_prot = sk->sk_prot_creator;
198 twsk_net_set(tw, hold_net(sock_net(sk))); 217 atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
218 twsk_net_set(tw, sock_net(sk));
219 setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
199 /* 220 /*
200 * Because we use RCU lookups, we should not set tw_refcnt 221 * Because we use RCU lookups, we should not set tw_refcnt
201 * to a non null value before everything is setup for this 222 * to a non null value before everything is setup for this
202 * timewait socket. 223 * timewait socket.
203 */ 224 */
204 atomic_set(&tw->tw_refcnt, 0); 225 atomic_set(&tw->tw_refcnt, 0);
205 inet_twsk_dead_node_init(tw); 226
206 __module_get(tw->tw_prot->owner); 227 __module_get(tw->tw_prot->owner);
207 } 228 }
208 229
@@ -210,139 +231,20 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
210} 231}
211EXPORT_SYMBOL_GPL(inet_twsk_alloc); 232EXPORT_SYMBOL_GPL(inet_twsk_alloc);
212 233
213/* Returns non-zero if quota exceeded. */
214static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
215 const int slot)
216{
217 struct inet_timewait_sock *tw;
218 unsigned int killed;
219 int ret;
220
221 /* NOTE: compare this to previous version where lock
222 * was released after detaching chain. It was racy,
223 * because tw buckets are scheduled in not serialized context
224 * in 2.3 (with netfilter), and with softnet it is common, because
225 * soft irqs are not sequenced.
226 */
227 killed = 0;
228 ret = 0;
229rescan:
230 inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
231 __inet_twsk_del_dead_node(tw);
232 spin_unlock(&twdr->death_lock);
233 __inet_twsk_kill(tw, twdr->hashinfo);
234#ifdef CONFIG_NET_NS
235 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
236#endif
237 inet_twsk_put(tw);
238 killed++;
239 spin_lock(&twdr->death_lock);
240 if (killed > INET_TWDR_TWKILL_QUOTA) {
241 ret = 1;
242 break;
243 }
244
245 /* While we dropped twdr->death_lock, another cpu may have
246 * killed off the next TW bucket in the list, therefore
247 * do a fresh re-read of the hlist head node with the
248 * lock reacquired. We still use the hlist traversal
249 * macro in order to get the prefetches.
250 */
251 goto rescan;
252 }
253
254 twdr->tw_count -= killed;
255#ifndef CONFIG_NET_NS
256 NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
257#endif
258 return ret;
259}
260
261void inet_twdr_hangman(unsigned long data)
262{
263 struct inet_timewait_death_row *twdr;
264 unsigned int need_timer;
265
266 twdr = (struct inet_timewait_death_row *)data;
267 spin_lock(&twdr->death_lock);
268
269 if (twdr->tw_count == 0)
270 goto out;
271
272 need_timer = 0;
273 if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
274 twdr->thread_slots |= (1 << twdr->slot);
275 schedule_work(&twdr->twkill_work);
276 need_timer = 1;
277 } else {
278 /* We purged the entire slot, anything left? */
279 if (twdr->tw_count)
280 need_timer = 1;
281 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
282 }
283 if (need_timer)
284 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
285out:
286 spin_unlock(&twdr->death_lock);
287}
288EXPORT_SYMBOL_GPL(inet_twdr_hangman);
289
290void inet_twdr_twkill_work(struct work_struct *work)
291{
292 struct inet_timewait_death_row *twdr =
293 container_of(work, struct inet_timewait_death_row, twkill_work);
294 int i;
295
296 BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
297 (sizeof(twdr->thread_slots) * 8));
298
299 while (twdr->thread_slots) {
300 spin_lock_bh(&twdr->death_lock);
301 for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
302 if (!(twdr->thread_slots & (1 << i)))
303 continue;
304
305 while (inet_twdr_do_twkill_work(twdr, i) != 0) {
306 if (need_resched()) {
307 spin_unlock_bh(&twdr->death_lock);
308 schedule();
309 spin_lock_bh(&twdr->death_lock);
310 }
311 }
312
313 twdr->thread_slots &= ~(1 << i);
314 }
315 spin_unlock_bh(&twdr->death_lock);
316 }
317}
318EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
319
320/* These are always called from BH context. See callers in 234/* These are always called from BH context. See callers in
321 * tcp_input.c to verify this. 235 * tcp_input.c to verify this.
322 */ 236 */
323 237
324/* This is for handling early-kills of TIME_WAIT sockets. */ 238/* This is for handling early-kills of TIME_WAIT sockets. */
325void inet_twsk_deschedule(struct inet_timewait_sock *tw, 239void inet_twsk_deschedule(struct inet_timewait_sock *tw)
326 struct inet_timewait_death_row *twdr)
327{ 240{
328 spin_lock(&twdr->death_lock); 241 if (del_timer_sync(&tw->tw_timer))
329 if (inet_twsk_del_dead_node(tw)) { 242 inet_twsk_kill(tw);
330 inet_twsk_put(tw);
331 if (--twdr->tw_count == 0)
332 del_timer(&twdr->tw_timer);
333 }
334 spin_unlock(&twdr->death_lock);
335 __inet_twsk_kill(tw, twdr->hashinfo);
336} 243}
337EXPORT_SYMBOL(inet_twsk_deschedule); 244EXPORT_SYMBOL(inet_twsk_deschedule);
338 245
339void inet_twsk_schedule(struct inet_timewait_sock *tw, 246void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
340 struct inet_timewait_death_row *twdr,
341 const int timeo, const int timewait_len)
342{ 247{
343 struct hlist_head *list;
344 int slot;
345
346 /* timeout := RTO * 3.5 248 /* timeout := RTO * 3.5
347 * 249 *
348 * 3.5 = 1+2+0.5 to wait for two retransmits. 250 * 3.5 = 1+2+0.5 to wait for two retransmits.
@@ -367,115 +269,15 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
367 * is greater than TS tick!) and detect old duplicates with help 269 * is greater than TS tick!) and detect old duplicates with help
368 * of PAWS. 270 * of PAWS.
369 */ 271 */
370 slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
371 272
372 spin_lock(&twdr->death_lock); 273 tw->tw_kill = timeo <= 4*HZ;
373 274 if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) {
374 /* Unlink it, if it was scheduled */
375 if (inet_twsk_del_dead_node(tw))
376 twdr->tw_count--;
377 else
378 atomic_inc(&tw->tw_refcnt); 275 atomic_inc(&tw->tw_refcnt);
379 276 atomic_inc(&tw->tw_dr->tw_count);
380 if (slot >= INET_TWDR_RECYCLE_SLOTS) {
381 /* Schedule to slow timer */
382 if (timeo >= timewait_len) {
383 slot = INET_TWDR_TWKILL_SLOTS - 1;
384 } else {
385 slot = DIV_ROUND_UP(timeo, twdr->period);
386 if (slot >= INET_TWDR_TWKILL_SLOTS)
387 slot = INET_TWDR_TWKILL_SLOTS - 1;
388 }
389 tw->tw_ttd = inet_tw_time_stamp() + timeo;
390 slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
391 list = &twdr->cells[slot];
392 } else {
393 tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
394
395 if (twdr->twcal_hand < 0) {
396 twdr->twcal_hand = 0;
397 twdr->twcal_jiffie = jiffies;
398 twdr->twcal_timer.expires = twdr->twcal_jiffie +
399 (slot << INET_TWDR_RECYCLE_TICK);
400 add_timer(&twdr->twcal_timer);
401 } else {
402 if (time_after(twdr->twcal_timer.expires,
403 jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
404 mod_timer(&twdr->twcal_timer,
405 jiffies + (slot << INET_TWDR_RECYCLE_TICK));
406 slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
407 }
408 list = &twdr->twcal_row[slot];
409 } 277 }
410
411 hlist_add_head(&tw->tw_death_node, list);
412
413 if (twdr->tw_count++ == 0)
414 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
415 spin_unlock(&twdr->death_lock);
416} 278}
417EXPORT_SYMBOL_GPL(inet_twsk_schedule); 279EXPORT_SYMBOL_GPL(inet_twsk_schedule);
418 280
419void inet_twdr_twcal_tick(unsigned long data)
420{
421 struct inet_timewait_death_row *twdr;
422 int n, slot;
423 unsigned long j;
424 unsigned long now = jiffies;
425 int killed = 0;
426 int adv = 0;
427
428 twdr = (struct inet_timewait_death_row *)data;
429
430 spin_lock(&twdr->death_lock);
431 if (twdr->twcal_hand < 0)
432 goto out;
433
434 slot = twdr->twcal_hand;
435 j = twdr->twcal_jiffie;
436
437 for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
438 if (time_before_eq(j, now)) {
439 struct hlist_node *safe;
440 struct inet_timewait_sock *tw;
441
442 inet_twsk_for_each_inmate_safe(tw, safe,
443 &twdr->twcal_row[slot]) {
444 __inet_twsk_del_dead_node(tw);
445 __inet_twsk_kill(tw, twdr->hashinfo);
446#ifdef CONFIG_NET_NS
447 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
448#endif
449 inet_twsk_put(tw);
450 killed++;
451 }
452 } else {
453 if (!adv) {
454 adv = 1;
455 twdr->twcal_jiffie = j;
456 twdr->twcal_hand = slot;
457 }
458
459 if (!hlist_empty(&twdr->twcal_row[slot])) {
460 mod_timer(&twdr->twcal_timer, j);
461 goto out;
462 }
463 }
464 j += 1 << INET_TWDR_RECYCLE_TICK;
465 slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
466 }
467 twdr->twcal_hand = -1;
468
469out:
470 if ((twdr->tw_count -= killed) == 0)
471 del_timer(&twdr->tw_timer);
472#ifndef CONFIG_NET_NS
473 NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
474#endif
475 spin_unlock(&twdr->death_lock);
476}
477EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
478
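All of the slot arithmetic deleted above collapses into mod_timer_pinned() plus two counters, with one subtlety worth spelling out: the extra reference and the tw_count bump are taken only when the timer was not already pending, i.e. exactly once per armed timer, so a re-schedule can never leak a reference. A reduced model of that invariant:

#include <assert.h>
#include <stdbool.h>

struct twsk { int refcnt; bool pending; };

/* Returns true if the timer was already pending, mirroring mod_timer(). */
static bool mod_timer(struct twsk *tw)
{
	bool was_pending = tw->pending;

	tw->pending = true;
	return was_pending;
}

static void twsk_schedule(struct twsk *tw)
{
	if (!mod_timer(tw))
		tw->refcnt++;    /* first arming only: one ref per live timer */
}

int main(void)
{
	struct twsk tw = { .refcnt = 1 };  /* creator's reference */

	twsk_schedule(&tw);
	twsk_schedule(&tw);                /* re-arm: must not take another ref */
	assert(tw.refcnt == 2);
	return 0;
}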
479void inet_twsk_purge(struct inet_hashinfo *hashinfo, 281void inet_twsk_purge(struct inet_hashinfo *hashinfo,
480 struct inet_timewait_death_row *twdr, int family) 282 struct inet_timewait_death_row *twdr, int family)
481{ 283{
@@ -487,6 +289,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
487 for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { 289 for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
488 struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 290 struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
489restart_rcu: 291restart_rcu:
292 cond_resched();
490 rcu_read_lock(); 293 rcu_read_lock();
491restart: 294restart:
492 sk_nulls_for_each_rcu(sk, node, &head->chain) { 295 sk_nulls_for_each_rcu(sk, node, &head->chain) {
@@ -508,7 +311,7 @@ restart:
508 311
509 rcu_read_unlock(); 312 rcu_read_unlock();
510 local_bh_disable(); 313 local_bh_disable();
511 inet_twsk_deschedule(tw, twdr); 314 inet_twsk_deschedule(tw);
512 local_bh_enable(); 315 local_bh_enable();
513 inet_twsk_put(tw); 316 inet_twsk_put(tw);
514 goto restart_rcu; 317 goto restart_rcu;
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 787b3c294ce6..3674484946a5 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -57,7 +57,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
57} 57}
58 58
59 59
60static int ip_forward_finish(struct sk_buff *skb) 60static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
61{ 61{
62 struct ip_options *opt = &(IPCB(skb)->opt); 62 struct ip_options *opt = &(IPCB(skb)->opt);
63 63
@@ -67,7 +67,8 @@ static int ip_forward_finish(struct sk_buff *skb)
67 if (unlikely(opt->optlen)) 67 if (unlikely(opt->optlen))
68 ip_forward_options(skb); 68 ip_forward_options(skb);
69 69
70 return dst_output(skb); 70 skb_sender_cpu_clear(skb);
71 return dst_output_sk(sk, skb);
71} 72}
72 73
73int ip_forward(struct sk_buff *skb) 74int ip_forward(struct sk_buff *skb)
@@ -81,6 +82,9 @@ int ip_forward(struct sk_buff *skb)
81 if (skb->pkt_type != PACKET_HOST) 82 if (skb->pkt_type != PACKET_HOST)
82 goto drop; 83 goto drop;
83 84
85 if (unlikely(skb->sk))
86 goto drop;
87
84 if (skb_warn_if_lro(skb)) 88 if (skb_warn_if_lro(skb))
85 goto drop; 89 goto drop;
86 90
@@ -135,8 +139,8 @@ int ip_forward(struct sk_buff *skb)
135 139
136 skb->priority = rt_tos2priority(iph->tos); 140 skb->priority = rt_tos2priority(iph->tos);
137 141
138 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, 142 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
139 rt->dst.dev, ip_forward_finish); 143 skb->dev, rt->dst.dev, ip_forward_finish);
140 144
141sr_failed: 145sr_failed:
142 /* 146 /*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e5b6d0ddcb58..cc1da6d9cb35 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -372,7 +372,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
372 goto err; 372 goto err;
373 373
374 err = -ENOMEM; 374 err = -ENOMEM;
375 if (pskb_pull(skb, ihl) == NULL) 375 if (!pskb_pull(skb, ihl))
376 goto err; 376 goto err;
377 377
378 err = pskb_trim_rcsum(skb, end - offset); 378 err = pskb_trim_rcsum(skb, end - offset);
@@ -537,7 +537,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
537 qp->q.fragments = head; 537 qp->q.fragments = head;
538 } 538 }
539 539
540 WARN_ON(head == NULL); 540 WARN_ON(!head);
541 WARN_ON(FRAG_CB(head)->offset != 0); 541 WARN_ON(FRAG_CB(head)->offset != 0);
542 542
543 /* Allocate a new buffer for the datagram. */ 543 /* Allocate a new buffer for the datagram. */
@@ -559,7 +559,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
559 struct sk_buff *clone; 559 struct sk_buff *clone;
560 int i, plen = 0; 560 int i, plen = 0;
561 561
562 if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) 562 clone = alloc_skb(0, GFP_ATOMIC);
563 if (!clone)
563 goto out_nomem; 564 goto out_nomem;
564 clone->next = head->next; 565 clone->next = head->next;
565 head->next = clone; 566 head->next = clone;
@@ -638,7 +639,8 @@ int ip_defrag(struct sk_buff *skb, u32 user)
638 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); 639 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
639 640
640 /* Lookup (or create) queue header */ 641 /* Lookup (or create) queue header */
641 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { 642 qp = ip_find(net, ip_hdr(skb), user);
643 if (qp) {
642 int ret; 644 int ret;
643 645
644 spin_lock(&qp->q.lock); 646 spin_lock(&qp->q.lock);
@@ -659,27 +661,30 @@ EXPORT_SYMBOL(ip_defrag);
659struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) 661struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
660{ 662{
661 struct iphdr iph; 663 struct iphdr iph;
664 int netoff;
662 u32 len; 665 u32 len;
663 666
664 if (skb->protocol != htons(ETH_P_IP)) 667 if (skb->protocol != htons(ETH_P_IP))
665 return skb; 668 return skb;
666 669
667 if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) 670 netoff = skb_network_offset(skb);
671
672 if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
668 return skb; 673 return skb;
669 674
670 if (iph.ihl < 5 || iph.version != 4) 675 if (iph.ihl < 5 || iph.version != 4)
671 return skb; 676 return skb;
672 677
673 len = ntohs(iph.tot_len); 678 len = ntohs(iph.tot_len);
674 if (skb->len < len || len < (iph.ihl * 4)) 679 if (skb->len < netoff + len || len < (iph.ihl * 4))
675 return skb; 680 return skb;
676 681
677 if (ip_is_fragment(&iph)) { 682 if (ip_is_fragment(&iph)) {
678 skb = skb_share_check(skb, GFP_ATOMIC); 683 skb = skb_share_check(skb, GFP_ATOMIC);
679 if (skb) { 684 if (skb) {
680 if (!pskb_may_pull(skb, iph.ihl*4)) 685 if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
681 return skb; 686 return skb;
682 if (pskb_trim_rcsum(skb, len)) 687 if (pskb_trim_rcsum(skb, netoff + len))
683 return skb; 688 return skb;
684 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); 689 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
685 if (ip_defrag(skb, user)) 690 if (ip_defrag(skb, user))
@@ -751,7 +756,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
751 table = ip4_frags_ns_ctl_table; 756 table = ip4_frags_ns_ctl_table;
752 if (!net_eq(net, &init_net)) { 757 if (!net_eq(net, &init_net)) {
753 table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); 758 table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
754 if (table == NULL) 759 if (!table)
755 goto err_alloc; 760 goto err_alloc;
756 761
757 table[0].data = &net->ipv4.frags.high_thresh; 762 table[0].data = &net->ipv4.frags.high_thresh;
@@ -767,7 +772,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
767 } 772 }
768 773
769 hdr = register_net_sysctl(net, "net/ipv4", table); 774 hdr = register_net_sysctl(net, "net/ipv4", table);
770 if (hdr == NULL) 775 if (!hdr)
771 goto err_reg; 776 goto err_reg;
772 777
773 net->ipv4.frags_hdr = hdr; 778 net->ipv4.frags_hdr = hdr;
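The netoff change above matters because callers such as packet sockets can hand ip_check_defrag() a buffer whose IP header does not start at offset zero; every length check therefore has to be taken relative to the network offset. The corrected arithmetic, reduced to plain buffers (names illustrative):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define IPH_MIN 20  /* minimal IPv4 header */

struct buf { size_t len; size_t netoff; };

/* Does the frame hold a complete datagram of tot_len bytes at netoff? */
static bool frame_holds_datagram(const struct buf *b,
				 size_t tot_len, unsigned ihl_words)
{
	if (b->len < b->netoff + IPH_MIN)
		return false;                   /* header itself out of bounds */
	if (tot_len < ihl_words * 4)
		return false;                   /* bogus: options exceed total */
	return b->len >= b->netoff + tot_len;   /* note: offset included */
}

int main(void)
{
	struct buf b = { .len = 60, .netoff = 14 };  /* e.g. Ethernet header */

	/* 50-byte datagram fits at offset 0, not after a 14-byte link header */
	printf("%d\n", frame_holds_datagram(&b, 50, 5));  /* prints 0 */
	b.netoff = 0;
	printf("%d\n", frame_holds_datagram(&b, 50, 5));  /* prints 1 */
	return 0;
}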
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 6207275fc749..5fd706473c73 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -182,7 +182,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
182 t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, 182 t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
183 iph->daddr, iph->saddr, tpi->key); 183 iph->daddr, iph->saddr, tpi->key);
184 184
185 if (t == NULL) 185 if (!t)
186 return PACKET_REJECT; 186 return PACKET_REJECT;
187 187
188 if (t->parms.iph.daddr == 0 || 188 if (t->parms.iph.daddr == 0 ||
@@ -423,7 +423,7 @@ static int ipgre_open(struct net_device *dev)
423 return -EADDRNOTAVAIL; 423 return -EADDRNOTAVAIL;
424 dev = rt->dst.dev; 424 dev = rt->dst.dev;
425 ip_rt_put(rt); 425 ip_rt_put(rt);
426 if (__in_dev_get_rtnl(dev) == NULL) 426 if (!__in_dev_get_rtnl(dev))
427 return -EADDRNOTAVAIL; 427 return -EADDRNOTAVAIL;
428 t->mlink = dev->ifindex; 428 t->mlink = dev->ifindex;
429 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); 429 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
@@ -456,6 +456,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
456 .ndo_do_ioctl = ipgre_tunnel_ioctl, 456 .ndo_do_ioctl = ipgre_tunnel_ioctl,
457 .ndo_change_mtu = ip_tunnel_change_mtu, 457 .ndo_change_mtu = ip_tunnel_change_mtu,
458 .ndo_get_stats64 = ip_tunnel_get_stats64, 458 .ndo_get_stats64 = ip_tunnel_get_stats64,
459 .ndo_get_iflink = ip_tunnel_get_iflink,
459}; 460};
460 461
461#define GRE_FEATURES (NETIF_F_SG | \ 462#define GRE_FEATURES (NETIF_F_SG | \
@@ -621,10 +622,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
621 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); 622 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
622 623
623 if (data[IFLA_GRE_LOCAL]) 624 if (data[IFLA_GRE_LOCAL])
624 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]); 625 parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
625 626
626 if (data[IFLA_GRE_REMOTE]) 627 if (data[IFLA_GRE_REMOTE])
627 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]); 628 parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
628 629
629 if (data[IFLA_GRE_TTL]) 630 if (data[IFLA_GRE_TTL])
630 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); 631 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
@@ -686,6 +687,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {
686 .ndo_validate_addr = eth_validate_addr, 687 .ndo_validate_addr = eth_validate_addr,
687 .ndo_change_mtu = ip_tunnel_change_mtu, 688 .ndo_change_mtu = ip_tunnel_change_mtu,
688 .ndo_get_stats64 = ip_tunnel_get_stats64, 689 .ndo_get_stats64 = ip_tunnel_get_stats64,
690 .ndo_get_iflink = ip_tunnel_get_iflink,
689}; 691};
690 692
691static void ipgre_tap_setup(struct net_device *dev) 693static void ipgre_tap_setup(struct net_device *dev)
@@ -776,8 +778,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
776 nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || 778 nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
777 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || 779 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
778 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || 780 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
779 nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || 781 nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
780 nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || 782 nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
781 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || 783 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
782 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || 784 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
783 nla_put_u8(skb, IFLA_GRE_PMTUDISC, 785 nla_put_u8(skb, IFLA_GRE_PMTUDISC,
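
ip_gre also picks up the typed netlink helpers for IPv4 addresses. They are thin wrappers over the raw be32 accessors (include/net/netlink.h), so the wire format is unchanged; sketched roughly:

	static inline __be32 nla_get_in_addr(const struct nlattr *nla)
	{
		return *(__be32 *)nla_data(nla);
	}

	static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype,
					  __be32 addr)
	{
		return nla_put_be32(skb, attrtype, addr);
	}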
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3d4da2c16b6a..2db4c8773c1b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -187,7 +187,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
187 return false; 187 return false;
188} 188}
189 189
190static int ip_local_deliver_finish(struct sk_buff *skb) 190static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
191{ 191{
192 struct net *net = dev_net(skb->dev); 192 struct net *net = dev_net(skb->dev);
193 193
@@ -203,7 +203,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
203 raw = raw_local_deliver(skb, protocol); 203 raw = raw_local_deliver(skb, protocol);
204 204
205 ipprot = rcu_dereference(inet_protos[protocol]); 205 ipprot = rcu_dereference(inet_protos[protocol]);
206 if (ipprot != NULL) { 206 if (ipprot) {
207 int ret; 207 int ret;
208 208
209 if (!ipprot->no_policy) { 209 if (!ipprot->no_policy) {
@@ -253,7 +253,8 @@ int ip_local_deliver(struct sk_buff *skb)
253 return 0; 253 return 0;
254 } 254 }
255 255
256 return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL, 256 return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb,
257 skb->dev, NULL,
257 ip_local_deliver_finish); 258 ip_local_deliver_finish);
258} 259}
259 260
@@ -309,12 +310,12 @@ drop:
309int sysctl_ip_early_demux __read_mostly = 1; 310int sysctl_ip_early_demux __read_mostly = 1;
310EXPORT_SYMBOL(sysctl_ip_early_demux); 311EXPORT_SYMBOL(sysctl_ip_early_demux);
311 312
312static int ip_rcv_finish(struct sk_buff *skb) 313static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
313{ 314{
314 const struct iphdr *iph = ip_hdr(skb); 315 const struct iphdr *iph = ip_hdr(skb);
315 struct rtable *rt; 316 struct rtable *rt;
316 317
317 if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { 318 if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
318 const struct net_protocol *ipprot; 319 const struct net_protocol *ipprot;
319 int protocol = iph->protocol; 320 int protocol = iph->protocol;
320 321
@@ -387,7 +388,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
387 388
388 IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); 389 IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
389 390
390 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { 391 skb = skb_share_check(skb, GFP_ATOMIC);
392 if (!skb) {
391 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); 393 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
392 goto out; 394 goto out;
393 } 395 }
@@ -450,7 +452,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
450 /* Must drop socket now because of tproxy. */ 452 /* Must drop socket now because of tproxy. */
451 skb_orphan(skb); 453 skb_orphan(skb);
452 454
453 return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL, 455 return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
456 dev, NULL,
454 ip_rcv_finish); 457 ip_rcv_finish);
455 458
456csum_error: 459csum_error:
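
The ip_input.c conversion shows the shape of the wider netfilter change in this series: NF_HOOK() gains a struct sock * argument just before the skb, and every ok-function grows a matching first parameter. Schematically:

	/* okfn before:  int (*okfn)(struct sk_buff *);
	 * okfn now:     int (*okfn)(struct sock *, struct sk_buff *);
	 *
	 * a call site passes the (possibly NULL) originating socket: */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
		       dev, NULL, ip_rcv_finish);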
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 5b3d91be2db0..bd246792360b 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -264,7 +264,7 @@ int ip_options_compile(struct net *net,
264 unsigned char *iph; 264 unsigned char *iph;
265 int optlen, l; 265 int optlen, l;
266 266
267 if (skb != NULL) { 267 if (skb) {
268 rt = skb_rtable(skb); 268 rt = skb_rtable(skb);
269 optptr = (unsigned char *)&(ip_hdr(skb)[1]); 269 optptr = (unsigned char *)&(ip_hdr(skb)[1]);
270 } else 270 } else
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d68199d9b2b0..c65b93a7b711 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -91,14 +91,19 @@ void ip_send_check(struct iphdr *iph)
91} 91}
92EXPORT_SYMBOL(ip_send_check); 92EXPORT_SYMBOL(ip_send_check);
93 93
94int __ip_local_out(struct sk_buff *skb) 94int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
95{ 95{
96 struct iphdr *iph = ip_hdr(skb); 96 struct iphdr *iph = ip_hdr(skb);
97 97
98 iph->tot_len = htons(skb->len); 98 iph->tot_len = htons(skb->len);
99 ip_send_check(iph); 99 ip_send_check(iph);
100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, 100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
101 skb_dst(skb)->dev, dst_output); 101 skb_dst(skb)->dev, dst_output_sk);
102}
103
104int __ip_local_out(struct sk_buff *skb)
105{
106 return __ip_local_out_sk(skb->sk, skb);
102} 107}
103 108
104int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) 109int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
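
__ip_local_out() is split so a socket-aware variant exists for the new hook signature; the old entry point simply forwards skb->sk. The dst_output_sk() it now targets is, roughly (sketch of include/net/dst.h in this era):

	static inline int dst_output_sk(struct sock *sk, struct sk_buff *skb)
	{
		return skb_dst(skb)->output(sk, skb);
	}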
@@ -148,7 +153,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); 153 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149 iph->saddr = saddr; 154 iph->saddr = saddr;
150 iph->protocol = sk->sk_protocol; 155 iph->protocol = sk->sk_protocol;
151 ip_select_ident(skb, sk); 156 ip_select_ident(sock_net(sk), skb, sk);
152 157
153 if (opt && opt->opt.optlen) { 158 if (opt && opt->opt.optlen) {
154 iph->ihl += opt->opt.optlen>>2; 159 iph->ihl += opt->opt.optlen>>2;
@@ -163,7 +168,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
163} 168}
164EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); 169EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
165 170
166static inline int ip_finish_output2(struct sk_buff *skb) 171static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
167{ 172{
168 struct dst_entry *dst = skb_dst(skb); 173 struct dst_entry *dst = skb_dst(skb);
169 struct rtable *rt = (struct rtable *)dst; 174 struct rtable *rt = (struct rtable *)dst;
@@ -182,7 +187,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
182 struct sk_buff *skb2; 187 struct sk_buff *skb2;
183 188
184 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 189 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
185 if (skb2 == NULL) { 190 if (!skb2) {
186 kfree_skb(skb); 191 kfree_skb(skb);
187 return -ENOMEM; 192 return -ENOMEM;
188 } 193 }
@@ -211,7 +216,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
211 return -EINVAL; 216 return -EINVAL;
212} 217}
213 218
214static int ip_finish_output_gso(struct sk_buff *skb) 219static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
215{ 220{
216 netdev_features_t features; 221 netdev_features_t features;
217 struct sk_buff *segs; 222 struct sk_buff *segs;
@@ -220,7 +225,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
220 /* common case: locally created skb or seglen is <= mtu */ 225 /* common case: locally created skb or seglen is <= mtu */
221 if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || 226 if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
222 skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) 227 skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
223 return ip_finish_output2(skb); 228 return ip_finish_output2(sk, skb);
224 229
225 /* Slowpath - GSO segment length is exceeding the dst MTU. 230 /* Slowpath - GSO segment length is exceeding the dst MTU.
226 * 231 *
@@ -243,7 +248,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
243 int err; 248 int err;
244 249
245 segs->next = NULL; 250 segs->next = NULL;
246 err = ip_fragment(segs, ip_finish_output2); 251 err = ip_fragment(sk, segs, ip_finish_output2);
247 252
248 if (err && ret == 0) 253 if (err && ret == 0)
249 ret = err; 254 ret = err;
@@ -253,22 +258,22 @@ static int ip_finish_output_gso(struct sk_buff *skb)
253 return ret; 258 return ret;
254} 259}
255 260
256static int ip_finish_output(struct sk_buff *skb) 261static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
257{ 262{
258#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 263#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
259 /* Policy lookup after SNAT yielded a new policy */ 264 /* Policy lookup after SNAT yielded a new policy */
260 if (skb_dst(skb)->xfrm != NULL) { 265 if (skb_dst(skb)->xfrm) {
261 IPCB(skb)->flags |= IPSKB_REROUTED; 266 IPCB(skb)->flags |= IPSKB_REROUTED;
262 return dst_output(skb); 267 return dst_output_sk(sk, skb);
263 } 268 }
264#endif 269#endif
265 if (skb_is_gso(skb)) 270 if (skb_is_gso(skb))
266 return ip_finish_output_gso(skb); 271 return ip_finish_output_gso(sk, skb);
267 272
268 if (skb->len > ip_skb_dst_mtu(skb)) 273 if (skb->len > ip_skb_dst_mtu(skb))
269 return ip_fragment(skb, ip_finish_output2); 274 return ip_fragment(sk, skb, ip_finish_output2);
270 275
271 return ip_finish_output2(skb); 276 return ip_finish_output2(sk, skb);
272} 277}
273 278
274int ip_mc_output(struct sock *sk, struct sk_buff *skb) 279int ip_mc_output(struct sock *sk, struct sk_buff *skb)
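
With ip_finish_output(), ip_finish_output_gso() and ip_finish_output2() all taking the socket, ip_fragment() can thread it through to the per-fragment output callback, so fragments keep the context of the original packet. The new callback shape, for reference:

	/* every fragment is emitted through the same socket-aware callback */
	int ip_fragment(struct sock *sk, struct sk_buff *skb,
			int (*output)(struct sock *, struct sk_buff *));

	/* e.g. from the GSO slow path: */
	err = ip_fragment(sk, segs, ip_finish_output2);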
@@ -307,7 +312,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
307 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 312 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
308 if (newskb) 313 if (newskb)
309 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 314 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
310 newskb, NULL, newskb->dev, 315 sk, newskb, NULL, newskb->dev,
311 dev_loopback_xmit); 316 dev_loopback_xmit);
312 } 317 }
313 318
@@ -322,11 +327,11 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
322 if (rt->rt_flags&RTCF_BROADCAST) { 327 if (rt->rt_flags&RTCF_BROADCAST) {
323 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 328 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
324 if (newskb) 329 if (newskb)
325 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, 330 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
326 NULL, newskb->dev, dev_loopback_xmit); 331 NULL, newskb->dev, dev_loopback_xmit);
327 } 332 }
328 333
329 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, 334 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
330 skb->dev, ip_finish_output, 335 skb->dev, ip_finish_output,
331 !(IPCB(skb)->flags & IPSKB_REROUTED)); 336 !(IPCB(skb)->flags & IPSKB_REROUTED));
332} 337}
@@ -340,7 +345,8 @@ int ip_output(struct sock *sk, struct sk_buff *skb)
340 skb->dev = dev; 345 skb->dev = dev;
341 skb->protocol = htons(ETH_P_IP); 346 skb->protocol = htons(ETH_P_IP);
342 347
343 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev, 348 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
349 NULL, dev,
344 ip_finish_output, 350 ip_finish_output,
345 !(IPCB(skb)->flags & IPSKB_REROUTED)); 351 !(IPCB(skb)->flags & IPSKB_REROUTED));
346} 352}
@@ -376,12 +382,12 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
376 inet_opt = rcu_dereference(inet->inet_opt); 382 inet_opt = rcu_dereference(inet->inet_opt);
377 fl4 = &fl->u.ip4; 383 fl4 = &fl->u.ip4;
378 rt = skb_rtable(skb); 384 rt = skb_rtable(skb);
379 if (rt != NULL) 385 if (rt)
380 goto packet_routed; 386 goto packet_routed;
381 387
382 /* Make sure we can route this packet. */ 388 /* Make sure we can route this packet. */
383 rt = (struct rtable *)__sk_dst_check(sk, 0); 389 rt = (struct rtable *)__sk_dst_check(sk, 0);
384 if (rt == NULL) { 390 if (!rt) {
385 __be32 daddr; 391 __be32 daddr;
386 392
387 /* Use correct destination address if we have options. */ 393 /* Use correct destination address if we have options. */
@@ -430,7 +436,8 @@ packet_routed:
430 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); 436 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
431 } 437 }
432 438
433 ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); 439 ip_select_ident_segs(sock_net(sk), skb, sk,
440 skb_shinfo(skb)->gso_segs ?: 1);
434 441
435 /* TODO : should we use skb->sk here instead of sk ? */ 442 /* TODO : should we use skb->sk here instead of sk ? */
436 skb->priority = sk->sk_priority; 443 skb->priority = sk->sk_priority;
@@ -448,7 +455,6 @@ no_route:
448} 455}
449EXPORT_SYMBOL(ip_queue_xmit); 456EXPORT_SYMBOL(ip_queue_xmit);
450 457
451
452static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) 458static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
453{ 459{
454 to->pkt_type = from->pkt_type; 460 to->pkt_type = from->pkt_type;
@@ -479,7 +485,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
479 * single device frame, and queue such a frame for sending. 485 * single device frame, and queue such a frame for sending.
480 */ 486 */
481 487
482int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 488int ip_fragment(struct sock *sk, struct sk_buff *skb,
489 int (*output)(struct sock *, struct sk_buff *))
483{ 490{
484 struct iphdr *iph; 491 struct iphdr *iph;
485 int ptr; 492 int ptr;
@@ -586,13 +593,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
586 ip_options_fragment(frag); 593 ip_options_fragment(frag);
587 offset += skb->len - hlen; 594 offset += skb->len - hlen;
588 iph->frag_off = htons(offset>>3); 595 iph->frag_off = htons(offset>>3);
589 if (frag->next != NULL) 596 if (frag->next)
590 iph->frag_off |= htons(IP_MF); 597 iph->frag_off |= htons(IP_MF);
591 /* Ready, complete checksum */ 598 /* Ready, complete checksum */
592 ip_send_check(iph); 599 ip_send_check(iph);
593 } 600 }
594 601
595 err = output(skb); 602 err = output(sk, skb);
596 603
597 if (!err) 604 if (!err)
598 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 605 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
@@ -636,10 +643,7 @@ slow_path:
636 left = skb->len - hlen; /* Space per frame */ 643 left = skb->len - hlen; /* Space per frame */
637 ptr = hlen; /* Where to start from */ 644 ptr = hlen; /* Where to start from */
638 645
639 /* for bridged IP traffic encapsulated inside f.e. a vlan header, 646 ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
640 * we need to make room for the encapsulating header
641 */
642 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
643 647
644 /* 648 /*
645 * Fragment the datagram. 649 * Fragment the datagram.
@@ -732,7 +736,7 @@ slow_path:
732 736
733 ip_send_check(iph); 737 ip_send_check(iph);
734 738
735 err = output(skb2); 739 err = output(sk, skb2);
736 if (err) 740 if (err)
737 goto fail; 741 goto fail;
738 742
@@ -792,12 +796,13 @@ static inline int ip_ufo_append_data(struct sock *sk,
792 * device, so create one single skb packet containing complete 796 * device, so create one single skb packet containing complete
793 * udp datagram 797 * udp datagram
794 */ 798 */
795 if ((skb = skb_peek_tail(queue)) == NULL) { 799 skb = skb_peek_tail(queue);
800 if (!skb) {
796 skb = sock_alloc_send_skb(sk, 801 skb = sock_alloc_send_skb(sk,
797 hh_len + fragheaderlen + transhdrlen + 20, 802 hh_len + fragheaderlen + transhdrlen + 20,
798 (flags & MSG_DONTWAIT), &err); 803 (flags & MSG_DONTWAIT), &err);
799 804
800 if (skb == NULL) 805 if (!skb)
801 return err; 806 return err;
802 807
803 /* reserve space for Hardware header */ 808 /* reserve space for Hardware header */
@@ -814,7 +819,6 @@ static inline int ip_ufo_append_data(struct sock *sk,
814 819
815 skb->csum = 0; 820 skb->csum = 0;
816 821
817
818 __skb_queue_tail(queue, skb); 822 __skb_queue_tail(queue, skb);
819 } else if (skb_is_gso(skb)) { 823 } else if (skb_is_gso(skb)) {
820 goto append; 824 goto append;
@@ -888,7 +892,8 @@ static int __ip_append_data(struct sock *sk,
888 cork->length += length; 892 cork->length += length;
889 if (((length > mtu) || (skb && skb_is_gso(skb))) && 893 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
890 (sk->sk_protocol == IPPROTO_UDP) && 894 (sk->sk_protocol == IPPROTO_UDP) &&
891 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { 895 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
896 (sk->sk_type == SOCK_DGRAM)) {
892 err = ip_ufo_append_data(sk, queue, getfrag, from, length, 897 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
893 hh_len, fragheaderlen, transhdrlen, 898 hh_len, fragheaderlen, transhdrlen,
894 maxfraglen, flags); 899 maxfraglen, flags);
@@ -962,10 +967,10 @@ alloc_new_skb:
962 skb = sock_wmalloc(sk, 967 skb = sock_wmalloc(sk,
963 alloclen + hh_len + 15, 1, 968 alloclen + hh_len + 15, 1,
964 sk->sk_allocation); 969 sk->sk_allocation);
965 if (unlikely(skb == NULL)) 970 if (unlikely(!skb))
966 err = -ENOBUFS; 971 err = -ENOBUFS;
967 } 972 }
968 if (skb == NULL) 973 if (!skb)
969 goto error; 974 goto error;
970 975
971 /* 976 /*
@@ -1089,10 +1094,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1089 */ 1094 */
1090 opt = ipc->opt; 1095 opt = ipc->opt;
1091 if (opt) { 1096 if (opt) {
1092 if (cork->opt == NULL) { 1097 if (!cork->opt) {
1093 cork->opt = kmalloc(sizeof(struct ip_options) + 40, 1098 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1094 sk->sk_allocation); 1099 sk->sk_allocation);
1095 if (unlikely(cork->opt == NULL)) 1100 if (unlikely(!cork->opt))
1096 return -ENOBUFS; 1101 return -ENOBUFS;
1097 } 1102 }
1098 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); 1103 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
@@ -1199,7 +1204,8 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1199 return -EMSGSIZE; 1204 return -EMSGSIZE;
1200 } 1205 }
1201 1206
1202 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 1207 skb = skb_peek_tail(&sk->sk_write_queue);
1208 if (!skb)
1203 return -EINVAL; 1209 return -EINVAL;
1204 1210
1205 cork->length += size; 1211 cork->length += size;
@@ -1210,7 +1216,6 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1210 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 1216 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1211 } 1217 }
1212 1218
1213
1214 while (size > 0) { 1219 while (size > 0) {
1215 int i; 1220 int i;
1216 1221
@@ -1330,7 +1335,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1330 __be16 df = 0; 1335 __be16 df = 0;
1331 __u8 ttl; 1336 __u8 ttl;
1332 1337
1333 if ((skb = __skb_dequeue(queue)) == NULL) 1338 skb = __skb_dequeue(queue);
1339 if (!skb)
1334 goto out; 1340 goto out;
1335 tail_skb = &(skb_shinfo(skb)->frag_list); 1341 tail_skb = &(skb_shinfo(skb)->frag_list);
1336 1342
@@ -1381,7 +1387,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1381 iph->ttl = ttl; 1387 iph->ttl = ttl;
1382 iph->protocol = sk->sk_protocol; 1388 iph->protocol = sk->sk_protocol;
1383 ip_copy_addrs(iph, fl4); 1389 ip_copy_addrs(iph, fl4);
1384 ip_select_ident(skb, sk); 1390 ip_select_ident(net, skb, sk);
1385 1391
1386 if (opt) { 1392 if (opt) {
1387 iph->ihl += opt->optlen>>2; 1393 iph->ihl += opt->optlen>>2;
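
A second theme in ip_output.c: IP identification is now generated per network namespace, so every ident helper takes an explicit struct net. The three call shapes used in this series:

	ip_select_ident(sock_net(sk), skb, sk);			/* simple case */
	ip_select_ident_segs(sock_net(sk), skb, sk,
			     skb_shinfo(skb)->gso_segs ?: 1);	/* GSO-aware */
	__ip_select_ident(dev_net(rt->dst.dev), iph, segs);	/* no socket, e.g. tunnels */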
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 31d8c71986b4..7cfb0893f263 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -351,7 +351,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
351 return 0; 351 return 0;
352 } 352 }
353 } 353 }
354 if (new_ra == NULL) { 354 if (!new_ra) {
355 spin_unlock_bh(&ip_ra_lock); 355 spin_unlock_bh(&ip_ra_lock);
356 return -ENOBUFS; 356 return -ENOBUFS;
357 } 357 }
@@ -387,7 +387,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
387 skb_network_header(skb); 387 skb_network_header(skb);
388 serr->port = port; 388 serr->port = port;
389 389
390 if (skb_pull(skb, payload - skb->data) != NULL) { 390 if (skb_pull(skb, payload - skb->data)) {
391 skb_reset_transport_header(skb); 391 skb_reset_transport_header(skb);
392 if (sock_queue_err_skb(sk, skb) == 0) 392 if (sock_queue_err_skb(sk, skb) == 0)
393 return; 393 return;
@@ -432,17 +432,32 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
432 kfree_skb(skb); 432 kfree_skb(skb);
433} 433}
434 434
435static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, 435/* IPv4 supports cmsg on all icmp errors and some timestamps
436 const struct sk_buff *skb, 436 *
437 int ee_origin) 437 * Timestamp code paths do not initialize the fields expected by cmsg:
438 * the PKTINFO fields in skb->cb[]. Fill those in here.
439 */
440static bool ipv4_datagram_support_cmsg(const struct sock *sk,
441 struct sk_buff *skb,
442 int ee_origin)
438{ 443{
439 struct in_pktinfo *info = PKTINFO_SKB_CB(skb); 444 struct in_pktinfo *info;
440 445
441 if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || 446 if (ee_origin == SO_EE_ORIGIN_ICMP)
442 (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || 447 return true;
448
449 if (ee_origin == SO_EE_ORIGIN_LOCAL)
450 return false;
451
452 /* Support IP_PKTINFO on tstamp packets if requested, to correlate
453 * timestamp with egress dev. Not possible for packets without dev
454 * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
455 */
456 if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
443 (!skb->dev)) 457 (!skb->dev))
444 return false; 458 return false;
445 459
460 info = PKTINFO_SKB_CB(skb);
446 info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; 461 info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
447 info->ipi_ifindex = skb->dev->ifindex; 462 info->ipi_ifindex = skb->dev->ifindex;
448 return true; 463 return true;
@@ -467,7 +482,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
467 482
468 err = -EAGAIN; 483 err = -EAGAIN;
469 skb = sock_dequeue_err_skb(sk); 484 skb = sock_dequeue_err_skb(sk);
470 if (skb == NULL) 485 if (!skb)
471 goto out; 486 goto out;
472 487
473 copied = skb->len; 488 copied = skb->len;
@@ -483,7 +498,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
483 498
484 serr = SKB_EXT_ERR(skb); 499 serr = SKB_EXT_ERR(skb);
485 500
486 if (sin && skb->len) { 501 if (sin && serr->port) {
487 sin->sin_family = AF_INET; 502 sin->sin_family = AF_INET;
488 sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + 503 sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
489 serr->addr_offset); 504 serr->addr_offset);
@@ -496,9 +511,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
496 sin = &errhdr.offender; 511 sin = &errhdr.offender;
497 memset(sin, 0, sizeof(*sin)); 512 memset(sin, 0, sizeof(*sin));
498 513
499 if (skb->len && 514 if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
500 (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
501 ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) {
502 sin->sin_family = AF_INET; 515 sin->sin_family = AF_INET;
503 sin->sin_addr.s_addr = ip_hdr(skb)->saddr; 516 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
504 if (inet_sk(sk)->cmsg_flags) 517 if (inet_sk(sk)->cmsg_flags)
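
The rewritten check above means the offender sockaddr and PKTINFO cmsg are now reported for all ICMP-originated errors, and for timestamps when SOF_TIMESTAMPING_OPT_CMSG is set, keyed on serr->port rather than on a possibly empty payload. For reference, a minimal userspace sketch that drains such an error queue (assumes IP_RECVERR was enabled on fd):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/errqueue.h>

	static void drain_errqueue(int fd)
	{
		char cbuf[512], data[256];
		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
		struct msghdr msg = {
			.msg_iov = &iov, .msg_iovlen = 1,
			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
		};
		struct cmsghdr *cm;

		if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
			return;

		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
			if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) {
				struct sock_extended_err *ee =
					(struct sock_extended_err *)CMSG_DATA(cm);
				/* ee->ee_origin is SO_EE_ORIGIN_ICMP for ICMP
				 * errors, SO_EE_ORIGIN_TIMESTAMPING for tx
				 * timestamps */
				(void)ee;
			}
		}
	}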
@@ -523,12 +536,34 @@ out:
523 * Socket option code for IP. This is the end of the line after any 536 * Socket option code for IP. This is the end of the line after any
524 * TCP,UDP etc options on an IP socket. 537 * TCP,UDP etc options on an IP socket.
525 */ 538 */
539static bool setsockopt_needs_rtnl(int optname)
540{
541 switch (optname) {
542 case IP_ADD_MEMBERSHIP:
543 case IP_ADD_SOURCE_MEMBERSHIP:
544 case IP_BLOCK_SOURCE:
545 case IP_DROP_MEMBERSHIP:
546 case IP_DROP_SOURCE_MEMBERSHIP:
547 case IP_MSFILTER:
548 case IP_UNBLOCK_SOURCE:
549 case MCAST_BLOCK_SOURCE:
550 case MCAST_MSFILTER:
551 case MCAST_JOIN_GROUP:
552 case MCAST_JOIN_SOURCE_GROUP:
553 case MCAST_LEAVE_GROUP:
554 case MCAST_LEAVE_SOURCE_GROUP:
555 case MCAST_UNBLOCK_SOURCE:
556 return true;
557 }
558 return false;
559}
526 560
527static int do_ip_setsockopt(struct sock *sk, int level, 561static int do_ip_setsockopt(struct sock *sk, int level,
528 int optname, char __user *optval, unsigned int optlen) 562 int optname, char __user *optval, unsigned int optlen)
529{ 563{
530 struct inet_sock *inet = inet_sk(sk); 564 struct inet_sock *inet = inet_sk(sk);
531 int val = 0, err; 565 int val = 0, err;
566 bool needs_rtnl = setsockopt_needs_rtnl(optname);
532 567
533 switch (optname) { 568 switch (optname) {
534 case IP_PKTINFO: 569 case IP_PKTINFO:
@@ -571,6 +606,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
571 return ip_mroute_setsockopt(sk, optname, optval, optlen); 606 return ip_mroute_setsockopt(sk, optname, optval, optlen);
572 607
573 err = 0; 608 err = 0;
609 if (needs_rtnl)
610 rtnl_lock();
574 lock_sock(sk); 611 lock_sock(sk);
575 612
576 switch (optname) { 613 switch (optname) {
@@ -1105,10 +1142,14 @@ mc_msf_out:
1105 break; 1142 break;
1106 } 1143 }
1107 release_sock(sk); 1144 release_sock(sk);
1145 if (needs_rtnl)
1146 rtnl_unlock();
1108 return err; 1147 return err;
1109 1148
1110e_inval: 1149e_inval:
1111 release_sock(sk); 1150 release_sock(sk);
1151 if (needs_rtnl)
1152 rtnl_unlock();
1112 return -EINVAL; 1153 return -EINVAL;
1113} 1154}
1114 1155
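
The motivation for setsockopt_needs_rtnl(): the multicast options listed there end up manipulating per-device state (igmp group lists), which must happen under RTNL, and RTNL must be acquired before the socket lock to match the rest of the stack. The resulting bracket, schematically:

	if (needs_rtnl)
		rtnl_lock();	/* always before lock_sock() */
	lock_sock(sk);

	/* ... option handling, may call into igmp code ... */

	release_sock(sk);
	if (needs_rtnl)
		rtnl_unlock();	/* reverse order on the way out */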
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 2cd08280c77b..4c2c3ba4ba65 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -389,7 +389,6 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
389 hlen = tdev->hard_header_len + tdev->needed_headroom; 389 hlen = tdev->hard_header_len + tdev->needed_headroom;
390 mtu = tdev->mtu; 390 mtu = tdev->mtu;
391 } 391 }
392 dev->iflink = tunnel->parms.link;
393 392
394 dev->needed_headroom = t_hlen + hlen; 393 dev->needed_headroom = t_hlen + hlen;
395 mtu -= (dev->hard_header_len + t_hlen); 394 mtu -= (dev->hard_header_len + t_hlen);
@@ -655,7 +654,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
655 if (dst == 0) { 654 if (dst == 0) {
656 /* NBMA tunnel */ 655 /* NBMA tunnel */
657 656
658 if (skb_dst(skb) == NULL) { 657 if (!skb_dst(skb)) {
659 dev->stats.tx_fifo_errors++; 658 dev->stats.tx_fifo_errors++;
660 goto tx_error; 659 goto tx_error;
661 } 660 }
@@ -673,7 +672,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
673 672
674 neigh = dst_neigh_lookup(skb_dst(skb), 673 neigh = dst_neigh_lookup(skb_dst(skb),
675 &ipv6_hdr(skb)->daddr); 674 &ipv6_hdr(skb)->daddr);
676 if (neigh == NULL) 675 if (!neigh)
677 goto tx_error; 676 goto tx_error;
678 677
679 addr6 = (const struct in6_addr *)&neigh->primary_key; 678 addr6 = (const struct in6_addr *)&neigh->primary_key;
@@ -783,7 +782,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
783 return; 782 return;
784 } 783 }
785 784
786 err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, 785 err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
787 tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); 786 tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
788 iptunnel_xmit_stats(err, &dev->stats, dev->tstats); 787 iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
789 788
@@ -844,7 +843,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
844 case SIOCGETTUNNEL: 843 case SIOCGETTUNNEL:
845 if (dev == itn->fb_tunnel_dev) { 844 if (dev == itn->fb_tunnel_dev) {
846 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); 845 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
847 if (t == NULL) 846 if (!t)
848 t = netdev_priv(dev); 847 t = netdev_priv(dev);
849 } 848 }
850 memcpy(p, &t->parms, sizeof(*p)); 849 memcpy(p, &t->parms, sizeof(*p));
@@ -877,7 +876,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
877 break; 876 break;
878 } 877 }
879 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { 878 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
880 if (t != NULL) { 879 if (t) {
881 if (t->dev != dev) { 880 if (t->dev != dev) {
882 err = -EEXIST; 881 err = -EEXIST;
883 break; 882 break;
@@ -915,7 +914,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
915 if (dev == itn->fb_tunnel_dev) { 914 if (dev == itn->fb_tunnel_dev) {
916 err = -ENOENT; 915 err = -ENOENT;
917 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); 916 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
918 if (t == NULL) 917 if (!t)
919 goto done; 918 goto done;
920 err = -EPERM; 919 err = -EPERM;
921 if (t == netdev_priv(itn->fb_tunnel_dev)) 920 if (t == netdev_priv(itn->fb_tunnel_dev))
@@ -980,6 +979,14 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev)
980} 979}
981EXPORT_SYMBOL(ip_tunnel_get_link_net); 980EXPORT_SYMBOL(ip_tunnel_get_link_net);
982 981
982int ip_tunnel_get_iflink(const struct net_device *dev)
983{
984 struct ip_tunnel *tunnel = netdev_priv(dev);
985
986 return tunnel->parms.link;
987}
988EXPORT_SYMBOL(ip_tunnel_get_iflink);
989
983int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, 990int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
984 struct rtnl_link_ops *ops, char *devname) 991 struct rtnl_link_ops *ops, char *devname)
985{ 992{
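
ip_tunnel_get_iflink() is the tunnel half of the dev->iflink removal: instead of each driver writing the field, the core asks through an ndo. A rough sketch of the core helper this pairs with (net/core/dev.c; details may differ):

	int dev_get_iflink(const struct net_device *dev)
	{
		if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
			return dev->netdev_ops->ndo_get_iflink(dev);

		return dev->ifindex;	/* default: device is its own lower link */
	}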
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 88c386cf7d85..ce63ab21b6cd 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -74,7 +74,8 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
74 iph->daddr = dst; 74 iph->daddr = dst;
75 iph->saddr = src; 75 iph->saddr = src;
76 iph->ttl = ttl; 76 iph->ttl = ttl;
77 __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); 77 __ip_select_ident(dev_net(rt->dst.dev), iph,
78 skb_shinfo(skb)->gso_segs ?: 1);
78 79
79 err = ip_local_out_sk(sk, skb); 80 err = ip_local_out_sk(sk, skb);
80 if (unlikely(net_xmit_eval(err))) 81 if (unlikely(net_xmit_eval(err)))
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 94efe148181c..9f7269f3c54a 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -60,7 +60,7 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
60 60
61 tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, 61 tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
62 iph->saddr, iph->daddr, 0); 62 iph->saddr, iph->daddr, 0);
63 if (tunnel != NULL) { 63 if (tunnel) {
64 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 64 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
65 goto drop; 65 goto drop;
66 66
@@ -341,6 +341,7 @@ static const struct net_device_ops vti_netdev_ops = {
341 .ndo_do_ioctl = vti_tunnel_ioctl, 341 .ndo_do_ioctl = vti_tunnel_ioctl,
342 .ndo_change_mtu = ip_tunnel_change_mtu, 342 .ndo_change_mtu = ip_tunnel_change_mtu,
343 .ndo_get_stats64 = ip_tunnel_get_stats64, 343 .ndo_get_stats64 = ip_tunnel_get_stats64,
344 .ndo_get_iflink = ip_tunnel_get_iflink,
344}; 345};
345 346
346static void vti_tunnel_setup(struct net_device *dev) 347static void vti_tunnel_setup(struct net_device *dev)
@@ -361,7 +362,6 @@ static int vti_tunnel_init(struct net_device *dev)
361 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); 362 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
362 dev->mtu = ETH_DATA_LEN; 363 dev->mtu = ETH_DATA_LEN;
363 dev->flags = IFF_NOARP; 364 dev->flags = IFF_NOARP;
364 dev->iflink = 0;
365 dev->addr_len = 4; 365 dev->addr_len = 4;
366 dev->features |= NETIF_F_LLTX; 366 dev->features |= NETIF_F_LLTX;
367 netif_keep_dst(dev); 367 netif_keep_dst(dev);
@@ -456,10 +456,10 @@ static void vti_netlink_parms(struct nlattr *data[],
456 parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); 456 parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
457 457
458 if (data[IFLA_VTI_LOCAL]) 458 if (data[IFLA_VTI_LOCAL])
459 parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]); 459 parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);
460 460
461 if (data[IFLA_VTI_REMOTE]) 461 if (data[IFLA_VTI_REMOTE])
462 parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]); 462 parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);
463 463
464} 464}
465 465
@@ -505,8 +505,8 @@ static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
505 nla_put_u32(skb, IFLA_VTI_LINK, p->link); 505 nla_put_u32(skb, IFLA_VTI_LINK, p->link);
506 nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); 506 nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
507 nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); 507 nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
508 nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr); 508 nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr);
509 nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr); 509 nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr);
510 510
511 return 0; 511 return 0;
512} 512}
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index c0855d50a3fa..d97f4f2787f5 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -63,7 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
63 struct xfrm_state *t; 63 struct xfrm_state *t;
64 64
65 t = xfrm_state_alloc(net); 65 t = xfrm_state_alloc(net);
66 if (t == NULL) 66 if (!t)
67 goto out; 67 goto out;
68 68
69 t->id.proto = IPPROTO_IPIP; 69 t->id.proto = IPPROTO_IPIP;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b26376ef87f6..8e7328c6a390 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -504,7 +504,8 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
504 if (!net_eq(dev_net(dev), &init_net)) 504 if (!net_eq(dev_net(dev), &init_net))
505 goto drop; 505 goto drop;
506 506
507 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 507 skb = skb_share_check(skb, GFP_ATOMIC);
508 if (!skb)
508 return NET_RX_DROP; 509 return NET_RX_DROP;
509 510
510 if (!pskb_may_pull(skb, sizeof(struct arphdr))) 511 if (!pskb_may_pull(skb, sizeof(struct arphdr)))
@@ -958,7 +959,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
958 if (skb->pkt_type == PACKET_OTHERHOST) 959 if (skb->pkt_type == PACKET_OTHERHOST)
959 goto drop; 960 goto drop;
960 961
961 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 962 skb = skb_share_check(skb, GFP_ATOMIC);
963 if (!skb)
962 return NET_RX_DROP; 964 return NET_RX_DROP;
963 965
964 if (!pskb_may_pull(skb, 966 if (!pskb_may_pull(skb,
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 915d215a7d14..ff96396ebec5 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -144,7 +144,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
144 err = -ENOENT; 144 err = -ENOENT;
145 t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, 145 t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
146 iph->daddr, iph->saddr, 0); 146 iph->daddr, iph->saddr, 0);
147 if (t == NULL) 147 if (!t)
148 goto out; 148 goto out;
149 149
150 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 150 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
@@ -272,6 +272,7 @@ static const struct net_device_ops ipip_netdev_ops = {
272 .ndo_do_ioctl = ipip_tunnel_ioctl, 272 .ndo_do_ioctl = ipip_tunnel_ioctl,
273 .ndo_change_mtu = ip_tunnel_change_mtu, 273 .ndo_change_mtu = ip_tunnel_change_mtu,
274 .ndo_get_stats64 = ip_tunnel_get_stats64, 274 .ndo_get_stats64 = ip_tunnel_get_stats64,
275 .ndo_get_iflink = ip_tunnel_get_iflink,
275}; 276};
276 277
277#define IPIP_FEATURES (NETIF_F_SG | \ 278#define IPIP_FEATURES (NETIF_F_SG | \
@@ -286,7 +287,6 @@ static void ipip_tunnel_setup(struct net_device *dev)
286 287
287 dev->type = ARPHRD_TUNNEL; 288 dev->type = ARPHRD_TUNNEL;
288 dev->flags = IFF_NOARP; 289 dev->flags = IFF_NOARP;
289 dev->iflink = 0;
290 dev->addr_len = 4; 290 dev->addr_len = 4;
291 dev->features |= NETIF_F_LLTX; 291 dev->features |= NETIF_F_LLTX;
292 netif_keep_dst(dev); 292 netif_keep_dst(dev);
@@ -325,10 +325,10 @@ static void ipip_netlink_parms(struct nlattr *data[],
325 parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); 325 parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
326 326
327 if (data[IFLA_IPTUN_LOCAL]) 327 if (data[IFLA_IPTUN_LOCAL])
328 parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); 328 parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]);
329 329
330 if (data[IFLA_IPTUN_REMOTE]) 330 if (data[IFLA_IPTUN_REMOTE])
331 parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); 331 parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]);
332 332
333 if (data[IFLA_IPTUN_TTL]) { 333 if (data[IFLA_IPTUN_TTL]) {
334 parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); 334 parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
@@ -450,8 +450,8 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
450 struct ip_tunnel_parm *parm = &tunnel->parms; 450 struct ip_tunnel_parm *parm = &tunnel->parms;
451 451
452 if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || 452 if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
453 nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || 453 nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
454 nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || 454 nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
455 nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || 455 nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
456 nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || 456 nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
457 nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, 457 nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9d78427652d2..3a2c0162c3ba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -73,9 +73,7 @@
73 73
74struct mr_table { 74struct mr_table {
75 struct list_head list; 75 struct list_head list;
76#ifdef CONFIG_NET_NS 76 possible_net_t net;
77 struct net *net;
78#endif
79 u32 id; 77 u32 id;
80 struct sock __rcu *mroute_sk; 78 struct sock __rcu *mroute_sk;
81 struct timer_list ipmr_expire_timer; 79 struct timer_list ipmr_expire_timer;
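
The mr_table conversion uses the new possible_net_t, which folds the old CONFIG_NET_NS #ifdef pair into a single type; read_pnet()/write_pnet() keep working unchanged. Roughly (include/net/net_namespace.h):

	typedef struct {
	#ifdef CONFIG_NET_NS
		struct net *net;
	#endif
	} possible_net_t;

	/* users stay the same: */
	write_pnet(&mrt->net, net);
	net = read_pnet(&mrt->net);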
@@ -191,7 +189,7 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
191 } 189 }
192 190
193 mrt = ipmr_get_table(rule->fr_net, rule->table); 191 mrt = ipmr_get_table(rule->fr_net, rule->table);
194 if (mrt == NULL) 192 if (!mrt)
195 return -EAGAIN; 193 return -EAGAIN;
196 res->mrt = mrt; 194 res->mrt = mrt;
197 return 0; 195 return 0;
@@ -255,7 +253,7 @@ static int __net_init ipmr_rules_init(struct net *net)
255 INIT_LIST_HEAD(&net->ipv4.mr_tables); 253 INIT_LIST_HEAD(&net->ipv4.mr_tables);
256 254
257 mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); 255 mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
258 if (mrt == NULL) { 256 if (!mrt) {
259 err = -ENOMEM; 257 err = -ENOMEM;
260 goto err1; 258 goto err1;
261 } 259 }
@@ -268,7 +266,7 @@ static int __net_init ipmr_rules_init(struct net *net)
268 return 0; 266 return 0;
269 267
270err2: 268err2:
271 kfree(mrt); 269 ipmr_free_table(mrt);
272err1: 270err1:
273 fib_rules_unregister(ops); 271 fib_rules_unregister(ops);
274 return err; 272 return err;
@@ -278,11 +276,13 @@ static void __net_exit ipmr_rules_exit(struct net *net)
278{ 276{
279 struct mr_table *mrt, *next; 277 struct mr_table *mrt, *next;
280 278
279 rtnl_lock();
281 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { 280 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
282 list_del(&mrt->list); 281 list_del(&mrt->list);
283 ipmr_free_table(mrt); 282 ipmr_free_table(mrt);
284 } 283 }
285 fib_rules_unregister(net->ipv4.mr_rules_ops); 284 fib_rules_unregister(net->ipv4.mr_rules_ops);
285 rtnl_unlock();
286} 286}
287#else 287#else
288#define ipmr_for_each_table(mrt, net) \ 288#define ipmr_for_each_table(mrt, net) \
@@ -308,7 +308,10 @@ static int __net_init ipmr_rules_init(struct net *net)
308 308
309static void __net_exit ipmr_rules_exit(struct net *net) 309static void __net_exit ipmr_rules_exit(struct net *net)
310{ 310{
311 rtnl_lock();
311 ipmr_free_table(net->ipv4.mrt); 312 ipmr_free_table(net->ipv4.mrt);
313 net->ipv4.mrt = NULL;
314 rtnl_unlock();
312} 315}
313#endif 316#endif
314 317
@@ -318,11 +321,11 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
318 unsigned int i; 321 unsigned int i;
319 322
320 mrt = ipmr_get_table(net, id); 323 mrt = ipmr_get_table(net, id);
321 if (mrt != NULL) 324 if (mrt)
322 return mrt; 325 return mrt;
323 326
324 mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); 327 mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
325 if (mrt == NULL) 328 if (!mrt)
326 return NULL; 329 return NULL;
327 write_pnet(&mrt->net, net); 330 write_pnet(&mrt->net, net);
328 mrt->id = id; 331 mrt->id = id;
@@ -424,7 +427,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
424 dev->flags |= IFF_MULTICAST; 427 dev->flags |= IFF_MULTICAST;
425 428
426 in_dev = __in_dev_get_rtnl(dev); 429 in_dev = __in_dev_get_rtnl(dev);
427 if (in_dev == NULL) 430 if (!in_dev)
428 goto failure; 431 goto failure;
429 432
430 ipv4_devconf_setall(in_dev); 433 ipv4_devconf_setall(in_dev);
@@ -475,8 +478,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
475 return NETDEV_TX_OK; 478 return NETDEV_TX_OK;
476} 479}
477 480
481static int reg_vif_get_iflink(const struct net_device *dev)
482{
483 return 0;
484}
485
478static const struct net_device_ops reg_vif_netdev_ops = { 486static const struct net_device_ops reg_vif_netdev_ops = {
479 .ndo_start_xmit = reg_vif_xmit, 487 .ndo_start_xmit = reg_vif_xmit,
488 .ndo_get_iflink = reg_vif_get_iflink,
480}; 489};
481 490
482static void reg_vif_setup(struct net_device *dev) 491static void reg_vif_setup(struct net_device *dev)
@@ -502,7 +511,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
502 511
503 dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); 512 dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
504 513
505 if (dev == NULL) 514 if (!dev)
506 return NULL; 515 return NULL;
507 516
508 dev_net_set(dev, net); 517 dev_net_set(dev, net);
@@ -511,7 +520,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
511 free_netdev(dev); 520 free_netdev(dev);
512 return NULL; 521 return NULL;
513 } 522 }
514 dev->iflink = 0;
515 523
516 rcu_read_lock(); 524 rcu_read_lock();
517 in_dev = __in_dev_get_rcu(dev); 525 in_dev = __in_dev_get_rcu(dev);
@@ -759,7 +767,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
759 case 0: 767 case 0:
760 if (vifc->vifc_flags == VIFF_USE_IFINDEX) { 768 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
761 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); 769 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
762 if (dev && __in_dev_get_rtnl(dev) == NULL) { 770 if (dev && !__in_dev_get_rtnl(dev)) {
763 dev_put(dev); 771 dev_put(dev);
764 return -EADDRNOTAVAIL; 772 return -EADDRNOTAVAIL;
765 } 773 }
@@ -803,7 +811,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
803 v->pkt_out = 0; 811 v->pkt_out = 0;
804 v->link = dev->ifindex; 812 v->link = dev->ifindex;
805 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) 813 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
806 v->link = dev->iflink; 814 v->link = dev_get_iflink(dev);
807 815
808 /* And finish update writing critical data */ 816 /* And finish update writing critical data */
809 write_lock_bh(&mrt_lock); 817 write_lock_bh(&mrt_lock);
@@ -1005,7 +1013,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
1005 1013
1006 rcu_read_lock(); 1014 rcu_read_lock();
1007 mroute_sk = rcu_dereference(mrt->mroute_sk); 1015 mroute_sk = rcu_dereference(mrt->mroute_sk);
1008 if (mroute_sk == NULL) { 1016 if (!mroute_sk) {
1009 rcu_read_unlock(); 1017 rcu_read_unlock();
1010 kfree_skb(skb); 1018 kfree_skb(skb);
1011 return -EINVAL; 1019 return -EINVAL;
@@ -1158,7 +1166,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1158 return -EINVAL; 1166 return -EINVAL;
1159 1167
1160 c = ipmr_cache_alloc(); 1168 c = ipmr_cache_alloc();
1161 if (c == NULL) 1169 if (!c)
1162 return -ENOMEM; 1170 return -ENOMEM;
1163 1171
1164 c->mfc_origin = mfc->mfcc_origin.s_addr; 1172 c->mfc_origin = mfc->mfcc_origin.s_addr;
@@ -1280,7 +1288,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1280 return -EOPNOTSUPP; 1288 return -EOPNOTSUPP;
1281 1289
1282 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); 1290 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1283 if (mrt == NULL) 1291 if (!mrt)
1284 return -ENOENT; 1292 return -ENOENT;
1285 1293
1286 if (optname != MRT_INIT) { 1294 if (optname != MRT_INIT) {
@@ -1443,7 +1451,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1443 return -EOPNOTSUPP; 1451 return -EOPNOTSUPP;
1444 1452
1445 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); 1453 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1446 if (mrt == NULL) 1454 if (!mrt)
1447 return -ENOENT; 1455 return -ENOENT;
1448 1456
1449 if (optname != MRT_VERSION && 1457 if (optname != MRT_VERSION &&
@@ -1489,7 +1497,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1489 struct mr_table *mrt; 1497 struct mr_table *mrt;
1490 1498
1491 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); 1499 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1492 if (mrt == NULL) 1500 if (!mrt)
1493 return -ENOENT; 1501 return -ENOENT;
1494 1502
1495 switch (cmd) { 1503 switch (cmd) {
@@ -1563,7 +1571,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
1563 struct mr_table *mrt; 1571 struct mr_table *mrt;
1564 1572
1565 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); 1573 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1566 if (mrt == NULL) 1574 if (!mrt)
1567 return -ENOENT; 1575 return -ENOENT;
1568 1576
1569 switch (cmd) { 1577 switch (cmd) {
@@ -1644,7 +1652,8 @@ static struct notifier_block ip_mr_notifier = {
1644 * important for multicast video. 1652 * important for multicast video.
1645 */ 1653 */
1646 1654
1647static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) 1655static void ip_encap(struct net *net, struct sk_buff *skb,
1656 __be32 saddr, __be32 daddr)
1648{ 1657{
1649 struct iphdr *iph; 1658 struct iphdr *iph;
1650 const struct iphdr *old_iph = ip_hdr(skb); 1659 const struct iphdr *old_iph = ip_hdr(skb);
@@ -1663,14 +1672,14 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1663 iph->protocol = IPPROTO_IPIP; 1672 iph->protocol = IPPROTO_IPIP;
1664 iph->ihl = 5; 1673 iph->ihl = 5;
1665 iph->tot_len = htons(skb->len); 1674 iph->tot_len = htons(skb->len);
1666 ip_select_ident(skb, NULL); 1675 ip_select_ident(net, skb, NULL);
1667 ip_send_check(iph); 1676 ip_send_check(iph);
1668 1677
1669 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1678 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1670 nf_reset(skb); 1679 nf_reset(skb);
1671} 1680}
1672 1681
1673static inline int ipmr_forward_finish(struct sk_buff *skb) 1682static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb)
1674{ 1683{
1675 struct ip_options *opt = &(IPCB(skb)->opt); 1684 struct ip_options *opt = &(IPCB(skb)->opt);
1676 1685
@@ -1680,7 +1689,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
1680 if (unlikely(opt->optlen)) 1689 if (unlikely(opt->optlen))
1681 ip_forward_options(skb); 1690 ip_forward_options(skb);
1682 1691
1683 return dst_output(skb); 1692 return dst_output_sk(sk, skb);
1684} 1693}
1685 1694
1686/* 1695/*
@@ -1697,7 +1706,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1697 struct flowi4 fl4; 1706 struct flowi4 fl4;
1698 int encap = 0; 1707 int encap = 0;
1699 1708
1700 if (vif->dev == NULL) 1709 if (!vif->dev)
1701 goto out_free; 1710 goto out_free;
1702 1711
1703#ifdef CONFIG_IP_PIMSM 1712#ifdef CONFIG_IP_PIMSM
@@ -1760,7 +1769,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1760 * What do we do with netfilter? -- RR 1769 * What do we do with netfilter? -- RR
1761 */ 1770 */
1762 if (vif->flags & VIFF_TUNNEL) { 1771 if (vif->flags & VIFF_TUNNEL) {
1763 ip_encap(skb, vif->local, vif->remote); 1772 ip_encap(net, skb, vif->local, vif->remote);
1764 /* FIXME: extra output firewall step used to be here. --RR */ 1773 /* FIXME: extra output firewall step used to be here. --RR */
1765 vif->dev->stats.tx_packets++; 1774 vif->dev->stats.tx_packets++;
1766 vif->dev->stats.tx_bytes += skb->len; 1775 vif->dev->stats.tx_bytes += skb->len;
@@ -1779,7 +1788,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1779 * not mrouter) cannot join to more than one interface - it will 1788 * not mrouter) cannot join to more than one interface - it will
1780 * result in receiving multiple packets. 1789 * result in receiving multiple packets.
1781 */ 1790 */
1782 NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev, 1791 NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
1792 skb->dev, dev,
1783 ipmr_forward_finish); 1793 ipmr_forward_finish);
1784 return; 1794 return;
1785 1795
@@ -1988,7 +1998,7 @@ int ip_mr_input(struct sk_buff *skb)
1988 1998
1989 /* already under rcu_read_lock() */ 1999 /* already under rcu_read_lock() */
1990 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 2000 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1991 if (cache == NULL) { 2001 if (!cache) {
1992 int vif = ipmr_find_vif(mrt, skb->dev); 2002 int vif = ipmr_find_vif(mrt, skb->dev);
1993 2003
1994 if (vif >= 0) 2004 if (vif >= 0)
@@ -1999,13 +2009,13 @@ int ip_mr_input(struct sk_buff *skb)
1999 /* 2009 /*
2000 * No usable cache entry 2010 * No usable cache entry
2001 */ 2011 */
2002 if (cache == NULL) { 2012 if (!cache) {
2003 int vif; 2013 int vif;
2004 2014
2005 if (local) { 2015 if (local) {
2006 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 2016 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
2007 ip_local_deliver(skb); 2017 ip_local_deliver(skb);
2008 if (skb2 == NULL) 2018 if (!skb2)
2009 return -ENOBUFS; 2019 return -ENOBUFS;
2010 skb = skb2; 2020 skb = skb2;
2011 } 2021 }
@@ -2064,7 +2074,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
2064 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; 2074 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
2065 read_unlock(&mrt_lock); 2075 read_unlock(&mrt_lock);
2066 2076
2067 if (reg_dev == NULL) 2077 if (!reg_dev)
2068 return 1; 2078 return 1;
2069 2079
2070 skb->mac_header = skb->network_header; 2080 skb->mac_header = skb->network_header;
@@ -2194,18 +2204,18 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
2194 int err; 2204 int err;
2195 2205
2196 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); 2206 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2197 if (mrt == NULL) 2207 if (!mrt)
2198 return -ENOENT; 2208 return -ENOENT;
2199 2209
2200 rcu_read_lock(); 2210 rcu_read_lock();
2201 cache = ipmr_cache_find(mrt, saddr, daddr); 2211 cache = ipmr_cache_find(mrt, saddr, daddr);
2202 if (cache == NULL && skb->dev) { 2212 if (!cache && skb->dev) {
2203 int vif = ipmr_find_vif(mrt, skb->dev); 2213 int vif = ipmr_find_vif(mrt, skb->dev);
2204 2214
2205 if (vif >= 0) 2215 if (vif >= 0)
2206 cache = ipmr_cache_find_any(mrt, daddr, vif); 2216 cache = ipmr_cache_find_any(mrt, daddr, vif);
2207 } 2217 }
2208 if (cache == NULL) { 2218 if (!cache) {
2209 struct sk_buff *skb2; 2219 struct sk_buff *skb2;
2210 struct iphdr *iph; 2220 struct iphdr *iph;
2211 struct net_device *dev; 2221 struct net_device *dev;
@@ -2263,7 +2273,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2263 int err; 2273 int err;
2264 2274
2265 nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); 2275 nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
2266 if (nlh == NULL) 2276 if (!nlh)
2267 return -EMSGSIZE; 2277 return -EMSGSIZE;
2268 2278
2269 rtm = nlmsg_data(nlh); 2279 rtm = nlmsg_data(nlh);
@@ -2282,8 +2292,8 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2282 rtm->rtm_protocol = RTPROT_MROUTED; 2292 rtm->rtm_protocol = RTPROT_MROUTED;
2283 rtm->rtm_flags = 0; 2293 rtm->rtm_flags = 0;
2284 2294
2285 if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || 2295 if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
2286 nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) 2296 nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
2287 goto nla_put_failure; 2297 goto nla_put_failure;
2288 err = __ipmr_fill_mroute(mrt, skb, c, rtm); 2298 err = __ipmr_fill_mroute(mrt, skb, c, rtm);
2289 /* do not break the dump if cache is unresolved */ 2299 /* do not break the dump if cache is unresolved */
@@ -2328,7 +2338,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
2328 2338
2329 skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif), 2339 skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
2330 GFP_ATOMIC); 2340 GFP_ATOMIC);
2331 if (skb == NULL) 2341 if (!skb)
2332 goto errout; 2342 goto errout;
2333 2343
2334 err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); 2344 err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
@@ -2443,7 +2453,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2443 struct mr_table *mrt; 2453 struct mr_table *mrt;
2444 2454
2445 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); 2455 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2446 if (mrt == NULL) 2456 if (!mrt)
2447 return ERR_PTR(-ENOENT); 2457 return ERR_PTR(-ENOENT);
2448 2458
2449 iter->mrt = mrt; 2459 iter->mrt = mrt;
@@ -2562,7 +2572,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2562 struct mr_table *mrt; 2572 struct mr_table *mrt;
2563 2573
2564 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); 2574 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2565 if (mrt == NULL) 2575 if (!mrt)
2566 return ERR_PTR(-ENOENT); 2576 return ERR_PTR(-ENOENT);
2567 2577
2568 it->mrt = mrt; 2578 it->mrt = mrt;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 7ebd6e37875c..65de0684e22a 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -94,7 +94,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb,
94{ 94{
95 struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); 95 struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
96 96
97 if (entry->hook == NF_INET_LOCAL_OUT) { 97 if (entry->state.hook == NF_INET_LOCAL_OUT) {
98 const struct iphdr *iph = ip_hdr(skb); 98 const struct iphdr *iph = ip_hdr(skb);
99 99
100 rt_info->tos = iph->tos; 100 rt_info->tos = iph->tos;
@@ -109,7 +109,7 @@ static int nf_ip_reroute(struct sk_buff *skb,
109{ 109{
110 const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); 110 const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
111 111
112 if (entry->hook == NF_INET_LOCAL_OUT) { 112 if (entry->state.hook == NF_INET_LOCAL_OUT) {
113 const struct iphdr *iph = ip_hdr(skb); 113 const struct iphdr *iph = ip_hdr(skb);
114 114
115 if (!(iph->tos == rt_info->tos && 115 if (!(iph->tos == rt_info->tos &&
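Both netfilter.c hunks replace entry->hook with entry->state.hook: the queued-packet entry now embeds the nf_hook_state aggregate that the rest of this series threads through every hook function in place of the old (hook, in, out, okfn) parameter lists. A sketch of the aggregate, assuming the include/linux/netfilter.h definition at this point in the series (field order and the exact okfn signature are approximate):

	struct nf_hook_state {
		unsigned int hook;	/* NF_INET_* hook number */
		int thresh;
		u_int8_t pf;		/* protocol family */
		struct net_device *in;	/* formerly the "in" argument */
		struct net_device *out;	/* formerly the "out" argument */
		struct sock *sk;
		int (*okfn)(struct sock *, struct sk_buff *);
	};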
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 59f883d9cadf..fb20f363151f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,24 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT
36 36
37 If unsure, say Y. 37 If unsure, say Y.
38 38
39config NF_LOG_ARP 39if NF_TABLES
40 tristate "ARP packet logging"
41 default m if NETFILTER_ADVANCED=n
42 select NF_LOG_COMMON
43
44config NF_LOG_IPV4
45 tristate "IPv4 packet logging"
46 default m if NETFILTER_ADVANCED=n
47 select NF_LOG_COMMON
48 40
49config NF_TABLES_IPV4 41config NF_TABLES_IPV4
50 depends on NF_TABLES
51 tristate "IPv4 nf_tables support" 42 tristate "IPv4 nf_tables support"
52 help 43 help
53 This option enables the IPv4 support for nf_tables. 44 This option enables the IPv4 support for nf_tables.
54 45
46if NF_TABLES_IPV4
47
55config NFT_CHAIN_ROUTE_IPV4 48config NFT_CHAIN_ROUTE_IPV4
56 depends on NF_TABLES_IPV4
57 tristate "IPv4 nf_tables route chain support" 49 tristate "IPv4 nf_tables route chain support"
58 help 50 help
59 This option enables the "route" chain for IPv4 in nf_tables. This 51 This option enables the "route" chain for IPv4 in nf_tables. This
@@ -61,22 +53,34 @@ config NFT_CHAIN_ROUTE_IPV4
61 fields such as the source, destination, type of service and 53 fields such as the source, destination, type of service and
62 the packet mark. 54 the packet mark.
63 55
64config NF_REJECT_IPV4
65 tristate "IPv4 packet rejection"
66 default m if NETFILTER_ADVANCED=n
67
68config NFT_REJECT_IPV4 56config NFT_REJECT_IPV4
69 depends on NF_TABLES_IPV4
70 select NF_REJECT_IPV4 57 select NF_REJECT_IPV4
71 default NFT_REJECT 58 default NFT_REJECT
72 tristate 59 tristate
73 60
61endif # NF_TABLES_IPV4
62
74config NF_TABLES_ARP 63config NF_TABLES_ARP
75 depends on NF_TABLES
76 tristate "ARP nf_tables support" 64 tristate "ARP nf_tables support"
77 help 65 help
78 This option enables the ARP support for nf_tables. 66 This option enables the ARP support for nf_tables.
79 67
68endif # NF_TABLES
69
70config NF_LOG_ARP
71 tristate "ARP packet logging"
72 default m if NETFILTER_ADVANCED=n
73 select NF_LOG_COMMON
74
75config NF_LOG_IPV4
76 tristate "IPv4 packet logging"
77 default m if NETFILTER_ADVANCED=n
78 select NF_LOG_COMMON
79
80config NF_REJECT_IPV4
81 tristate "IPv4 packet rejection"
82 default m if NETFILTER_ADVANCED=n
83
80config NF_NAT_IPV4 84config NF_NAT_IPV4
81 tristate "IPv4 NAT" 85 tristate "IPv4 NAT"
82 depends on NF_CONNTRACK_IPV4 86 depends on NF_CONNTRACK_IPV4
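The Kconfig reshuffle above is behavior-preserving: per-symbol "depends on NF_TABLES" lines become enclosing if/endif blocks (an if block adds its condition as a dependency to every symbol inside), and NF_LOG_ARP, NF_LOG_IPV4 and NF_REJECT_IPV4 move outside those blocks because they do not depend on nf_tables. A reduced illustration of the equivalence (help text elided):

	if NF_TABLES

	config NF_TABLES_IPV4
		tristate "IPv4 nf_tables support"

	endif # NF_TABLES

	# ...means the same as:

	config NF_TABLES_IPV4
		depends on NF_TABLES
		tristate "IPv4 nf_tables support"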
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f95b6f93814b..13bfe84bf3ca 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -248,8 +248,7 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
248 248
249unsigned int arpt_do_table(struct sk_buff *skb, 249unsigned int arpt_do_table(struct sk_buff *skb,
250 unsigned int hook, 250 unsigned int hook,
251 const struct net_device *in, 251 const struct nf_hook_state *state,
252 const struct net_device *out,
253 struct xt_table *table) 252 struct xt_table *table)
254{ 253{
255 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); 254 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
@@ -265,8 +264,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
265 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) 264 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
266 return NF_DROP; 265 return NF_DROP;
267 266
268 indev = in ? in->name : nulldevname; 267 indev = state->in ? state->in->name : nulldevname;
269 outdev = out ? out->name : nulldevname; 268 outdev = state->out ? state->out->name : nulldevname;
270 269
271 local_bh_disable(); 270 local_bh_disable();
272 addend = xt_write_recseq_begin(); 271 addend = xt_write_recseq_begin();
@@ -281,8 +280,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
281 e = get_entry(table_base, private->hook_entry[hook]); 280 e = get_entry(table_base, private->hook_entry[hook]);
282 back = get_entry(table_base, private->underflow[hook]); 281 back = get_entry(table_base, private->underflow[hook]);
283 282
284 acpar.in = in; 283 acpar.in = state->in;
285 acpar.out = out; 284 acpar.out = state->out;
286 acpar.hooknum = hook; 285 acpar.hooknum = hook;
287 acpar.family = NFPROTO_ARP; 286 acpar.family = NFPROTO_ARP;
288 acpar.hotdrop = false; 287 acpar.hotdrop = false;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 802ddecb30b8..93876d03120c 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -28,12 +28,11 @@ static const struct xt_table packet_filter = {
28/* The work comes in here from netfilter.c */ 28/* The work comes in here from netfilter.c */
29static unsigned int 29static unsigned int
30arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, 30arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
31 const struct net_device *in, const struct net_device *out, 31 const struct nf_hook_state *state)
32 int (*okfn)(struct sk_buff *))
33{ 32{
34 const struct net *net = dev_net((in != NULL) ? in : out); 33 const struct net *net = dev_net(state->in ? state->in : state->out);
35 34
36 return arpt_do_table(skb, ops->hooknum, in, out, 35 return arpt_do_table(skb, ops->hooknum, state,
37 net->ipv4.arptable_filter); 36 net->ipv4.arptable_filter);
38} 37}
39 38
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 99e810f84671..c69db7fa25ee 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -272,9 +272,9 @@ static void trace_packet(const struct sk_buff *skb,
272 &chainname, &comment, &rulenum) != 0) 272 &chainname, &comment, &rulenum) != 0)
273 break; 273 break;
274 274
275 nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo, 275 nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo,
276 "TRACE: %s:%s:%s:%u ", 276 "TRACE: %s:%s:%s:%u ",
277 tablename, chainname, comment, rulenum); 277 tablename, chainname, comment, rulenum);
278} 278}
279#endif 279#endif
280 280
@@ -288,8 +288,7 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
288unsigned int 288unsigned int
289ipt_do_table(struct sk_buff *skb, 289ipt_do_table(struct sk_buff *skb,
290 unsigned int hook, 290 unsigned int hook,
291 const struct net_device *in, 291 const struct nf_hook_state *state,
292 const struct net_device *out,
293 struct xt_table *table) 292 struct xt_table *table)
294{ 293{
295 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); 294 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
@@ -306,8 +305,8 @@ ipt_do_table(struct sk_buff *skb,
306 305
307 /* Initialization */ 306 /* Initialization */
308 ip = ip_hdr(skb); 307 ip = ip_hdr(skb);
309 indev = in ? in->name : nulldevname; 308 indev = state->in ? state->in->name : nulldevname;
310 outdev = out ? out->name : nulldevname; 309 outdev = state->out ? state->out->name : nulldevname;
311 /* We handle fragments by dealing with the first fragment as 310 /* We handle fragments by dealing with the first fragment as
312 * if it was a normal packet. All other fragments are treated 311 * if it was a normal packet. All other fragments are treated
313 * normally, except that they will NEVER match rules that ask 312 * normally, except that they will NEVER match rules that ask
@@ -317,8 +316,8 @@ ipt_do_table(struct sk_buff *skb,
317 acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; 316 acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
318 acpar.thoff = ip_hdrlen(skb); 317 acpar.thoff = ip_hdrlen(skb);
319 acpar.hotdrop = false; 318 acpar.hotdrop = false;
320 acpar.in = in; 319 acpar.in = state->in;
321 acpar.out = out; 320 acpar.out = state->out;
322 acpar.family = NFPROTO_IPV4; 321 acpar.family = NFPROTO_IPV4;
323 acpar.hooknum = hook; 322 acpar.hooknum = hook;
324 323
@@ -370,7 +369,7 @@ ipt_do_table(struct sk_buff *skb,
370#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) 369#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
371 /* The packet is traced: log it */ 370 /* The packet is traced: log it */
372 if (unlikely(skb->nf_trace)) 371 if (unlikely(skb->nf_trace))
373 trace_packet(skb, hook, in, out, 372 trace_packet(skb, hook, state->in, state->out,
374 table->name, private, e); 373 table->name, private, e);
375#endif 374#endif
376 /* Standard target? */ 375 /* Standard target? */
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index e90f83a3415b..771ab3d01ad3 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -418,6 +418,13 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
418 if (ret < 0) 418 if (ret < 0)
419 pr_info("cannot load conntrack support for proto=%u\n", 419 pr_info("cannot load conntrack support for proto=%u\n",
420 par->family); 420 par->family);
421
422 if (!par->net->xt.clusterip_deprecated_warning) {
423 pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
424 "use xt_cluster instead\n");
425 par->net->xt.clusterip_deprecated_warning = true;
426 }
427
421 return ret; 428 return ret;
422} 429}
423 430
@@ -497,14 +504,12 @@ static void arp_print(struct arp_payload *payload)
497static unsigned int 504static unsigned int
498arp_mangle(const struct nf_hook_ops *ops, 505arp_mangle(const struct nf_hook_ops *ops,
499 struct sk_buff *skb, 506 struct sk_buff *skb,
500 const struct net_device *in, 507 const struct nf_hook_state *state)
501 const struct net_device *out,
502 int (*okfn)(struct sk_buff *))
503{ 508{
504 struct arphdr *arp = arp_hdr(skb); 509 struct arphdr *arp = arp_hdr(skb);
505 struct arp_payload *payload; 510 struct arp_payload *payload;
506 struct clusterip_config *c; 511 struct clusterip_config *c;
507 struct net *net = dev_net(in ? in : out); 512 struct net *net = dev_net(state->in ? state->in : state->out);
508 513
509 /* we don't care about non-ethernet and non-ipv4 ARP */ 514 /* we don't care about non-ethernet and non-ipv4 ARP */
510 if (arp->ar_hrd != htons(ARPHRD_ETHER) || 515 if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -529,10 +534,10 @@ arp_mangle(const struct nf_hook_ops *ops,
529 * addresses on different interfacs. However, in the CLUSTERIP case 534 * addresses on different interfacs. However, in the CLUSTERIP case
530 * this wouldn't work, since we didn't subscribe the mcast group on 535 * this wouldn't work, since we didn't subscribe the mcast group on
531 * other interfaces */ 536 * other interfaces */
532 if (c->dev != out) { 537 if (c->dev != state->out) {
533 pr_debug("not mangling arp reply on different " 538 pr_debug("not mangling arp reply on different "
534 "interface: cip'%s'-skb'%s'\n", 539 "interface: cip'%s'-skb'%s'\n",
535 c->dev->name, out->name); 540 c->dev->name, state->out->name);
536 clusterip_config_put(c); 541 clusterip_config_put(c);
537 return NF_ACCEPT; 542 return NF_ACCEPT;
538 } 543 }
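The clusterip_tg_check() addition is a warn-once-per-namespace pattern: a bool in the per-netns xt state gates the pr_info() so the deprecation notice prints at most once per network namespace rather than on every rule insertion (pr_info_once() would fire only once per boot, hiding the notice from other namespaces). Skeleton of the pattern, with an illustrative flag name standing in for net->xt.clusterip_deprecated_warning:

	/* Hypothetical per-netns warn-once guard (names illustrative). */
	if (!net->deprecated_warning) {
		pr_info("feature X is deprecated, use Y instead\n");
		net->deprecated_warning = true;
	}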
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 8f48f5517e33..87907d4bd259 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -34,31 +34,32 @@ static unsigned int
34reject_tg(struct sk_buff *skb, const struct xt_action_param *par) 34reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
35{ 35{
36 const struct ipt_reject_info *reject = par->targinfo; 36 const struct ipt_reject_info *reject = par->targinfo;
37 int hook = par->hooknum;
37 38
38 switch (reject->with) { 39 switch (reject->with) {
39 case IPT_ICMP_NET_UNREACHABLE: 40 case IPT_ICMP_NET_UNREACHABLE:
40 nf_send_unreach(skb, ICMP_NET_UNREACH); 41 nf_send_unreach(skb, ICMP_NET_UNREACH, hook);
41 break; 42 break;
42 case IPT_ICMP_HOST_UNREACHABLE: 43 case IPT_ICMP_HOST_UNREACHABLE:
43 nf_send_unreach(skb, ICMP_HOST_UNREACH); 44 nf_send_unreach(skb, ICMP_HOST_UNREACH, hook);
44 break; 45 break;
45 case IPT_ICMP_PROT_UNREACHABLE: 46 case IPT_ICMP_PROT_UNREACHABLE:
46 nf_send_unreach(skb, ICMP_PROT_UNREACH); 47 nf_send_unreach(skb, ICMP_PROT_UNREACH, hook);
47 break; 48 break;
48 case IPT_ICMP_PORT_UNREACHABLE: 49 case IPT_ICMP_PORT_UNREACHABLE:
49 nf_send_unreach(skb, ICMP_PORT_UNREACH); 50 nf_send_unreach(skb, ICMP_PORT_UNREACH, hook);
50 break; 51 break;
51 case IPT_ICMP_NET_PROHIBITED: 52 case IPT_ICMP_NET_PROHIBITED:
52 nf_send_unreach(skb, ICMP_NET_ANO); 53 nf_send_unreach(skb, ICMP_NET_ANO, hook);
53 break; 54 break;
54 case IPT_ICMP_HOST_PROHIBITED: 55 case IPT_ICMP_HOST_PROHIBITED:
55 nf_send_unreach(skb, ICMP_HOST_ANO); 56 nf_send_unreach(skb, ICMP_HOST_ANO, hook);
56 break; 57 break;
57 case IPT_ICMP_ADMIN_PROHIBITED: 58 case IPT_ICMP_ADMIN_PROHIBITED:
58 nf_send_unreach(skb, ICMP_PKT_FILTERED); 59 nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
59 break; 60 break;
60 case IPT_TCP_RESET: 61 case IPT_TCP_RESET:
61 nf_send_reset(skb, par->hooknum); 62 nf_send_reset(skb, hook);
62 case IPT_ICMP_ECHOREPLY: 63 case IPT_ICMP_ECHOREPLY:
63 /* Doesn't happen. */ 64 /* Doesn't happen. */
64 break; 65 break;
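Every nf_send_unreach() call in reject_tg() now carries the hook number. The motivation is visible in the nf_reject_ipv4.c hunk further down: before emitting the ICMP error, the helper validates the offending packet's checksum via nf_ip_checksum(), and that validation needs to know at which hook the packet was seen (at PRE_ROUTING the checksum may not have been verified yet). At the call sites the conversion is mechanical:

	nf_send_unreach(skb, ICMP_PORT_UNREACH);		/* before */
	nf_send_unreach(skb, ICMP_PORT_UNREACH, par->hooknum);	/* after */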
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index a313c3fbeb46..e9e67793055f 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -300,11 +300,9 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
300 300
301static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, 301static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
302 struct sk_buff *skb, 302 struct sk_buff *skb,
303 const struct net_device *in, 303 const struct nf_hook_state *nhs)
304 const struct net_device *out,
305 int (*okfn)(struct sk_buff *))
306{ 304{
307 struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); 305 struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
308 enum ip_conntrack_info ctinfo; 306 enum ip_conntrack_info ctinfo;
309 struct nf_conn *ct; 307 struct nf_conn *ct;
310 struct nf_conn_synproxy *synproxy; 308 struct nf_conn_synproxy *synproxy;
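The nhs->in ? : nhs->out expression in the SYNPROXY hook uses the GCC conditional-with-omitted-middle-operand extension, common in kernel code: x ?: y yields x when x is non-NULL, else y, evaluating x only once. Spelled out without the extension:

	/* Equivalent of dev_net(nhs->in ?: nhs->out): */
	const struct net_device *dev = nhs->in ? nhs->in : nhs->out;
	struct synproxy_net *snet = synproxy_pernet(dev_net(dev));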
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index e08a74a243a8..a0f3beca52d2 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -34,8 +34,7 @@ static const struct xt_table packet_filter = {
34 34
35static unsigned int 35static unsigned int
36iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, 36iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
37 const struct net_device *in, const struct net_device *out, 37 const struct nf_hook_state *state)
38 int (*okfn)(struct sk_buff *))
39{ 38{
40 const struct net *net; 39 const struct net *net;
41 40
@@ -45,9 +44,8 @@ iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
45 /* root is playing with raw sockets. */ 44 /* root is playing with raw sockets. */
46 return NF_ACCEPT; 45 return NF_ACCEPT;
47 46
48 net = dev_net((in != NULL) ? in : out); 47 net = dev_net(state->in ? state->in : state->out);
49 return ipt_do_table(skb, ops->hooknum, in, out, 48 return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter);
50 net->ipv4.iptable_filter);
51} 49}
52 50
53static struct nf_hook_ops *filter_ops __read_mostly; 51static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 6a5079c34bb3..62cbb8c5f4a8 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -37,8 +37,9 @@ static const struct xt_table packet_mangler = {
37}; 37};
38 38
39static unsigned int 39static unsigned int
40ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) 40ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
41{ 41{
42 struct net_device *out = state->out;
42 unsigned int ret; 43 unsigned int ret;
43 const struct iphdr *iph; 44 const struct iphdr *iph;
44 u_int8_t tos; 45 u_int8_t tos;
@@ -58,7 +59,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
58 daddr = iph->daddr; 59 daddr = iph->daddr;
59 tos = iph->tos; 60 tos = iph->tos;
60 61
61 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, 62 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state,
62 dev_net(out)->ipv4.iptable_mangle); 63 dev_net(out)->ipv4.iptable_mangle);
63 /* Reroute for ANY change. */ 64 /* Reroute for ANY change. */
64 if (ret != NF_DROP && ret != NF_STOLEN) { 65 if (ret != NF_DROP && ret != NF_STOLEN) {
@@ -81,18 +82,16 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
81static unsigned int 82static unsigned int
82iptable_mangle_hook(const struct nf_hook_ops *ops, 83iptable_mangle_hook(const struct nf_hook_ops *ops,
83 struct sk_buff *skb, 84 struct sk_buff *skb,
84 const struct net_device *in, 85 const struct nf_hook_state *state)
85 const struct net_device *out,
86 int (*okfn)(struct sk_buff *))
87{ 86{
88 if (ops->hooknum == NF_INET_LOCAL_OUT) 87 if (ops->hooknum == NF_INET_LOCAL_OUT)
89 return ipt_mangle_out(skb, out); 88 return ipt_mangle_out(skb, state);
90 if (ops->hooknum == NF_INET_POST_ROUTING) 89 if (ops->hooknum == NF_INET_POST_ROUTING)
91 return ipt_do_table(skb, ops->hooknum, in, out, 90 return ipt_do_table(skb, ops->hooknum, state,
92 dev_net(out)->ipv4.iptable_mangle); 91 dev_net(state->out)->ipv4.iptable_mangle);
93 /* PREROUTING/INPUT/FORWARD: */ 92 /* PREROUTING/INPUT/FORWARD: */
94 return ipt_do_table(skb, ops->hooknum, in, out, 93 return ipt_do_table(skb, ops->hooknum, state,
95 dev_net(in)->ipv4.iptable_mangle); 94 dev_net(state->in)->ipv4.iptable_mangle);
96} 95}
97 96
98static struct nf_hook_ops *mangle_ops __read_mostly; 97static struct nf_hook_ops *mangle_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 6b67d7e9a75d..0d4d9cdf98a4 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -30,49 +30,40 @@ static const struct xt_table nf_nat_ipv4_table = {
30 30
31static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, 31static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
32 struct sk_buff *skb, 32 struct sk_buff *skb,
33 const struct net_device *in, 33 const struct nf_hook_state *state,
34 const struct net_device *out,
35 struct nf_conn *ct) 34 struct nf_conn *ct)
36{ 35{
37 struct net *net = nf_ct_net(ct); 36 struct net *net = nf_ct_net(ct);
38 37
39 return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table); 38 return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table);
40} 39}
41 40
42static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, 41static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
43 struct sk_buff *skb, 42 struct sk_buff *skb,
44 const struct net_device *in, 43 const struct nf_hook_state *state)
45 const struct net_device *out,
46 int (*okfn)(struct sk_buff *))
47{ 44{
48 return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain); 45 return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain);
49} 46}
50 47
51static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, 48static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
52 struct sk_buff *skb, 49 struct sk_buff *skb,
53 const struct net_device *in, 50 const struct nf_hook_state *state)
54 const struct net_device *out,
55 int (*okfn)(struct sk_buff *))
56{ 51{
57 return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain); 52 return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain);
58} 53}
59 54
60static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, 55static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
61 struct sk_buff *skb, 56 struct sk_buff *skb,
62 const struct net_device *in, 57 const struct nf_hook_state *state)
63 const struct net_device *out,
64 int (*okfn)(struct sk_buff *))
65{ 58{
66 return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain); 59 return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain);
67} 60}
68 61
69static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, 62static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
70 struct sk_buff *skb, 63 struct sk_buff *skb,
71 const struct net_device *in, 64 const struct nf_hook_state *state)
72 const struct net_device *out,
73 int (*okfn)(struct sk_buff *))
74{ 65{
75 return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain); 66 return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain);
76} 67}
77 68
78static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { 69static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
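All four iptable_nat wrappers collapse into one-line trampolines that hand iptable_nat_do_chain to the shared nf_nat_ipv4_* helpers. For readability, here is the shape of that per-table callback written as a typedef; the typedef itself is illustrative and not in the source, which spells the function-pointer type inline in each prototype (see the nf_nat_l3proto_ipv4.c hunk below):

	typedef unsigned int (*nf_nat_do_chain_t)(const struct nf_hook_ops *ops,
						  struct sk_buff *skb,
						  const struct nf_hook_state *state,
						  struct nf_conn *ct);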
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index b2f7e8f98316..0356e6da4bb7 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -21,8 +21,7 @@ static const struct xt_table packet_raw = {
21/* The work comes in here from netfilter.c. */ 21/* The work comes in here from netfilter.c. */
22static unsigned int 22static unsigned int
23iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, 23iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
24 const struct net_device *in, const struct net_device *out, 24 const struct nf_hook_state *state)
25 int (*okfn)(struct sk_buff *))
26{ 25{
27 const struct net *net; 26 const struct net *net;
28 27
@@ -32,8 +31,8 @@ iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
32 /* root is playing with raw sockets. */ 31 /* root is playing with raw sockets. */
33 return NF_ACCEPT; 32 return NF_ACCEPT;
34 33
35 net = dev_net((in != NULL) ? in : out); 34 net = dev_net(state->in ? state->in : state->out);
36 return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw); 35 return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw);
37} 36}
38 37
39static struct nf_hook_ops *rawtable_ops __read_mostly; 38static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c86647ed2078..4bce3980ccd9 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -38,9 +38,7 @@ static const struct xt_table security_table = {
38 38
39static unsigned int 39static unsigned int
40iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, 40iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
41 const struct net_device *in, 41 const struct nf_hook_state *state)
42 const struct net_device *out,
43 int (*okfn)(struct sk_buff *))
44{ 42{
45 const struct net *net; 43 const struct net *net;
46 44
@@ -50,8 +48,8 @@ iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
50 /* Somebody is playing with raw sockets. */ 48 /* Somebody is playing with raw sockets. */
51 return NF_ACCEPT; 49 return NF_ACCEPT;
52 50
53 net = dev_net((in != NULL) ? in : out); 51 net = dev_net(state->in ? state->in : state->out);
54 return ipt_do_table(skb, ops->hooknum, in, out, 52 return ipt_do_table(skb, ops->hooknum, state,
55 net->ipv4.iptable_security); 53 net->ipv4.iptable_security);
56} 54}
57 55
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5c61328b7704..30ad9554b5e9 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -94,9 +94,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
94 94
95static unsigned int ipv4_helper(const struct nf_hook_ops *ops, 95static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
96 struct sk_buff *skb, 96 struct sk_buff *skb,
97 const struct net_device *in, 97 const struct nf_hook_state *state)
98 const struct net_device *out,
99 int (*okfn)(struct sk_buff *))
100{ 98{
101 struct nf_conn *ct; 99 struct nf_conn *ct;
102 enum ip_conntrack_info ctinfo; 100 enum ip_conntrack_info ctinfo;
@@ -123,9 +121,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
123 121
124static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, 122static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
125 struct sk_buff *skb, 123 struct sk_buff *skb,
126 const struct net_device *in, 124 const struct nf_hook_state *state)
127 const struct net_device *out,
128 int (*okfn)(struct sk_buff *))
129{ 125{
130 struct nf_conn *ct; 126 struct nf_conn *ct;
131 enum ip_conntrack_info ctinfo; 127 enum ip_conntrack_info ctinfo;
@@ -149,24 +145,20 @@ out:
149 145
150static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, 146static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
151 struct sk_buff *skb, 147 struct sk_buff *skb,
152 const struct net_device *in, 148 const struct nf_hook_state *state)
153 const struct net_device *out,
154 int (*okfn)(struct sk_buff *))
155{ 149{
156 return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb); 150 return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb);
157} 151}
158 152
159static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, 153static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
160 struct sk_buff *skb, 154 struct sk_buff *skb,
161 const struct net_device *in, 155 const struct nf_hook_state *state)
162 const struct net_device *out,
163 int (*okfn)(struct sk_buff *))
164{ 156{
165 /* root is playing with raw sockets. */ 157 /* root is playing with raw sockets. */
166 if (skb->len < sizeof(struct iphdr) || 158 if (skb->len < sizeof(struct iphdr) ||
167 ip_hdrlen(skb) < sizeof(struct iphdr)) 159 ip_hdrlen(skb) < sizeof(struct iphdr))
168 return NF_ACCEPT; 160 return NF_ACCEPT;
169 return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb); 161 return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb);
170} 162}
171 163
172/* Connection tracking may drop packets, but never alters them, so 164/* Connection tracking may drop packets, but never alters them, so
@@ -322,8 +314,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
322static int ipv4_tuple_to_nlattr(struct sk_buff *skb, 314static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
323 const struct nf_conntrack_tuple *tuple) 315 const struct nf_conntrack_tuple *tuple)
324{ 316{
325 if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || 317 if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
326 nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) 318 nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
327 goto nla_put_failure; 319 goto nla_put_failure;
328 return 0; 320 return 0;
329 321
@@ -342,8 +334,8 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
342 if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) 334 if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
343 return -EINVAL; 335 return -EINVAL;
344 336
345 t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]); 337 t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
346 t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]); 338 t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
347 339
348 return 0; 340 return 0;
349} 341}
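The nlattr changes here are the read-side twin of the ipmr.c conversion above: nla_get_in_addr() pulls an IPv4 address out of an attribute. Approximately, per include/net/netlink.h of this period:

	/* Read back a big-endian 32-bit IPv4 address attribute. */
	static inline __be32 nla_get_in_addr(const struct nlattr *nla)
	{
		return *(__be32 *)nla_data(nla);
	}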
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index a460a87e14f8..f0dfe92a00d6 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -300,7 +300,9 @@ static int exp_seq_show(struct seq_file *s, void *v)
300 __nf_ct_l3proto_find(exp->tuple.src.l3num), 300 __nf_ct_l3proto_find(exp->tuple.src.l3num),
301 __nf_ct_l4proto_find(exp->tuple.src.l3num, 301 __nf_ct_l4proto_find(exp->tuple.src.l3num,
302 exp->tuple.dst.protonum)); 302 exp->tuple.dst.protonum));
303 return seq_putc(s, '\n'); 303 seq_putc(s, '\n');
304
305 return 0;
304} 306}
305 307
306static const struct seq_operations exp_seq_ops = { 308static const struct seq_operations exp_seq_ops = {
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 7e5ca6f2d0cd..c88b7d434718 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -63,9 +63,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
63 63
64static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, 64static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
65 struct sk_buff *skb, 65 struct sk_buff *skb,
66 const struct net_device *in, 66 const struct nf_hook_state *state)
67 const struct net_device *out,
68 int (*okfn)(struct sk_buff *))
69{ 67{
70 struct sock *sk = skb->sk; 68 struct sock *sk = skb->sk;
71 struct inet_sock *inet = inet_sk(skb->sk); 69 struct inet_sock *inet = inet_sk(skb->sk);
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index d059182c1466..e7ad950cf9ef 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -10,8 +10,10 @@
10 * it under the terms of the GNU General Public License version 2 as 10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation. 11 * published by the Free Software Foundation.
12 */ 12 */
13
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 15
16#include <linux/kernel.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/spinlock.h> 18#include <linux/spinlock.h>
17#include <linux/skbuff.h> 19#include <linux/skbuff.h>
@@ -27,7 +29,7 @@ static struct nf_loginfo default_loginfo = {
27 .type = NF_LOG_TYPE_LOG, 29 .type = NF_LOG_TYPE_LOG,
28 .u = { 30 .u = {
29 .log = { 31 .log = {
30 .level = 5, 32 .level = LOGLEVEL_NOTICE,
31 .logflags = NF_LOG_MASK, 33 .logflags = NF_LOG_MASK,
32 }, 34 },
33 }, 35 },
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 75101980eeee..076aadda0473 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -5,8 +5,10 @@
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 */ 7 */
8
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 10
11#include <linux/kernel.h>
10#include <linux/module.h> 12#include <linux/module.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/skbuff.h> 14#include <linux/skbuff.h>
@@ -26,7 +28,7 @@ static struct nf_loginfo default_loginfo = {
26 .type = NF_LOG_TYPE_LOG, 28 .type = NF_LOG_TYPE_LOG,
27 .u = { 29 .u = {
28 .log = { 30 .log = {
29 .level = 5, 31 .level = LOGLEVEL_NOTICE,
30 .logflags = NF_LOG_MASK, 32 .logflags = NF_LOG_MASK,
31 }, 33 },
32 }, 34 },
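In both log backends the bare ".level = 5" becomes LOGLEVEL_NOTICE. The named constants live in include/linux/kern_levels.h and map one-to-one onto the classic syslog severities:

	#define LOGLEVEL_EMERG		0	/* system is unusable */
	#define LOGLEVEL_ALERT		1	/* action must be taken immediately */
	#define LOGLEVEL_CRIT		2	/* critical conditions */
	#define LOGLEVEL_ERR		3	/* error conditions */
	#define LOGLEVEL_WARNING	4	/* warning conditions */
	#define LOGLEVEL_NOTICE		5	/* normal but significant */
	#define LOGLEVEL_INFO		6	/* informational */
	#define LOGLEVEL_DEBUG		7	/* debug-level messages */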
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index fc37711e11f3..e59cc05c09e9 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -256,11 +256,10 @@ EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
256 256
257unsigned int 257unsigned int
258nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, 258nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
259 const struct net_device *in, const struct net_device *out, 259 const struct nf_hook_state *state,
260 unsigned int (*do_chain)(const struct nf_hook_ops *ops, 260 unsigned int (*do_chain)(const struct nf_hook_ops *ops,
261 struct sk_buff *skb, 261 struct sk_buff *skb,
262 const struct net_device *in, 262 const struct nf_hook_state *state,
263 const struct net_device *out,
264 struct nf_conn *ct)) 263 struct nf_conn *ct))
265{ 264{
266 struct nf_conn *ct; 265 struct nf_conn *ct;
@@ -309,7 +308,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
309 if (!nf_nat_initialized(ct, maniptype)) { 308 if (!nf_nat_initialized(ct, maniptype)) {
310 unsigned int ret; 309 unsigned int ret;
311 310
312 ret = do_chain(ops, skb, in, out, ct); 311 ret = do_chain(ops, skb, state, ct);
313 if (ret != NF_ACCEPT) 312 if (ret != NF_ACCEPT)
314 return ret; 313 return ret;
315 314
@@ -323,7 +322,8 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
323 pr_debug("Already setup manip %s for ct %p\n", 322 pr_debug("Already setup manip %s for ct %p\n",
324 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", 323 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
325 ct); 324 ct);
326 if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) 325 if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat,
326 state->out))
327 goto oif_changed; 327 goto oif_changed;
328 } 328 }
329 break; 329 break;
@@ -332,7 +332,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
332 /* ESTABLISHED */ 332 /* ESTABLISHED */
333 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || 333 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
334 ctinfo == IP_CT_ESTABLISHED_REPLY); 334 ctinfo == IP_CT_ESTABLISHED_REPLY);
335 if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) 335 if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
336 goto oif_changed; 336 goto oif_changed;
337 } 337 }
338 338
@@ -346,17 +346,16 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
346 346
347unsigned int 347unsigned int
348nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, 348nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
349 const struct net_device *in, const struct net_device *out, 349 const struct nf_hook_state *state,
350 unsigned int (*do_chain)(const struct nf_hook_ops *ops, 350 unsigned int (*do_chain)(const struct nf_hook_ops *ops,
351 struct sk_buff *skb, 351 struct sk_buff *skb,
352 const struct net_device *in, 352 const struct nf_hook_state *state,
353 const struct net_device *out,
354 struct nf_conn *ct)) 353 struct nf_conn *ct))
355{ 354{
356 unsigned int ret; 355 unsigned int ret;
357 __be32 daddr = ip_hdr(skb)->daddr; 356 __be32 daddr = ip_hdr(skb)->daddr;
358 357
359 ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); 358 ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
360 if (ret != NF_DROP && ret != NF_STOLEN && 359 if (ret != NF_DROP && ret != NF_STOLEN &&
361 daddr != ip_hdr(skb)->daddr) 360 daddr != ip_hdr(skb)->daddr)
362 skb_dst_drop(skb); 361 skb_dst_drop(skb);
@@ -367,11 +366,10 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
367 366
368unsigned int 367unsigned int
369nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, 368nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
370 const struct net_device *in, const struct net_device *out, 369 const struct nf_hook_state *state,
371 unsigned int (*do_chain)(const struct nf_hook_ops *ops, 370 unsigned int (*do_chain)(const struct nf_hook_ops *ops,
372 struct sk_buff *skb, 371 struct sk_buff *skb,
373 const struct net_device *in, 372 const struct nf_hook_state *state,
374 const struct net_device *out,
375 struct nf_conn *ct)) 373 struct nf_conn *ct))
376{ 374{
377#ifdef CONFIG_XFRM 375#ifdef CONFIG_XFRM
@@ -386,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
386 ip_hdrlen(skb) < sizeof(struct iphdr)) 384 ip_hdrlen(skb) < sizeof(struct iphdr))
387 return NF_ACCEPT; 385 return NF_ACCEPT;
388 386
389 ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); 387 ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
390#ifdef CONFIG_XFRM 388#ifdef CONFIG_XFRM
391 if (ret != NF_DROP && ret != NF_STOLEN && 389 if (ret != NF_DROP && ret != NF_STOLEN &&
392 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 390 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -410,11 +408,10 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
410 408
411unsigned int 409unsigned int
412nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, 410nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
413 const struct net_device *in, const struct net_device *out, 411 const struct nf_hook_state *state,
414 unsigned int (*do_chain)(const struct nf_hook_ops *ops, 412 unsigned int (*do_chain)(const struct nf_hook_ops *ops,
415 struct sk_buff *skb, 413 struct sk_buff *skb,
416 const struct net_device *in, 414 const struct nf_hook_state *state,
417 const struct net_device *out,
418 struct nf_conn *ct)) 415 struct nf_conn *ct))
419{ 416{
420 const struct nf_conn *ct; 417 const struct nf_conn *ct;
@@ -427,7 +424,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
427 ip_hdrlen(skb) < sizeof(struct iphdr)) 424 ip_hdrlen(skb) < sizeof(struct iphdr))
428 return NF_ACCEPT; 425 return NF_ACCEPT;
429 426
430 ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); 427 ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
431 if (ret != NF_DROP && ret != NF_STOLEN && 428 if (ret != NF_DROP && ret != NF_STOLEN &&
432 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 429 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
433 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 430 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 536da7bc598a..3262e41ff76f 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -13,6 +13,7 @@
13#include <net/dst.h> 13#include <net/dst.h>
14#include <net/netfilter/ipv4/nf_reject.h> 14#include <net/netfilter/ipv4/nf_reject.h>
15#include <linux/netfilter_ipv4.h> 15#include <linux/netfilter_ipv4.h>
16#include <linux/netfilter_bridge.h>
16#include <net/netfilter/ipv4/nf_reject.h> 17#include <net/netfilter/ipv4/nf_reject.h>
17 18
18const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, 19const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
@@ -43,7 +44,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
43 44
44struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, 45struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
45 const struct sk_buff *oldskb, 46 const struct sk_buff *oldskb,
46 __be16 protocol, int ttl) 47 __u8 protocol, int ttl)
47{ 48{
48 struct iphdr *niph, *oiph = ip_hdr(oldskb); 49 struct iphdr *niph, *oiph = ip_hdr(oldskb);
49 50
@@ -146,7 +147,8 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
146 */ 147 */
147 if (oldskb->nf_bridge) { 148 if (oldskb->nf_bridge) {
148 struct ethhdr *oeth = eth_hdr(oldskb); 149 struct ethhdr *oeth = eth_hdr(oldskb);
149 nskb->dev = oldskb->nf_bridge->physindev; 150
151 nskb->dev = nf_bridge_get_physindev(oldskb);
150 niph->tot_len = htons(nskb->len); 152 niph->tot_len = htons(nskb->len);
151 ip_send_check(niph); 153 ip_send_check(niph);
152 if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), 154 if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
@@ -164,4 +166,27 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
164} 166}
165EXPORT_SYMBOL_GPL(nf_send_reset); 167EXPORT_SYMBOL_GPL(nf_send_reset);
166 168
169void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
170{
171 struct iphdr *iph = ip_hdr(skb_in);
172 u8 proto;
173
174 if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET))
175 return;
176
177 if (skb_csum_unnecessary(skb_in)) {
178 icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
179 return;
180 }
181
182 if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP)
183 proto = iph->protocol;
184 else
185 proto = 0;
186
187 if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
188 icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
189}
190EXPORT_SYMBOL_GPL(nf_send_unreach);
191
167MODULE_LICENSE("GPL"); 192MODULE_LICENSE("GPL");
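Two independent fixes ride along in this file besides the new nf_send_unreach(): nf_reject_iphdr_put() corrects its protocol parameter type from __be16 to __u8 (iph->protocol is a single byte, so a network-order 16-bit type was wrong), and the raw oldskb->nf_bridge->physindev dereference becomes the nf_bridge_get_physindev() accessor from the newly included linux/netfilter_bridge.h. The accessor is essentially a NULL-safe getter (sketch):

	static inline struct net_device *
	nf_bridge_get_physindev(const struct sk_buff *skb)
	{
		return skb->nf_bridge ? skb->nf_bridge->physindev : NULL;
	}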
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 19412a4063fb..8412268bbad1 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -17,13 +17,11 @@
17static unsigned int 17static unsigned int
18nft_do_chain_arp(const struct nf_hook_ops *ops, 18nft_do_chain_arp(const struct nf_hook_ops *ops,
19 struct sk_buff *skb, 19 struct sk_buff *skb,
20 const struct net_device *in, 20 const struct nf_hook_state *state)
21 const struct net_device *out,
22 int (*okfn)(struct sk_buff *))
23{ 21{
24 struct nft_pktinfo pkt; 22 struct nft_pktinfo pkt;
25 23
26 nft_set_pktinfo(&pkt, ops, skb, in, out); 24 nft_set_pktinfo(&pkt, ops, skb, state);
27 25
28 return nft_do_chain(&pkt, ops); 26 return nft_do_chain(&pkt, ops);
29} 27}
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 6820c8c40842..aa180d3a69a5 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -20,22 +20,18 @@
20 20
21static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, 21static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
22 struct sk_buff *skb, 22 struct sk_buff *skb,
23 const struct net_device *in, 23 const struct nf_hook_state *state)
24 const struct net_device *out,
25 int (*okfn)(struct sk_buff *))
26{ 24{
27 struct nft_pktinfo pkt; 25 struct nft_pktinfo pkt;
28 26
29 nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); 27 nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
30 28
31 return nft_do_chain(&pkt, ops); 29 return nft_do_chain(&pkt, ops);
32} 30}
33 31
34static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, 32static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
35 struct sk_buff *skb, 33 struct sk_buff *skb,
36 const struct net_device *in, 34 const struct nf_hook_state *state)
37 const struct net_device *out,
38 int (*okfn)(struct sk_buff *))
39{ 35{
40 if (unlikely(skb->len < sizeof(struct iphdr) || 36 if (unlikely(skb->len < sizeof(struct iphdr) ||
41 ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { 37 ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
@@ -45,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
45 return NF_ACCEPT; 41 return NF_ACCEPT;
46 } 42 }
47 43
48 return nft_do_chain_ipv4(ops, skb, in, out, okfn); 44 return nft_do_chain_ipv4(ops, skb, state);
49} 45}
50 46
51struct nft_af_info nft_af_ipv4 __read_mostly = { 47struct nft_af_info nft_af_ipv4 __read_mostly = {
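nft_set_pktinfo() and nft_set_pktinfo_ipv4() now take the hook state rather than the in/out device pair, so the nft_pktinfo carries everything the expressions need. A hedged sketch of what the generic helper does after this change (cf. include/net/netfilter/nf_tables.h; the exact field set may differ):

	static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
					   const struct nf_hook_ops *ops,
					   struct sk_buff *skb,
					   const struct nf_hook_state *state)
	{
		pkt->skb = skb;
		pkt->in = pkt->xt.in = state->in;	/* devices now come from state */
		pkt->out = pkt->xt.out = state->out;
		pkt->ops = ops;
		pkt->xt.hooknum = ops->hooknum;
		pkt->xt.family = ops->pf;
	}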
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index df547bf50078..bf5c30ae14e4 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -28,51 +28,42 @@
28 28
29static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, 29static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
30 struct sk_buff *skb, 30 struct sk_buff *skb,
31 const struct net_device *in, 31 const struct nf_hook_state *state,
32 const struct net_device *out,
33 struct nf_conn *ct) 32 struct nf_conn *ct)
34{ 33{
35 struct nft_pktinfo pkt; 34 struct nft_pktinfo pkt;
36 35
37 nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); 36 nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
38 37
39 return nft_do_chain(&pkt, ops); 38 return nft_do_chain(&pkt, ops);
40} 39}
41 40
42static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, 41static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
43 struct sk_buff *skb, 42 struct sk_buff *skb,
44 const struct net_device *in, 43 const struct nf_hook_state *state)
45 const struct net_device *out,
46 int (*okfn)(struct sk_buff *))
47{ 44{
48 return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain); 45 return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain);
49} 46}
50 47
51static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, 48static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
52 struct sk_buff *skb, 49 struct sk_buff *skb,
53 const struct net_device *in, 50 const struct nf_hook_state *state)
54 const struct net_device *out,
55 int (*okfn)(struct sk_buff *))
56{ 51{
57 return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain); 52 return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain);
58} 53}
59 54
60static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, 55static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
61 struct sk_buff *skb, 56 struct sk_buff *skb,
62 const struct net_device *in, 57 const struct nf_hook_state *state)
63 const struct net_device *out,
64 int (*okfn)(struct sk_buff *))
65{ 58{
66 return nf_nat_ipv4_out(ops, skb, in, out, nft_nat_do_chain); 59 return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain);
67} 60}
68 61
69static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, 62static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
70 struct sk_buff *skb, 63 struct sk_buff *skb,
71 const struct net_device *in, 64 const struct nf_hook_state *state)
72 const struct net_device *out,
73 int (*okfn)(struct sk_buff *))
74{ 65{
75 return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain); 66 return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain);
76} 67}
77 68
78static const struct nf_chain_type nft_chain_nat_ipv4 = { 69static const struct nf_chain_type nft_chain_nat_ipv4 = {
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 125b66766c0a..e335b0afdaf3 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -23,9 +23,7 @@
23 23
24static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, 24static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
25 struct sk_buff *skb, 25 struct sk_buff *skb,
26 const struct net_device *in, 26 const struct nf_hook_state *state)
27 const struct net_device *out,
28 int (*okfn)(struct sk_buff *))
29{ 27{
30 unsigned int ret; 28 unsigned int ret;
31 struct nft_pktinfo pkt; 29 struct nft_pktinfo pkt;
@@ -39,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
39 ip_hdrlen(skb) < sizeof(struct iphdr)) 37 ip_hdrlen(skb) < sizeof(struct iphdr))
40 return NF_ACCEPT; 38 return NF_ACCEPT;
41 39
42 nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); 40 nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
43 41
44 mark = skb->mark; 42 mark = skb->mark;
45 iph = ip_hdr(skb); 43 iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 665de06561cd..40e414c4ca56 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -17,20 +17,17 @@
17#include <net/netfilter/ipv4/nf_nat_masquerade.h> 17#include <net/netfilter/ipv4/nf_nat_masquerade.h>
18 18
19static void nft_masq_ipv4_eval(const struct nft_expr *expr, 19static void nft_masq_ipv4_eval(const struct nft_expr *expr,
20 struct nft_data data[NFT_REG_MAX + 1], 20 struct nft_regs *regs,
21 const struct nft_pktinfo *pkt) 21 const struct nft_pktinfo *pkt)
22{ 22{
23 struct nft_masq *priv = nft_expr_priv(expr); 23 struct nft_masq *priv = nft_expr_priv(expr);
24 struct nf_nat_range range; 24 struct nf_nat_range range;
25 unsigned int verdict;
26 25
27 memset(&range, 0, sizeof(range)); 26 memset(&range, 0, sizeof(range));
28 range.flags = priv->flags; 27 range.flags = priv->flags;
29 28
30 verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, 29 regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
31 &range, pkt->out); 30 &range, pkt->out);
32
33 data[NFT_REG_VERDICT].verdict = verdict;
34} 31}
35 32
36static struct nft_expr_type nft_masq_ipv4_type; 33static struct nft_expr_type nft_masq_ipv4_type;
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index 6ecfce63201a..d8d795df9c13 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -18,26 +18,25 @@
18#include <net/netfilter/nft_redir.h> 18#include <net/netfilter/nft_redir.h>
19 19
20static void nft_redir_ipv4_eval(const struct nft_expr *expr, 20static void nft_redir_ipv4_eval(const struct nft_expr *expr,
21 struct nft_data data[NFT_REG_MAX + 1], 21 struct nft_regs *regs,
22 const struct nft_pktinfo *pkt) 22 const struct nft_pktinfo *pkt)
23{ 23{
24 struct nft_redir *priv = nft_expr_priv(expr); 24 struct nft_redir *priv = nft_expr_priv(expr);
25 struct nf_nat_ipv4_multi_range_compat mr; 25 struct nf_nat_ipv4_multi_range_compat mr;
26 unsigned int verdict;
27 26
28 memset(&mr, 0, sizeof(mr)); 27 memset(&mr, 0, sizeof(mr));
29 if (priv->sreg_proto_min) { 28 if (priv->sreg_proto_min) {
30 mr.range[0].min.all = 29 mr.range[0].min.all =
31 *(__be16 *)&data[priv->sreg_proto_min].data[0]; 30 *(__be16 *)&regs->data[priv->sreg_proto_min];
32 mr.range[0].max.all = 31 mr.range[0].max.all =
33 *(__be16 *)&data[priv->sreg_proto_max].data[0]; 32 *(__be16 *)&regs->data[priv->sreg_proto_max];
34 mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 33 mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
35 } 34 }
36 35
37 mr.range[0].flags |= priv->flags; 36 mr.range[0].flags |= priv->flags;
38 37
39 verdict = nf_nat_redirect_ipv4(pkt->skb, &mr, pkt->ops->hooknum); 38 regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
40 data[NFT_REG_VERDICT].verdict = verdict; 39 pkt->ops->hooknum);
41} 40}
42 41
43static struct nft_expr_type nft_redir_ipv4_type; 42static struct nft_expr_type nft_redir_ipv4_type;
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index d729542bd1b7..b07e58b51158 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -20,21 +20,24 @@
20#include <net/netfilter/nft_reject.h> 20#include <net/netfilter/nft_reject.h>
21 21
22static void nft_reject_ipv4_eval(const struct nft_expr *expr, 22static void nft_reject_ipv4_eval(const struct nft_expr *expr,
23 struct nft_data data[NFT_REG_MAX + 1], 23 struct nft_regs *regs,
24 const struct nft_pktinfo *pkt) 24 const struct nft_pktinfo *pkt)
25{ 25{
26 struct nft_reject *priv = nft_expr_priv(expr); 26 struct nft_reject *priv = nft_expr_priv(expr);
27 27
28 switch (priv->type) { 28 switch (priv->type) {
29 case NFT_REJECT_ICMP_UNREACH: 29 case NFT_REJECT_ICMP_UNREACH:
30 nf_send_unreach(pkt->skb, priv->icmp_code); 30 nf_send_unreach(pkt->skb, priv->icmp_code,
31 pkt->ops->hooknum);
31 break; 32 break;
32 case NFT_REJECT_TCP_RST: 33 case NFT_REJECT_TCP_RST:
33 nf_send_reset(pkt->skb, pkt->ops->hooknum); 34 nf_send_reset(pkt->skb, pkt->ops->hooknum);
34 break; 35 break;
36 default:
37 break;
35 } 38 }
36 39
37 data[NFT_REG_VERDICT].verdict = NF_DROP; 40 regs->verdict.code = NF_DROP;
38} 41}
39 42
40static struct nft_expr_type nft_reject_ipv4_type; 43static struct nft_expr_type nft_reject_ipv4_type;
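The masq, redir and reject hunks all track the same nf_tables core change: expression eval callbacks receive a struct nft_regs instead of a raw nft_data array, and verdicts go to regs->verdict.code rather than data[NFT_REG_VERDICT].verdict. Approximately, the new register file overlays the verdict on the data registers (the register count here is indicative):

	struct nft_regs {
		union {
			u32			data[20];	/* count approximate */
			struct nft_verdict	verdict;
		};
	};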
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index e9f66e1cda50..05ff44b758df 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -64,11 +64,11 @@ EXPORT_SYMBOL_GPL(pingv6_ops);
64 64
65static u16 ping_port_rover; 65static u16 ping_port_rover;
66 66
67static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) 67static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
68{ 68{
69 int res = (num + net_hash_mix(net)) & mask; 69 u32 res = (num + net_hash_mix(net)) & mask;
70 70
71 pr_debug("hash(%d) = %d\n", num, res); 71 pr_debug("hash(%u) = %u\n", num, res);
72 return res; 72 return res;
73} 73}
74EXPORT_SYMBOL_GPL(ping_hash); 74EXPORT_SYMBOL_GPL(ping_hash);
@@ -158,6 +158,7 @@ void ping_unhash(struct sock *sk)
158 if (sk_hashed(sk)) { 158 if (sk_hashed(sk)) {
159 write_lock_bh(&ping_table.lock); 159 write_lock_bh(&ping_table.lock);
160 hlist_nulls_del(&sk->sk_nulls_node); 160 hlist_nulls_del(&sk->sk_nulls_node);
161 sk_nulls_node_init(&sk->sk_nulls_node);
161 sock_put(sk); 162 sock_put(sk);
162 isk->inet_num = 0; 163 isk->inet_num = 0;
163 isk->inet_sport = 0; 164 isk->inet_sport = 0;
@@ -259,6 +260,9 @@ int ping_init_sock(struct sock *sk)
259 kgid_t low, high; 260 kgid_t low, high;
260 int ret = 0; 261 int ret = 0;
261 262
263 if (sk->sk_family == AF_INET6)
264 sk->sk_ipv6only = 1;
265
262 inet_get_ping_group_range_net(net, &low, &high); 266 inet_get_ping_group_range_net(net, &low, &high);
263 if (gid_lte(low, group) && gid_lte(group, high)) 267 if (gid_lte(low, group) && gid_lte(group, high))
264 return 0; 268 return 0;
@@ -305,6 +309,11 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
305 if (addr_len < sizeof(*addr)) 309 if (addr_len < sizeof(*addr))
306 return -EINVAL; 310 return -EINVAL;
307 311
312 if (addr->sin_family != AF_INET &&
313 !(addr->sin_family == AF_UNSPEC &&
314 addr->sin_addr.s_addr == htonl(INADDR_ANY)))
315 return -EAFNOSUPPORT;
316
308 pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", 317 pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
309 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); 318 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
310 319
@@ -330,7 +339,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
330 return -EINVAL; 339 return -EINVAL;
331 340
332 if (addr->sin6_family != AF_INET6) 341 if (addr->sin6_family != AF_INET6)
333 return -EINVAL; 342 return -EAFNOSUPPORT;
334 343
335 pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", 344 pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
336 sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); 345 sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
@@ -508,7 +517,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
508 ntohs(icmph->un.echo.sequence)); 517 ntohs(icmph->un.echo.sequence));
509 518
510 sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); 519 sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
511 if (sk == NULL) { 520 if (!sk) {
512 pr_debug("no socket, dropping\n"); 521 pr_debug("no socket, dropping\n");
513 return; /* No socket for error */ 522 return; /* No socket for error */
514 } 523 }
@@ -684,8 +693,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
684} 693}
685EXPORT_SYMBOL_GPL(ping_common_sendmsg); 694EXPORT_SYMBOL_GPL(ping_common_sendmsg);
686 695
687static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 696static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
688 size_t len)
689{ 697{
690 struct net *net = sock_net(sk); 698 struct net *net = sock_net(sk);
691 struct flowi4 fl4; 699 struct flowi4 fl4;
@@ -716,7 +724,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
716 if (msg->msg_namelen < sizeof(*usin)) 724 if (msg->msg_namelen < sizeof(*usin))
717 return -EINVAL; 725 return -EINVAL;
718 if (usin->sin_family != AF_INET) 726 if (usin->sin_family != AF_INET)
719 return -EINVAL; 727 return -EAFNOSUPPORT;
720 daddr = usin->sin_addr.s_addr; 728 daddr = usin->sin_addr.s_addr;
721 /* no remote port */ 729 /* no remote port */
722 } else { 730 } else {
@@ -841,8 +849,8 @@ do_confirm:
841 goto out; 849 goto out;
842} 850}
843 851
844int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 852int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
845 size_t len, int noblock, int flags, int *addr_len) 853 int flags, int *addr_len)
846{ 854{
847 struct inet_sock *isk = inet_sk(sk); 855 struct inet_sock *isk = inet_sk(sk);
848 int family = sk->sk_family; 856 int family = sk->sk_family;
@@ -964,7 +972,7 @@ bool ping_rcv(struct sk_buff *skb)
964 skb_push(skb, skb->data - (u8 *)icmph); 972 skb_push(skb, skb->data - (u8 *)icmph);
965 973
966 sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); 974 sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
967 if (sk != NULL) { 975 if (sk) {
968 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 976 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
969 977
970 pr_debug("rcv on socket %p\n", sk); 978 pr_debug("rcv on socket %p\n", sk);
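Besides dropping the kiocb argument from sendmsg/recvmsg (a tree-wide change in this series), ping.c tightens its error codes: binding or sending to an address of the wrong family now fails with EAFNOSUPPORT instead of the generic EINVAL, matching POSIX expectations. Seen from userspace (illustrative program; assumes unprivileged ICMP sockets are enabled via net.ipv4.ping_group_range):

	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
		struct sockaddr_in6 sin6;

		memset(&sin6, 0, sizeof(sin6));
		sin6.sin6_family = AF_INET6;	/* wrong family for an IPv4 socket */

		if (fd >= 0 && bind(fd, (struct sockaddr *)&sin6, sizeof(sin6)) < 0)
			perror("bind");	/* EAFNOSUPPORT with this change */
		return 0;
	}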
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index d8953ef0770c..e1f3b911dd1e 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
-		   tcp_death_row.tw_count, sockets,
+		   atomic_read(&tcp_death_row.tw_count), sockets,
 		   proto_memory_allocated(&tcp_prot));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f027a708b7e0..561cd4b8fc6e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -46,7 +46,6 @@
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
-#include <linux/aio.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
@@ -293,7 +292,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
 
 	read_lock(&raw_v4_hashinfo.lock);
 	raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
-	if (raw_sk != NULL) {
+	if (raw_sk) {
 		iph = (const struct iphdr *)skb->data;
 		net = dev_net(skb->dev);
 
@@ -363,7 +362,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 	skb = sock_alloc_send_skb(sk,
 				  length + hlen + tlen + 15,
 				  flags & MSG_DONTWAIT, &err);
-	if (skb == NULL)
+	if (!skb)
 		goto error;
 	skb_reserve(skb, hlen);
 
@@ -404,7 +403,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 		iph->check   = 0;
 		iph->tot_len = htons(length);
 		if (!iph->id)
-			ip_select_ident(skb, NULL);
+			ip_select_ident(net, skb, NULL);
 
 		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 	}
@@ -412,8 +411,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 		icmp_out_count(net, ((struct icmphdr *)
 			skb_transport_header(skb))->type);
 
-	err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
-		      rt->dst.dev, dst_output);
+	err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb,
+		      NULL, rt->dst.dev, dst_output_sk);
 	if (err > 0)
 		err = net_xmit_errno(err);
 	if (err)
@@ -481,8 +480,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
 	return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
 }
 
-static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
-		       size_t len)
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipcm_cookie ipc;
@@ -709,8 +707,8 @@ out: return ret;
  *	we return it, otherwise we block.
  */
 
-static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
-		       size_t len, int noblock, int flags, int *addr_len)
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		       int noblock, int flags, int *addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	size_t copied = 0;
@@ -873,7 +871,7 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
 
 		spin_lock_bh(&sk->sk_receive_queue.lock);
 		skb = skb_peek(&sk->sk_receive_queue);
-		if (skb != NULL)
+		if (skb)
 			amount = skb->len;
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
 		return put_user(amount, (int __user *)arg);
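raw_send_hdrinc() above zeroes iph->check and recomputes it with ip_fast_csum() once the header is complete. A portable sketch of that checksum (the RFC 1071 one's-complement sum; the kernel's ip_fast_csum() is arch-optimized, this stand-in is illustration only):

    #include <stdint.h>
    #include <stdio.h>

    /* Sum the header as big-endian 16-bit words, fold, complement. */
    static uint16_t ip_checksum(const uint8_t *hdr, unsigned int ihl_words)
    {
    	uint32_t sum = 0;
    	unsigned int i;

    	for (i = 0; i < ihl_words * 4; i += 2)	/* ihl is in 32-bit words */
    		sum += (uint32_t)(hdr[i] << 8) | hdr[i + 1];
    	while (sum >> 16)			/* fold carries back in */
    		sum = (sum & 0xffff) + (sum >> 16);
    	return (uint16_t)~sum;
    }

    int main(void)
    {
    	/* 20-byte IPv4 header with the check field (offset 10) zeroed
    	 * first, exactly as raw_send_hdrinc() does before summing. */
    	uint8_t iph[20] = { 0x45, 0, 0, 20,	/* ver/ihl, tos, tot_len */
    			    0, 0, 0, 0,		/* id, frag_off */
    			    64, 17, 0, 0,	/* ttl, proto(UDP), check=0 */
    			    127, 0, 0, 1,	/* saddr */
    			    127, 0, 0, 1 };	/* daddr */
    	printf("checksum=0x%04x\n", ip_checksum(iph, 5));
    	return 0;
    }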
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ad5064362c5c..bff62fc87b8e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -152,7 +152,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
-	.protocol =		cpu_to_be16(ETH_P_IP),
 	.check =		ipv4_dst_check,
 	.default_advmss =	ipv4_default_advmss,
 	.mtu =			ipv4_mtu,
@@ -483,7 +482,7 @@ u32 ip_idents_reserve(u32 hash, int segs)
 }
 EXPORT_SYMBOL(ip_idents_reserve);
 
-void __ip_select_ident(struct iphdr *iph, int segs)
+void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 {
 	static u32 ip_idents_hashrnd __read_mostly;
 	u32 hash, id;
@@ -492,7 +491,7 @@ void __ip_select_ident(struct iphdr *iph, int segs)
 
 	hash = jhash_3words((__force u32)iph->daddr,
 			    (__force u32)iph->saddr,
-			    iph->protocol,
+			    iph->protocol ^ net_hash_mix(net),
 			    ip_idents_hashrnd);
 	id = ip_idents_reserve(hash, segs);
 	iph->id = htons(id);
@@ -963,10 +962,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (dst_metric_locked(dst, RTAX_MTU))
 		return;
 
-	if (dst->dev->mtu < mtu)
-		return;
-
-	if (rt->rt_pmtu && rt->rt_pmtu < mtu)
+	if (ipv4_mtu(dst) < mtu)
 		return;
 
 	if (mtu < ip_rt_min_pmtu)
@@ -1057,7 +1053,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 
 	rt = (struct rtable *)odst;
-	if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
+	if (odst->obsolete && !odst->ops->check(odst, 0)) {
 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 		if (IS_ERR(rt))
 			goto out;
@@ -1451,7 +1447,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	/* Primary sanity checks. */
 
-	if (in_dev == NULL)
+	if (!in_dev)
 		return -EINVAL;
 
 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
@@ -1554,7 +1550,7 @@ static int __mkroute_input(struct sk_buff *skb,
 
 	/* get a working reference to the output device */
 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
-	if (out_dev == NULL) {
+	if (!out_dev) {
 		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
 		return -EINVAL;
 	}
@@ -1592,7 +1588,7 @@ static int __mkroute_input(struct sk_buff *skb,
 
 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
 	if (do_cache) {
-		if (fnhe != NULL)
+		if (fnhe)
 			rth = rcu_dereference(fnhe->fnhe_rth_input);
 		else
 			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
@@ -2055,7 +2051,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 			   ipv4_is_lbcast(fl4->daddr))) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
 			dev_out = __ip_dev_find(net, fl4->saddr, false);
-			if (dev_out == NULL)
+			if (!dev_out)
 				goto out;
 
 			/* Special hack: user can direct multicasts
@@ -2088,7 +2084,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	if (fl4->flowi4_oif) {
 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
 		rth = ERR_PTR(-ENODEV);
-		if (dev_out == NULL)
+		if (!dev_out)
 			goto out;
 
 		/* RACE: Check return value of inet_select_addr instead. */
@@ -2225,7 +2221,7 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
 
 static struct dst_ops ipv4_dst_blackhole_ops = {
 	.family			=	AF_INET,
-	.protocol		=	cpu_to_be16(ETH_P_IP),
 	.check			=	ipv4_blackhole_dst_check,
 	.mtu			=	ipv4_blackhole_mtu,
 	.default_advmss		=	ipv4_default_advmss,
@@ -2301,7 +2296,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 	u32 metrics[RTAX_MAX];
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
-	if (nlh == NULL)
+	if (!nlh)
 		return -EMSGSIZE;
 
 	r = nlmsg_data(nlh);
@@ -2321,11 +2316,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
 		r->rtm_flags |= RTCF_DOREDIRECT;
 
-	if (nla_put_be32(skb, RTA_DST, dst))
+	if (nla_put_in_addr(skb, RTA_DST, dst))
 		goto nla_put_failure;
 	if (src) {
 		r->rtm_src_len = 32;
-		if (nla_put_be32(skb, RTA_SRC, src))
+		if (nla_put_in_addr(skb, RTA_SRC, src))
 			goto nla_put_failure;
 	}
 	if (rt->dst.dev &&
@@ -2338,11 +2333,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 #endif
 	if (!rt_is_input_route(rt) &&
 	    fl4->saddr != src) {
-		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
+		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
 	if (rt->rt_uses_gateway &&
-	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
+	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
 	expires = rt->dst.expires;
@@ -2423,7 +2418,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 	rtm = nlmsg_data(nlh);
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-	if (skb == NULL) {
+	if (!skb) {
 		err = -ENOBUFS;
 		goto errout;
 	}
@@ -2438,8 +2433,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
 
-	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
-	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
+	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
+	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
@@ -2454,7 +2449,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
 		struct net_device *dev;
 
 		dev = __dev_get_by_index(net, iif);
-		if (dev == NULL) {
+		if (!dev) {
 			err = -ENODEV;
 			goto errout_free;
 		}
@@ -2653,7 +2648,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
 	tbl = ipv4_route_flush_table;
 	if (!net_eq(net, &init_net)) {
 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
-		if (tbl == NULL)
+		if (!tbl)
 			goto err_dup;
 
 		/* Don't export sysctls to unprivileged users */
@@ -2663,7 +2658,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
 		tbl[0].extra1 = net;
 
 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
-	if (net->ipv4.route_hdr == NULL)
+	if (!net->ipv4.route_hdr)
 		goto err_reg;
 	return 0;
 
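The __ip_select_ident() hunk perturbs the IP-ID hash with net_hash_mix(net), so the same flow seen from two network namespaces no longer shares an ident bucket. A toy model of that separation (the mixer below is deliberately not jhash_3words(), and every value here is made up for illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-in for the kernel's jhash_3words(). */
    static uint32_t mix(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
    {
    	uint32_t h = seed;

    	h = (h ^ a) * 0x9e3779b1;
    	h = (h ^ b) * 0x85ebca6b;
    	h = (h ^ c) * 0xc2b2ae35;
    	return h ^ (h >> 16);
    }

    int main(void)
    {
    	uint32_t daddr = 0x08080808, saddr = 0x0a000001, proto = 6;
    	uint32_t rnd = 0xdeadbeef;	/* ip_idents_hashrnd stand-in */
    	uint32_t ns_a = 0x1111, ns_b = 0x2222; /* net_hash_mix() values */

    	/* Same flow, two namespaces: different buckets after the XOR. */
    	printf("ns A bucket: %u\n", mix(daddr, saddr, proto ^ ns_a, rnd) & 2047);
    	printf("ns B bucket: %u\n", mix(daddr, saddr, proto ^ ns_b, rnd) & 2047);
    	return 0;
    }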
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 45fe60c5238e..df849e5a10f1 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -219,19 +219,20 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 }
 EXPORT_SYMBOL_GPL(__cookie_v4_check);
 
-static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
-					   struct request_sock *req,
-					   struct dst_entry *dst)
+static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+				    struct request_sock *req,
+				    struct dst_entry *dst)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sock *child;
 
 	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
-	if (child)
+	if (child) {
+		atomic_set(&req->rsk_refcnt, 1);
 		inet_csk_reqsk_queue_add(sk, req, child);
-	else
+	} else {
 		reqsk_free(req);
-
+	}
 	return child;
 }
 
@@ -325,7 +326,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 		goto out;
 
 	ret = NULL;
-	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
+	req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
 	if (!req)
 		goto out;
 
@@ -336,8 +337,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	req->mss		= mss;
 	ireq->ir_num		= ntohs(th->dest);
 	ireq->ir_rmt_port	= th->source;
-	ireq->ir_loc_addr	= ip_hdr(skb)->daddr;
-	ireq->ir_rmt_addr	= ip_hdr(skb)->saddr;
+	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
 	ireq->ir_mark		= inet_request_mark(sk, skb);
 	ireq->snd_wscale	= tcp_opt.snd_wscale;
 	ireq->sack_ok		= tcp_opt.sack_ok;
@@ -345,7 +346,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
-	treq->listener		= NULL;
+	treq->tfo_listener	= false;
+
+	ireq->ir_iif = sk->sk_bound_dev_if;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -357,7 +360,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 		goto out;
 	}
 
-	req->expires	= 0UL;
 	req->num_retrans = 0;
 
 	/*
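The get_cookie_sock() hunk sets rsk_refcnt to 1 before inet_csk_reqsk_queue_add() makes the request reachable. A small C11 sketch of this init-before-publish rule (the queue and names are illustrative, not kernel API):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct req {
    	atomic_int refcnt;
    	struct req *next;
    };

    static struct req *queue_head;	/* "accept queue" stand-in */

    static void queue_add(struct req *r)
    {
    	/* The reference must exist *before* the object is reachable;
    	 * otherwise another CPU could find it and drop a ref while
    	 * refcnt is still 0, freeing it under us. */
    	atomic_store(&r->refcnt, 1);
    	r->next = queue_head;
    	queue_head = r;
    }

    static void req_put(struct req *r)
    {
    	if (atomic_fetch_sub(&r->refcnt, 1) == 1)
    		free(r);		/* last reference dropped */
    }

    int main(void)
    {
    	struct req *r = calloc(1, sizeof(*r));

    	queue_add(r);
    	queue_head = NULL;
    	req_put(r);			/* refcount goes 1 -> 0: freed */
    	puts("ok");
    	return 0;
    }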
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d151539da8e6..c3852a7ff3c7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -883,6 +883,20 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_probe_threshold",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_threshold,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_probe_interval",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
@@ -895,7 +909,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	int i;
 
 	table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
-	if (table == NULL)
+	if (!table)
 		goto err_alloc;
 
 	/* Update the variables to point into the current struct net */
@@ -904,7 +918,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	}
 
 	net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
-	if (net->ipv4.ipv4_hdr == NULL)
+	if (!net->ipv4.ipv4_hdr)
 		goto err_reg;
 
 	net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
@@ -942,7 +956,7 @@ static __init int sysctl_ipv4_init(void)
 	struct ctl_table_header *hdr;
 
 	hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
-	if (hdr == NULL)
+	if (!hdr)
 		return -ENOMEM;
 
 	if (register_pernet_subsys(&ipv4_sysctl_ops)) {
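The two ctl_table entries added above surface per-netns knobs through procfs. Assuming a kernel carrying this patch, they appear as files under /proc/sys/net/ipv4/ and can be read like any other sysctl:

    #include <stdio.h>

    static void show(const char *path)
    {
    	char buf[64];
    	FILE *f = fopen(path, "r");

    	if (f && fgets(buf, sizeof(buf), f))
    		printf("%s = %s", path, buf);
    	else
    		printf("%s: not available on this kernel\n", path);
    	if (f)
    		fclose(f);
    }

    int main(void)
    {
    	show("/proc/sys/net/ipv4/tcp_probe_threshold");
    	show("/proc/sys/net/ipv4/tcp_probe_interval");
    	return 0;
    }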
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d72a0fcd928..8c5cd9efebbc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -496,7 +496,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	/* Connected or passive Fast Open socket? */
 	if (sk->sk_state != TCP_SYN_SENT &&
-	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
+	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
 		int target = sock_rcvlowat(sk, 0, INT_MAX);
 
 		if (tp->urg_seq == tp->copied_seq &&
@@ -520,8 +520,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 			/* Race breaker. If space is freed after
 			 * wspace test but before the flags are set,
-			 * IO signal will be lost.
+			 * IO signal will be lost. Memory barrier
+			 * pairs with the input side.
 			 */
+			smp_mb__after_atomic();
 			if (sk_stream_is_writeable(sk))
 				mask |= POLLOUT | POLLWRNORM;
 		}
@@ -835,17 +837,13 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 				       int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 new_size_goal, size_goal, hlen;
+	u32 new_size_goal, size_goal;
 
 	if (!large_allowed || !sk_can_gso(sk))
 		return mss_now;
 
-	/* Maybe we should/could use sk->sk_prot->max_header here ? */
-	hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
-	       inet_csk(sk)->icsk_ext_hdr_len +
-	       tp->tcp_header_len;
-
-	new_size_goal = sk->sk_gso_max_size - 1 - hlen;
+	/* Note : tcp_tso_autosize() will eventually split this later */
+	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
 	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
 
 	/* We try hard to avoid divides here */
@@ -1032,7 +1030,7 @@ static inline int select_size(const struct sock *sk, bool sg)
 
 void tcp_free_fastopen_req(struct tcp_sock *tp)
 {
-	if (tp->fastopen_req != NULL) {
+	if (tp->fastopen_req) {
 		kfree(tp->fastopen_req);
 		tp->fastopen_req = NULL;
 	}
@@ -1046,12 +1044,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 
 	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
 		return -EOPNOTSUPP;
-	if (tp->fastopen_req != NULL)
+	if (tp->fastopen_req)
 		return -EALREADY; /* Another Fast Open is in progress */
 
 	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
 				   sk->sk_allocation);
-	if (unlikely(tp->fastopen_req == NULL))
+	if (unlikely(!tp->fastopen_req))
 		return -ENOBUFS;
 	tp->fastopen_req->data = msg;
 	tp->fastopen_req->size = size;
@@ -1064,8 +1062,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	return err;
 }
 
-int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
-		size_t size)
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -1124,7 +1121,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	sg = !!(sk->sk_route_caps & NETIF_F_SG);
 
-	while (iov_iter_count(&msg->msg_iter)) {
+	while (msg_data_left(msg)) {
 		int copy = 0;
 		int max = size_goal;
 
@@ -1168,8 +1165,8 @@ new_segment:
 		}
 
 		/* Try to append data to the end of skb. */
-		if (copy > iov_iter_count(&msg->msg_iter))
-			copy = iov_iter_count(&msg->msg_iter);
+		if (copy > msg_data_left(msg))
+			copy = msg_data_left(msg);
 
 		/* Where to copy to? */
 		if (skb_availroom(skb) > 0) {
@@ -1226,7 +1223,7 @@ new_segment:
 			tcp_skb_pcount_set(skb, 0);
 
 		copied += copy;
-		if (!iov_iter_count(&msg->msg_iter)) {
+		if (!msg_data_left(msg)) {
 			tcp_tx_timestamp(sk, skb);
 			goto out;
 		}
@@ -1543,8 +1540,8 @@ EXPORT_SYMBOL(tcp_read_sock);
  *	Probably, code can be easily improved even more.
  */
 
-int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
-		size_t len, int nonblock, int flags, int *addr_len)
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+		int flags, int *addr_len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int copied = 0;
@@ -1918,18 +1915,19 @@ EXPORT_SYMBOL_GPL(tcp_set_state);
 
 static const unsigned char new_state[16] = {
   /* current state:        new state:      action:	*/
-  /* (Invalid)		*/ TCP_CLOSE,
-  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
-  /* TCP_SYN_SENT	*/ TCP_CLOSE,
-  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
-  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
-  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
-  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
-  /* TCP_CLOSE		*/ TCP_CLOSE,
-  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
-  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
-  /* TCP_LISTEN		*/ TCP_CLOSE,
-  /* TCP_CLOSING	*/ TCP_CLOSING,
+  [0 /* (Invalid) */]	= TCP_CLOSE,
+  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+  [TCP_SYN_SENT]	= TCP_CLOSE,
+  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
+  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
+  [TCP_TIME_WAIT]	= TCP_CLOSE,
+  [TCP_CLOSE]		= TCP_CLOSE,
+  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
+  [TCP_LAST_ACK]	= TCP_LAST_ACK,
+  [TCP_LISTEN]		= TCP_CLOSE,
+  [TCP_CLOSING]		= TCP_CLOSING,
+  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
 };
 
 static int tcp_close_state(struct sock *sk)
@@ -2142,7 +2140,7 @@ adjudge_to_death:
 		 * aborted (e.g., closed with unread data) before 3WHS
 		 * finishes.
 		 */
-		if (req != NULL)
+		if (req)
 			reqsk_fastopen_remove(sk, req, false);
 		inet_csk_destroy_sock(sk);
 	}
@@ -2599,6 +2597,7 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
+	u32 rate;
 
 	memset(info, 0, sizeof(*info));
 
@@ -2659,10 +2658,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_total_retrans = tp->total_retrans;
 
-	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
-					sk->sk_pacing_rate : ~0ULL;
-	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
-					sk->sk_max_pacing_rate : ~0ULL;
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+	rate = READ_ONCE(sk->sk_max_pacing_rate);
+	info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
@@ -2780,7 +2780,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_FASTOPEN:
-		if (icsk->icsk_accept_queue.fastopenq != NULL)
+		if (icsk->icsk_accept_queue.fastopenq)
 			val = icsk->icsk_accept_queue.fastopenq->max_qlen;
 		else
 			val = 0;
@@ -2964,7 +2964,7 @@ void tcp_done(struct sock *sk)
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
-	if (req != NULL)
+	if (req)
 		reqsk_fastopen_remove(sk, req, false);
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
@@ -3005,12 +3005,11 @@ static void __init tcp_init_mem(void)
 
 void __init tcp_init(void)
 {
-	struct sk_buff *skb = NULL;
 	unsigned long limit;
 	int max_rshare, max_wshare, cnt;
 	unsigned int i;
 
-	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
+	sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
 
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
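The new_state[] rewrite switches to designated initializers, so adding TCP_NEW_SYN_RECV cannot silently shift the other slots. A userspace model of the same lookup shape (constants below are illustrative stand-ins, not copied from the kernel headers):

    #include <stdio.h>

    enum { ST_ESTABLISHED = 1, ST_FIN_WAIT1 = 4, ST_CLOSE = 7,
           ST_CLOSE_WAIT = 8, ST_LAST_ACK = 9, ST_MAX = 16 };
    #define ACTION_FIN (1 << 7)	/* high bit = "send a FIN now" */

    static const unsigned char new_state[ST_MAX] = {
    	/* Each slot is tied to its state name, independent of order. */
    	[ST_ESTABLISHED] = ST_FIN_WAIT1 | ACTION_FIN,
    	[ST_CLOSE_WAIT]  = ST_LAST_ACK  | ACTION_FIN,
    	[ST_CLOSE]       = ST_CLOSE,
    };

    int main(void)
    {
    	int next = new_state[ST_ESTABLISHED];

    	printf("close(ESTABLISHED): next=%d send_fin=%d\n",
    	       next & ~ACTION_FIN, !!(next & ACTION_FIN));
    	return 0;
    }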
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index d694088214cd..7a5ae50c80c8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -83,7 +83,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 		ret = -EEXIST;
 	} else {
 		list_add_tail_rcu(&ca->list, &tcp_cong_list);
-		pr_info("%s registered\n", ca->name);
+		pr_debug("%s registered\n", ca->name);
 	}
 	spin_unlock(&tcp_cong_list_lock);
 
@@ -378,6 +378,12 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
  */
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
 {
+	/* If credits accumulated at a higher w, apply them gently now. */
+	if (tp->snd_cwnd_cnt >= w) {
+		tp->snd_cwnd_cnt = 0;
+		tp->snd_cwnd++;
+	}
+
 	tp->snd_cwnd_cnt += acked;
 	if (tp->snd_cwnd_cnt >= w) {
 		u32 delta = tp->snd_cwnd_cnt / w;
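The new guard in tcp_cong_avoid_ai() converts snd_cwnd_cnt credits carried over from a larger w into a single +1 instead of letting them burst the window once w shrinks. A tiny simulation of the fixed function:

    #include <stdio.h>

    static unsigned int snd_cwnd = 10, snd_cwnd_cnt;

    static void cong_avoid_ai(unsigned int w, unsigned int acked)
    {
    	if (snd_cwnd_cnt >= w) {	/* leftover credits from a larger w */
    		snd_cwnd_cnt = 0;
    		snd_cwnd++;
    	}
    	snd_cwnd_cnt += acked;
    	if (snd_cwnd_cnt >= w) {
    		unsigned int delta = snd_cwnd_cnt / w;

    		snd_cwnd_cnt -= delta * w;
    		snd_cwnd += delta;
    	}
    }

    int main(void)
    {
    	snd_cwnd_cnt = 40;	/* accumulated while w was 50 */
    	cong_avoid_ai(10, 1);	/* w has since dropped to 10 */
    	/* Without the guard this single ACK would add 4 to cwnd at once;
    	 * with it, cwnd grows by 1 and the stale credits are discarded. */
    	printf("cwnd=%u cnt=%u\n", snd_cwnd, snd_cwnd_cnt);
    	return 0;
    }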
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 4b276d1ed980..06d3d665a9fd 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -306,8 +306,10 @@ tcp_friendliness:
 		}
 	}
 
-	if (ca->cnt == 0)			/* cannot be zero */
-		ca->cnt = 1;
+	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
+	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
+	 */
+	ca->cnt = max(ca->cnt, 2U);
 }
 
 static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
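Since ca->cnt is "ACKs needed per one-packet cwnd increase", clamping it to 2 caps growth at cwnd/2 per round trip. A worked example of that ceiling:

    #include <stdio.h>

    int main(void)
    {
    	unsigned int cwnd = 100, cnt = 2;	/* most aggressive allowed */
    	unsigned int acks = cwnd;		/* one full window of ACKs */
    	unsigned int next = cwnd + acks / cnt;

    	printf("cwnd %u -> %u (x%.2f per RTT)\n",
    	       cwnd, next, (double)next / cwnd);	/* prints x1.50 */
    	return 0;
    }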
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index b504371af742..4376016f7fa5 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -277,7 +277,7 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 	}
 }
 
-static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 {
 	const struct dctcp *ca = inet_csk_ca(sk);
 
@@ -297,8 +297,9 @@ static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 			info.dctcp_ab_tot = ca->acked_bytes_total;
 		}
 
-		nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
+		return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
 	}
+	return 0;
 }
 
 static struct tcp_congestion_ops dctcp __read_mostly = {
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 0d73f9ddb55b..79b34a0f4a4a 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -29,18 +29,18 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 		r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
 		r->idiag_wqueue = tp->write_seq - tp->snd_una;
 	}
-	if (info != NULL)
+	if (info)
 		tcp_get_info(sk, info);
 }
 
 static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
-			  struct inet_diag_req_v2 *r, struct nlattr *bc)
+			  const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
 	inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
 }
 
 static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
-			     struct inet_diag_req_v2 *req)
+			     const struct inet_diag_req_v2 *req)
 {
 	return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
 }
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ea82fd492c1b..e3d87aca6be8 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -141,7 +141,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
 	req->sk = NULL;
 
 	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
-	if (child == NULL)
+	if (!child)
 		return false;
 
 	spin_lock(&queue->fastopenq->lock);
@@ -155,12 +155,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
 	tp = tcp_sk(child);
 
 	tp->fastopen_rsk = req;
-	/* Do a hold on the listner sk so that if the listener is being
-	 * closed, the child that has been accepted can live on and still
-	 * access listen_lock.
-	 */
-	sock_hold(sk);
-	tcp_rsk(req)->listener = sk;
+	tcp_rsk(req)->tfo_listener = true;
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never
 	 * scaled. So correct it appropriately.
@@ -174,6 +169,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
 	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
 				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 
+	atomic_set(&req->rsk_refcnt, 1);
 	/* Add the child socket directly into the accept queue */
 	inet_csk_reqsk_queue_add(sk, req, child);
 
@@ -218,10 +214,9 @@ static bool tcp_fastopen_create_child(struct sock *sk,
 	sk->sk_data_ready(sk);
 	bh_unlock_sock(child);
 	sock_put(child);
-	WARN_ON(req->sk == NULL);
+	WARN_ON(!req->sk);
 	return true;
 }
-EXPORT_SYMBOL(tcp_fastopen_create_child);
 
 static bool tcp_fastopen_queue_check(struct sock *sk)
 {
@@ -238,14 +233,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
 	 * temporarily vs a server not supporting Fast Open at all.
 	 */
 	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
-	if (fastopenq == NULL || fastopenq->max_qlen == 0)
+	if (!fastopenq || fastopenq->max_qlen == 0)
 		return false;
 
 	if (fastopenq->qlen >= fastopenq->max_qlen) {
 		struct request_sock *req1;
 		spin_lock(&fastopenq->lock);
 		req1 = fastopenq->rskq_rst_head;
-		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+		if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
 			spin_unlock(&fastopenq->lock);
 			NET_INC_STATS_BH(sock_net(sk),
 					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
@@ -254,7 +249,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
 		fastopenq->rskq_rst_head = req1->dl_next;
 		fastopenq->qlen--;
 		spin_unlock(&fastopenq->lock);
-		reqsk_free(req1);
+		reqsk_put(req1);
 	}
 	return true;
 }
@@ -308,6 +303,7 @@ fastopen:
 	} else if (foc->len > 0) /* Client presents an invalid cookie */
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
 
+	valid_foc.exp = foc->exp;
 	*foc = valid_foc;
 	return false;
 }
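tcp_fastopen_queue_check() now reads the expiry from req1->rsk_timer.expires but still compares it with time_after(), the kernel's wraparound-safe jiffies comparison. The classic idiom, reproduced here for illustration:

    #include <stdio.h>

    typedef unsigned long jiffies_t;
    /* True if a is after b, even across a counter wrap. */
    #define time_after(a, b)  ((long)((b) - (a)) < 0)

    int main(void)
    {
    	jiffies_t now = 2;			/* jiffies just wrapped */
    	jiffies_t expires = (jiffies_t)-16;	/* set before the wrap */

    	/* A plain ">" thinks the timer is far in the future; the
    	 * signed-difference trick sees that it already expired. */
    	printf("naive not-expired: %d, time_after expired: %d\n",
    	       expires > now, time_after(now, expires));
    	return 0;
    }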
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1d5a30a90adf..67476f085e48 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -300,8 +300,7 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-static void tcp_illinois_info(struct sock *sk, u32 ext,
-			      struct sk_buff *skb)
+static int tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 {
 	const struct illinois *ca = inet_csk_ca(sk);
 
@@ -318,8 +317,9 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
 			do_div(t, info.tcpv_rttcnt);
 			info.tcpv_rtt = t;
 		}
-		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+		return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
 	}
+	return 0;
 }
 
 static struct tcp_congestion_ops tcp_illinois __read_mostly = {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8fdd27b17306..3a4d9b34bed4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -866,7 +866,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
866/* This must be called before lost_out is incremented */ 866/* This must be called before lost_out is incremented */
867static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) 867static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
868{ 868{
869 if ((tp->retransmit_skb_hint == NULL) || 869 if (!tp->retransmit_skb_hint ||
870 before(TCP_SKB_CB(skb)->seq, 870 before(TCP_SKB_CB(skb)->seq,
871 TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) 871 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
872 tp->retransmit_skb_hint = skb; 872 tp->retransmit_skb_hint = skb;
@@ -1256,7 +1256,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
1256 fack_count += pcount; 1256 fack_count += pcount;
1257 1257
1258 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1258 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1259 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && 1259 if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
1260 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1260 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1261 tp->lost_cnt_hint += pcount; 1261 tp->lost_cnt_hint += pcount;
1262 1262
@@ -1535,7 +1535,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1535 if (!before(TCP_SKB_CB(skb)->seq, end_seq)) 1535 if (!before(TCP_SKB_CB(skb)->seq, end_seq))
1536 break; 1536 break;
1537 1537
1538 if ((next_dup != NULL) && 1538 if (next_dup &&
1539 before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { 1539 before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
1540 in_sack = tcp_match_skb_to_sack(sk, skb, 1540 in_sack = tcp_match_skb_to_sack(sk, skb,
1541 next_dup->start_seq, 1541 next_dup->start_seq,
@@ -1551,7 +1551,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1551 if (in_sack <= 0) { 1551 if (in_sack <= 0) {
1552 tmp = tcp_shift_skb_data(sk, skb, state, 1552 tmp = tcp_shift_skb_data(sk, skb, state,
1553 start_seq, end_seq, dup_sack); 1553 start_seq, end_seq, dup_sack);
1554 if (tmp != NULL) { 1554 if (tmp) {
1555 if (tmp != skb) { 1555 if (tmp != skb) {
1556 skb = tmp; 1556 skb = tmp;
1557 continue; 1557 continue;
@@ -1614,7 +1614,7 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
1614 struct tcp_sacktag_state *state, 1614 struct tcp_sacktag_state *state,
1615 u32 skip_to_seq) 1615 u32 skip_to_seq)
1616{ 1616{
1617 if (next_dup == NULL) 1617 if (!next_dup)
1618 return skb; 1618 return skb;
1619 1619
1620 if (before(next_dup->start_seq, skip_to_seq)) { 1620 if (before(next_dup->start_seq, skip_to_seq)) {
@@ -1783,7 +1783,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1783 if (tcp_highest_sack_seq(tp) == cache->end_seq) { 1783 if (tcp_highest_sack_seq(tp) == cache->end_seq) {
1784 /* ...but better entrypoint exists! */ 1784 /* ...but better entrypoint exists! */
1785 skb = tcp_highest_sack(sk); 1785 skb = tcp_highest_sack(sk);
1786 if (skb == NULL) 1786 if (!skb)
1787 break; 1787 break;
1788 state.fack_count = tp->fackets_out; 1788 state.fack_count = tp->fackets_out;
1789 cache++; 1789 cache++;
@@ -1798,7 +1798,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1798 1798
1799 if (!before(start_seq, tcp_highest_sack_seq(tp))) { 1799 if (!before(start_seq, tcp_highest_sack_seq(tp))) {
1800 skb = tcp_highest_sack(sk); 1800 skb = tcp_highest_sack(sk);
1801 if (skb == NULL) 1801 if (!skb)
1802 break; 1802 break;
1803 state.fack_count = tp->fackets_out; 1803 state.fack_count = tp->fackets_out;
1804 } 1804 }
@@ -3099,14 +3099,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3099 if (sacked & TCPCB_SACKED_RETRANS) 3099 if (sacked & TCPCB_SACKED_RETRANS)
3100 tp->retrans_out -= acked_pcount; 3100 tp->retrans_out -= acked_pcount;
3101 flag |= FLAG_RETRANS_DATA_ACKED; 3101 flag |= FLAG_RETRANS_DATA_ACKED;
3102 } else { 3102 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3103 last_ackt = skb->skb_mstamp; 3103 last_ackt = skb->skb_mstamp;
3104 WARN_ON_ONCE(last_ackt.v64 == 0); 3104 WARN_ON_ONCE(last_ackt.v64 == 0);
3105 if (!first_ackt.v64) 3105 if (!first_ackt.v64)
3106 first_ackt = last_ackt; 3106 first_ackt = last_ackt;
3107 3107
3108 if (!(sacked & TCPCB_SACKED_ACKED)) 3108 reord = min(pkts_acked, reord);
3109 reord = min(pkts_acked, reord);
3110 if (!after(scb->end_seq, tp->high_seq)) 3109 if (!after(scb->end_seq, tp->high_seq))
3111 flag |= FLAG_ORIG_SACK_ACKED; 3110 flag |= FLAG_ORIG_SACK_ACKED;
3112 } 3111 }
@@ -3321,6 +3320,36 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
3321 return flag; 3320 return flag;
3322} 3321}
3323 3322
3323/* Return true if we're currently rate-limiting out-of-window ACKs and
3324 * thus shouldn't send a dupack right now. We rate-limit dupacks in
3325 * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
3326 * attacks that send repeated SYNs or ACKs for the same connection. To
3327 * do this, we do not send a duplicate SYNACK or ACK if the remote
3328 * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
3329 */
3330bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
3331 int mib_idx, u32 *last_oow_ack_time)
3332{
3333 /* Data packets without SYNs are not likely part of an ACK loop. */
3334 if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
3335 !tcp_hdr(skb)->syn)
3336 goto not_rate_limited;
3337
3338 if (*last_oow_ack_time) {
3339 s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
3340
3341 if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
3342 NET_INC_STATS_BH(net, mib_idx);
3343 return true; /* rate-limited: don't send yet! */
3344 }
3345 }
3346
3347 *last_oow_ack_time = tcp_time_stamp;
3348
3349not_rate_limited:
3350 return false; /* not rate-limited: go ahead, send dupack now! */
3351}
3352
3324/* RFC 5961 7 [ACK Throttling] */ 3353/* RFC 5961 7 [ACK Throttling] */
3325static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) 3354static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3326{ 3355{
@@ -3572,6 +3601,23 @@ old_ack:
3572 return 0; 3601 return 0;
3573} 3602}
3574 3603
3604static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
3605 bool syn, struct tcp_fastopen_cookie *foc,
3606 bool exp_opt)
3607{
3608 /* Valid only in SYN or SYN-ACK with an even length. */
3609 if (!foc || !syn || len < 0 || (len & 1))
3610 return;
3611
3612 if (len >= TCP_FASTOPEN_COOKIE_MIN &&
3613 len <= TCP_FASTOPEN_COOKIE_MAX)
3614 memcpy(foc->val, cookie, len);
3615 else if (len != 0)
3616 len = -1;
3617 foc->len = len;
3618 foc->exp = exp_opt;
3619}
3620
3575/* Look for tcp options. Normally only called on SYN and SYNACK packets. 3621/* Look for tcp options. Normally only called on SYN and SYNACK packets.
3576 * But, this can also be called on packets in the established flow when 3622 * But, this can also be called on packets in the established flow when
3577 * the fast version below fails. 3623 * the fast version below fails.
@@ -3661,21 +3707,22 @@ void tcp_parse_options(const struct sk_buff *skb,
3661 */ 3707 */
3662 break; 3708 break;
3663#endif 3709#endif
3710 case TCPOPT_FASTOPEN:
3711 tcp_parse_fastopen_option(
3712 opsize - TCPOLEN_FASTOPEN_BASE,
3713 ptr, th->syn, foc, false);
3714 break;
3715
3664 case TCPOPT_EXP: 3716 case TCPOPT_EXP:
3665 /* Fast Open option shares code 254 using a 3717 /* Fast Open option shares code 254 using a
3666 * 16 bits magic number. It's valid only in 3718 * 16 bits magic number.
3667 * SYN or SYN-ACK with an even size.
3668 */ 3719 */
3669 if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || 3720 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
3670 get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || 3721 get_unaligned_be16(ptr) ==
3671 foc == NULL || !th->syn || (opsize & 1)) 3722 TCPOPT_FASTOPEN_MAGIC)
3672 break; 3723 tcp_parse_fastopen_option(opsize -
3673 foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; 3724 TCPOLEN_EXP_FASTOPEN_BASE,
3674 if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && 3725 ptr + 2, th->syn, foc, true);
3675 foc->len <= TCP_FASTOPEN_COOKIE_MAX)
3676 memcpy(foc->val, ptr + 2, foc->len);
3677 else if (foc->len != 0)
3678 foc->len = -1;
3679 break; 3726 break;
3680 3727
3681 } 3728 }
@@ -4639,7 +4686,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
4639 struct sk_buff *head; 4686 struct sk_buff *head;
4640 u32 start, end; 4687 u32 start, end;
4641 4688
4642 if (skb == NULL) 4689 if (!skb)
4643 return; 4690 return;
4644 4691
4645 start = TCP_SKB_CB(skb)->seq; 4692 start = TCP_SKB_CB(skb)->seq;
@@ -4770,7 +4817,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
4770 return false; 4817 return false;
4771 4818
4772 /* If we filled the congestion window, do not expand. */ 4819 /* If we filled the congestion window, do not expand. */
4773 if (tp->packets_out >= tp->snd_cwnd) 4820 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
4774 return false; 4821 return false;
4775 4822
4776 return true; 4823 return true;
@@ -4798,6 +4845,8 @@ static void tcp_check_space(struct sock *sk)
4798{ 4845{
4799 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { 4846 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
4800 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); 4847 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4848 /* pairs with tcp_poll() */
4849 smp_mb__after_atomic();
4801 if (sk->sk_socket && 4850 if (sk->sk_socket &&
4802 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) 4851 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
4803 tcp_new_space(sk); 4852 tcp_new_space(sk);
@@ -5094,7 +5143,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5094{ 5143{
5095 struct tcp_sock *tp = tcp_sk(sk); 5144 struct tcp_sock *tp = tcp_sk(sk);
5096 5145
5097 if (unlikely(sk->sk_rx_dst == NULL)) 5146 if (unlikely(!sk->sk_rx_dst))
5098 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); 5147 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5099 /* 5148 /*
5100 * Header prediction. 5149 * Header prediction.
@@ -5291,7 +5340,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5291 5340
5292 tcp_set_state(sk, TCP_ESTABLISHED); 5341 tcp_set_state(sk, TCP_ESTABLISHED);
5293 5342
5294 if (skb != NULL) { 5343 if (skb) {
5295 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); 5344 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5296 security_inet_conn_established(sk, skb); 5345 security_inet_conn_established(sk, skb);
5297 } 5346 }
@@ -5329,8 +5378,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5329{ 5378{
5330 struct tcp_sock *tp = tcp_sk(sk); 5379 struct tcp_sock *tp = tcp_sk(sk);
5331 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; 5380 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
5332 u16 mss = tp->rx_opt.mss_clamp; 5381 u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
5333 bool syn_drop; 5382 bool syn_drop = false;
5334 5383
5335 if (mss == tp->rx_opt.user_mss) { 5384 if (mss == tp->rx_opt.user_mss) {
5336 struct tcp_options_received opt; 5385 struct tcp_options_received opt;
@@ -5342,16 +5391,25 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5342 mss = opt.mss_clamp; 5391 mss = opt.mss_clamp;
5343 } 5392 }
5344 5393
5345 if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ 5394 if (!tp->syn_fastopen) {
5395 /* Ignore an unsolicited cookie */
5346 cookie->len = -1; 5396 cookie->len = -1;
5397 } else if (tp->total_retrans) {
5398 /* SYN timed out and the SYN-ACK neither has a cookie nor
5399 * acknowledges data. Presumably the remote received only
5400 * the retransmitted (regular) SYNs: either the original
5401 * SYN-data or the corresponding SYN-ACK was dropped.
5402 */
5403 syn_drop = (cookie->len < 0 && data);
5404 } else if (cookie->len < 0 && !tp->syn_data) {
5405 /* We requested a cookie but didn't get it. If we did not use
5406 * the (old) exp opt format then try so next time (try_exp=1).
5407 * Otherwise we go back to use the RFC7413 opt (try_exp=2).
5408 */
5409 try_exp = tp->syn_fastopen_exp ? 2 : 1;
5410 }
5347 5411
5348 /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably 5412 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
5349 * the remote receives only the retransmitted (regular) SYNs: either
5350 * the original SYN-data or the corresponding SYN-ACK is lost.
5351 */
5352 syn_drop = (cookie->len <= 0 && data && tp->total_retrans);
5353
5354 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
5355 5413
5356 if (data) { /* Retransmit unacked data in SYN */ 5414 if (data) { /* Retransmit unacked data in SYN */
5357 tcp_for_write_queue_from(data, sk) { 5415 tcp_for_write_queue_from(data, sk) {
@@ -5660,11 +5718,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5660 } 5718 }
5661 5719
5662 req = tp->fastopen_rsk; 5720 req = tp->fastopen_rsk;
5663 if (req != NULL) { 5721 if (req) {
5664 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && 5722 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
5665 sk->sk_state != TCP_FIN_WAIT1); 5723 sk->sk_state != TCP_FIN_WAIT1);
5666 5724
5667 if (tcp_check_req(sk, skb, req, NULL, true) == NULL) 5725 if (!tcp_check_req(sk, skb, req, true))
5668 goto discard; 5726 goto discard;
5669 } 5727 }
5670 5728
@@ -5750,7 +5808,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5750 * ACK we have received, this would have acknowledged 5808 * ACK we have received, this would have acknowledged
5751 * our SYNACK so stop the SYNACK timer. 5809 * our SYNACK so stop the SYNACK timer.
5752 */ 5810 */
5753 if (req != NULL) { 5811 if (req) {
5754 /* Return RST if ack_seq is invalid. 5812 /* Return RST if ack_seq is invalid.
5755 * Note that RFC793 only says to generate a 5813 * Note that RFC793 only says to generate a
5756 * DUPACK for it but for TCP Fast Open it seems 5814 * DUPACK for it but for TCP Fast Open it seems
@@ -5912,6 +5970,80 @@ static void tcp_ecn_create_request(struct request_sock *req,
5912 inet_rsk(req)->ecn_ok = 1; 5970 inet_rsk(req)->ecn_ok = 1;
5913} 5971}
5914 5972
5973static void tcp_openreq_init(struct request_sock *req,
5974 const struct tcp_options_received *rx_opt,
5975 struct sk_buff *skb, const struct sock *sk)
5976{
5977 struct inet_request_sock *ireq = inet_rsk(req);
5978
5979 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
5980 req->cookie_ts = 0;
5981 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
5982 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
5983 tcp_rsk(req)->snt_synack = tcp_time_stamp;
5984 tcp_rsk(req)->last_oow_ack_time = 0;
5985 req->mss = rx_opt->mss_clamp;
5986 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
5987 ireq->tstamp_ok = rx_opt->tstamp_ok;
5988 ireq->sack_ok = rx_opt->sack_ok;
5989 ireq->snd_wscale = rx_opt->snd_wscale;
5990 ireq->wscale_ok = rx_opt->wscale_ok;
5991 ireq->acked = 0;
5992 ireq->ecn_ok = 0;
5993 ireq->ir_rmt_port = tcp_hdr(skb)->source;
5994 ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
5995 ireq->ir_mark = inet_request_mark(sk, skb);
5996}
5997
5998struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
5999 struct sock *sk_listener)
6000{
6001 struct request_sock *req = reqsk_alloc(ops, sk_listener);
6002
6003 if (req) {
6004 struct inet_request_sock *ireq = inet_rsk(req);
6005
6006 kmemcheck_annotate_bitfield(ireq, flags);
6007 ireq->opt = NULL;
6008 atomic64_set(&ireq->ir_cookie, 0);
6009 ireq->ireq_state = TCP_NEW_SYN_RECV;
6010 write_pnet(&ireq->ireq_net, sock_net(sk_listener));
6011 ireq->ireq_family = sk_listener->sk_family;
6012 }
6013
6014 return req;
6015}
6016EXPORT_SYMBOL(inet_reqsk_alloc);
6017
6018/*
6019 * Return true if a syncookie should be sent
6020 */
6021static bool tcp_syn_flood_action(struct sock *sk,
6022 const struct sk_buff *skb,
6023 const char *proto)
6024{
6025 const char *msg = "Dropping request";
6026 bool want_cookie = false;
6027 struct listen_sock *lopt;
6028
6029#ifdef CONFIG_SYN_COOKIES
6030 if (sysctl_tcp_syncookies) {
6031 msg = "Sending cookies";
6032 want_cookie = true;
6033 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
6034 } else
6035#endif
6036 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
6037
6038 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
6039 if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
6040 lopt->synflood_warned = 1;
6041 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
6042 proto, ntohs(tcp_hdr(skb)->dest), msg);
6043 }
6044 return want_cookie;
6045}
6046
5915int tcp_conn_request(struct request_sock_ops *rsk_ops, 6047int tcp_conn_request(struct request_sock_ops *rsk_ops,
5916 const struct tcp_request_sock_ops *af_ops, 6048 const struct tcp_request_sock_ops *af_ops,
5917 struct sock *sk, struct sk_buff *skb) 6049 struct sock *sk, struct sk_buff *skb)
@@ -5949,7 +6081,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop;
 	}
 
-	req = inet_reqsk_alloc(rsk_ops);
+	req = inet_reqsk_alloc(rsk_ops, sk);
 	if (!req)
 		goto drop;
 
@@ -5966,6 +6098,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 
+	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
+	inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+
 	af_ops->init_req(req, sk, skb);
 
 	if (security_inet_conn_request(sk, skb, req))
@@ -6038,7 +6173,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (err || want_cookie)
 		goto drop_and_free;
 
-		tcp_rsk(req)->listener = NULL;
+		tcp_rsk(req)->tfo_listener = false;
 		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
 
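The move of tcp_syn_flood_action() into this file keeps the cookie decision next to its only caller, tcp_conn_request(). A userspace model of the decision it makes (the sysctl stand-in and listener struct are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

static int sysctl_syncookies = 1;	/* 0=off, 1=on when queue full, 2=always */

struct listener { bool synflood_warned; int port; };

static bool syn_flood_action(struct listener *l)
{
	bool want_cookie = false;

	if (sysctl_syncookies)
		want_cookie = true;
	if (!l->synflood_warned && sysctl_syncookies != 2) {
		l->synflood_warned = true;
		fprintf(stderr, "Possible SYN flooding on port %d. %s.\n",
			l->port, want_cookie ? "Sending cookies" : "Dropping request");
	}
	return want_cookie;
}

int main(void)
{
	struct listener l = { .synflood_warned = false, .port = 80 };

	printf("want_cookie=%d\n", syn_flood_action(&l));
	printf("want_cookie=%d\n", syn_flood_action(&l));	/* warns only once */
	return 0;
}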
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5a2dfed4783b..fc1c658ec6c1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -122,7 +122,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
-	    (twp == NULL || (sysctl_tcp_tw_reuse &&
+	    (!twp || (sysctl_tcp_tw_reuse &&
 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 		if (tp->write_seq == 0)
@@ -189,7 +189,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	if (!inet->inet_saddr)
 		inet->inet_saddr = fl4->saddr;
-	inet->inet_rcv_saddr = inet->inet_saddr;
+	sk_rcv_saddr_set(sk, inet->inet_saddr);
 
 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 		/* Reset inherited state */
@@ -204,7 +204,7 @@
 	tcp_fetch_timewait_stamp(sk, &rt->dst);
 
 	inet->inet_dport = usin->sin_port;
-	inet->inet_daddr = daddr;
+	sk_daddr_set(sk, daddr);
 
 	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet_opt)
@@ -310,6 +310,34 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
 		dst->ops->redirect(dst, sk, skb);
 }
 
+
+/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
+void tcp_req_err(struct sock *sk, u32 seq)
+{
+	struct request_sock *req = inet_reqsk(sk);
+	struct net *net = sock_net(sk);
+
+	/* ICMPs are not backlogged, hence we cannot get
+	 * an established socket here.
+	 */
+	WARN_ON(req->sk);
+
+	if (seq != tcp_rsk(req)->snt_isn) {
+		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+		reqsk_put(req);
+	} else {
+		/*
+		 * Still in SYN_RECV, just remove it silently.
+		 * There is no good way to pass the error to the newly
+		 * created socket, and POSIX does not want network
+		 * errors returned from accept().
+		 */
+		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+	}
+}
+EXPORT_SYMBOL(tcp_req_err);
+
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition. If err < 0 then the socket should
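tcp_req_err() above encodes a reference-counting contract: the lookup that found the request sock took a reference, and every exit path must either drop it with reqsk_put() or hand it to inet_csk_reqsk_queue_drop(). A plain-C model of that contract (types and names are stand-ins):

#include <stdio.h>

struct mini_req { int refcnt; unsigned snt_isn; };

static void reqsk_put(struct mini_req *req) { req->refcnt--; }

static void queue_drop(struct mini_req *req)
{
	reqsk_put(req);		/* unlinking from the listener drops the reference too */
}

static void req_err(struct mini_req *req, unsigned seq)
{
	if (seq != req->snt_isn)
		reqsk_put(req);		/* bogus ICMP: just let go of our hold */
	else
		queue_drop(req);	/* real error: silently kill the request */
}

int main(void)
{
	struct mini_req req = { .refcnt = 1, .snt_isn = 42 };

	req_err(&req, 7);	/* out-of-window ICMP */
	printf("refcnt=%d\n", req.refcnt);	/* back to 0 either way */
	return 0;
}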
@@ -343,8 +371,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	int err;
 	struct net *net = dev_net(icmp_skb->dev);
 
-	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
-			 iph->saddr, th->source, inet_iif(icmp_skb));
+	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
+				       th->dest, iph->saddr, ntohs(th->source),
+				       inet_iif(icmp_skb));
 	if (!sk) {
 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 		return;
@@ -353,6 +382,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		inet_twsk_put(inet_twsk(sk));
 		return;
 	}
+	seq = ntohl(th->seq);
+	if (sk->sk_state == TCP_NEW_SYN_RECV)
+		return tcp_req_err(sk, seq);
 
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
@@ -374,7 +406,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
 	icsk = inet_csk(sk);
 	tp = tcp_sk(sk);
-	seq = ntohl(th->seq);
 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 	fastopen = tp->fastopen_rsk;
 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
@@ -458,42 +489,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	}
 
 	switch (sk->sk_state) {
-		struct request_sock *req, **prev;
-	case TCP_LISTEN:
-		if (sock_owned_by_user(sk))
-			goto out;
-
-		req = inet_csk_search_req(sk, &prev, th->dest,
-					  iph->daddr, iph->saddr);
-		if (!req)
-			goto out;
-
-		/* ICMPs are not backlogged, hence we cannot get
-		   an established socket here.
-		 */
-		WARN_ON(req->sk);
-
-		if (seq != tcp_rsk(req)->snt_isn) {
-			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
-			goto out;
-		}
-
-		/*
-		 * Still in SYN_RECV, just remove it silently.
-		 * There is no good way to pass the error to the newly
-		 * created socket, and POSIX does not want network
-		 * errors returned from accept().
-		 */
-		inet_csk_reqsk_queue_drop(sk, req, prev);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-		goto out;
-
 	case TCP_SYN_SENT:
 	case TCP_SYN_RECV:
 		/* Only in fast or simultaneous open. If a fast open socket is
 		 * is already accepted it is treated as a connected one below.
 		 */
-		if (fastopen && fastopen->sk == NULL)
+		if (fastopen && !fastopen->sk)
 			break;
 
 		if (!sock_owned_by_user(sk)) {
@@ -647,7 +648,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 		if (!key)
 			goto release_sk1;
 
-		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
+		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
 			goto release_sk1;
 	} else {
@@ -855,35 +856,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-/*
- * Return true if a syncookie should be sent
- */
-bool tcp_syn_flood_action(struct sock *sk,
-			  const struct sk_buff *skb,
-			  const char *proto)
-{
-	const char *msg = "Dropping request";
-	bool want_cookie = false;
-	struct listen_sock *lopt;
-
-#ifdef CONFIG_SYN_COOKIES
-	if (sysctl_tcp_syncookies) {
-		msg = "Sending cookies";
-		want_cookie = true;
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
-	} else
-#endif
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
-
-	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
-	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
-		lopt->synflood_warned = 1;
-		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
-			proto, ntohs(tcp_hdr(skb)->dest), msg);
-	}
-	return want_cookie;
-}
-EXPORT_SYMBOL(tcp_syn_flood_action);
 
 #ifdef CONFIG_TCP_MD5SIG
 /*
@@ -897,10 +869,10 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 					 const union tcp_md5_addr *addr,
 					 int family)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
 	unsigned int size = sizeof(struct in_addr);
-	struct tcp_md5sig_info *md5sig;
+	const struct tcp_md5sig_info *md5sig;
 
 	/* caller either holds rcu_read_lock() or socket lock */
 	md5sig = rcu_dereference_check(tp->md5sig_info,
@@ -923,24 +895,15 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 EXPORT_SYMBOL(tcp_md5_do_lookup);
 
 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
-					 struct sock *addr_sk)
+					 const struct sock *addr_sk)
 {
-	union tcp_md5_addr *addr;
+	const union tcp_md5_addr *addr;
 
-	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
+	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 	return tcp_md5_do_lookup(sk, addr, AF_INET);
 }
 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 
-static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
-						      struct request_sock *req)
-{
-	union tcp_md5_addr *addr;
-
-	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
-	return tcp_md5_do_lookup(sk, addr, AF_INET);
-}
-
 /* This can be called on a newly created socket, from other files */
 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
@@ -1101,8 +1064,8 @@ clear_hash_noput:
 	return 1;
 }
 
-int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
-			const struct sock *sk, const struct request_sock *req,
+int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+			const struct sock *sk,
 			const struct sk_buff *skb)
 {
 	struct tcp_md5sig_pool *hp;
@@ -1110,12 +1073,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
 	const struct tcphdr *th = tcp_hdr(skb);
 	__be32 saddr, daddr;
 
-	if (sk) {
-		saddr = inet_sk(sk)->inet_saddr;
-		daddr = inet_sk(sk)->inet_daddr;
-	} else if (req) {
-		saddr = inet_rsk(req)->ir_loc_addr;
-		daddr = inet_rsk(req)->ir_rmt_addr;
+	if (sk) { /* valid for establish/request sockets */
+		saddr = sk->sk_rcv_saddr;
+		daddr = sk->sk_daddr;
 	} else {
 		const struct iphdr *iph = ip_hdr(skb);
 		saddr = iph->saddr;
@@ -1152,8 +1112,9 @@ clear_hash_noput:
 }
 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
 
-static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
-				      const struct sk_buff *skb)
+/* Called with rcu_read_lock() */
+static bool tcp_v4_inbound_md5_hash(struct sock *sk,
+				    const struct sk_buff *skb)
 {
 	/*
 	 * This gets called for each TCP segment that arrives
@@ -1193,7 +1154,7 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
 	 */
 	genhash = tcp_v4_md5_hash_skb(newhash,
 				      hash_expected,
-				      NULL, NULL, skb);
+				      NULL, skb);
 
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
@@ -1205,28 +1166,16 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
 	}
 	return false;
 }
-
-static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
-{
-	bool ret;
-
-	rcu_read_lock();
-	ret = __tcp_v4_inbound_md5_hash(sk, skb);
-	rcu_read_unlock();
-
-	return ret;
-}
-
 #endif
 
-static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
+static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
 			    struct sk_buff *skb)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 
-	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
-	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
-	ireq->no_srccheck = inet_sk(sk)->transparent;
+	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
 	ireq->opt = tcp_v4_save_options(skb);
 }
 
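tcp_v4_init_req() now writes the peer addresses through sk_daddr_set()/sk_rcv_saddr_set() into fields shared by full sockets and request sockets, which is what lets tcp_v4_md5_hash_skb() above take a plain struct sock for either kind. A minimal model of that sharing (simplified types, not the real struct sock_common layout):

#include <stdint.h>
#include <stdio.h>

struct sock_common { uint32_t daddr, rcv_saddr; };

static void sk_daddr_set(struct sock_common *skc, uint32_t a)     { skc->daddr = a; }
static void sk_rcv_saddr_set(struct sock_common *skc, uint32_t a) { skc->rcv_saddr = a; }

int main(void)
{
	struct sock_common full_sk = {0}, req_sk = {0};

	sk_daddr_set(&full_sk, 0x0a000001);	/* established socket */
	sk_daddr_set(&req_sk,  0x0a000001);	/* request socket, same helper */
	sk_rcv_saddr_set(&full_sk, 0x0a000002);
	printf("%x %x\n", full_sk.daddr, req_sk.daddr);
	return 0;
}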
@@ -1259,7 +1208,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.mss_clamp	=	TCP_MSS_DEFAULT,
 #ifdef CONFIG_TCP_MD5SIG
-	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
+	.req_md5_lookup	=	tcp_v4_md5_lookup,
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
 #endif
 	.init_req	=	tcp_v4_init_req,
@@ -1318,8 +1267,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newtp		      = tcp_sk(newsk);
 	newinet		      = inet_sk(newsk);
 	ireq		      = inet_rsk(req);
-	newinet->inet_daddr   = ireq->ir_rmt_addr;
-	newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+	sk_daddr_set(newsk, ireq->ir_rmt_addr);
+	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
 	newinet->inet_saddr   = ireq->ir_loc_addr;
 	inet_opt	      = ireq->opt;
 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
@@ -1356,7 +1305,7 @@
 	/* Copy over the MD5 key from the original socket */
 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
 				AF_INET);
-	if (key != NULL) {
+	if (key) {
 		/*
 		 * We're using one, so create a matching key
 		 * on the newsk structure. If we fail to get
@@ -1391,15 +1340,18 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
 
 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcphdr *th = tcp_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
 	const struct iphdr *iph = ip_hdr(skb);
+	struct request_sock *req;
 	struct sock *nsk;
-	struct request_sock **prev;
-	/* Find possible connection requests. */
-	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
-						       iph->saddr, iph->daddr);
-	if (req)
-		return tcp_check_req(sk, skb, req, prev, false);
+
+	req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
+	if (req) {
+		nsk = tcp_check_req(sk, skb, req, false);
+		if (!nsk)
+			reqsk_put(req);
+		return nsk;
+	}
 
 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
 				      th->source, iph->daddr, th->dest, inet_iif(skb));
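The rewritten tcp_v4_hnd_req() illustrates the new ownership rule: inet_csk_search_req() returns a referenced request sock, and tcp_check_req() either consumes that reference by attaching a child or leaves it for the caller to drop. The same rule on plain-C stand-ins:

#include <stdio.h>
#include <stdlib.h>

struct req { int refcnt; };

static void reqsk_put(struct req *r)
{
	if (--r->refcnt == 0)
		free(r);
}

/* returns a child (taking over the req reference) or NULL */
static void *check_req(struct req *r, int accept)
{
	if (accept)
		return r;	/* reference now owned by the child */
	return NULL;		/* caller still owns the reference */
}

int main(void)
{
	struct req *r = calloc(1, sizeof(*r));

	r->refcnt = 1;		/* reference handed to us by the lookup */
	if (!check_req(r, 0))
		reqsk_put(r);	/* only the NULL path may put it */
	printf("done\n");
	return 0;
}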
@@ -1439,7 +1391,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		sk_mark_napi_id(sk, skb);
 		if (dst) {
 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
-			    dst->ops->check(dst, 0) == NULL) {
+			    !dst->ops->check(dst, 0)) {
 				dst_release(dst);
 				sk->sk_rx_dst = NULL;
 			}
@@ -1517,8 +1469,8 @@ void tcp_v4_early_demux(struct sk_buff *skb)
 	if (sk) {
 		skb->sk = sk;
 		skb->destructor = sock_edemux;
-		if (sk->sk_state != TCP_TIME_WAIT) {
-			struct dst_entry *dst = sk->sk_rx_dst;
+		if (sk_fullsock(sk)) {
+			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
 
 			if (dst)
 				dst = dst_check(dst, 0);
@@ -1734,7 +1686,7 @@ do_time_wait:
 							iph->daddr, th->dest,
 							inet_iif(skb));
 		if (sk2) {
-			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_deschedule(inet_twsk(sk));
 			inet_twsk_put(inet_twsk(sk));
 			sk = sk2;
 			goto process;
@@ -1846,7 +1798,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	if (inet_csk(sk)->icsk_bind_hash)
 		inet_put_port(sk);
 
-	BUG_ON(tp->fastopen_rsk != NULL);
+	BUG_ON(tp->fastopen_rsk);
 
 	/* If socket is aborted during connect operation */
 	tcp_free_fastopen_req(tp);
@@ -1904,13 +1856,13 @@ get_req:
 		}
 		sk	  = sk_nulls_next(st->syn_wait_sk);
 		st->state = TCP_SEQ_STATE_LISTENING;
-		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	} else {
 		icsk = inet_csk(sk);
-		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
 			goto start_req;
-		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 		sk = sk_nulls_next(sk);
 	}
 get_sk:
@@ -1922,7 +1874,7 @@ get_sk:
 			goto out;
 		}
 		icsk = inet_csk(sk);
-		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
 start_req:
 			st->uid		= sock_i_uid(sk);
@@ -1931,7 +1883,7 @@ start_req:
 			st->sbucket	= 0;
 			goto get_req;
 		}
-		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	}
 	spin_unlock_bh(&ilb->lock);
 	st->offset = 0;
@@ -2150,7 +2102,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 	case TCP_SEQ_STATE_OPENREQ:
 		if (v) {
 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
-			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+			spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 		}
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
@@ -2204,17 +2156,17 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
 }
 EXPORT_SYMBOL(tcp_proc_unregister);
 
-static void get_openreq4(const struct sock *sk, const struct request_sock *req,
+static void get_openreq4(const struct request_sock *req,
 			 struct seq_file *f, int i, kuid_t uid)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
-	long delta = req->expires - jiffies;
+	long delta = req->rsk_timer.expires - jiffies;
 
 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
 		i,
 		ireq->ir_loc_addr,
-		ntohs(inet_sk(sk)->inet_sport),
+		ireq->ir_num,
 		ireq->ir_rmt_addr,
 		ntohs(ireq->ir_rmt_port),
 		TCP_SYN_RECV,
@@ -2225,7 +2177,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
 		from_kuid_munged(seq_user_ns(f), uid),
 		0,  /* non standard timer */
 		0, /* open_requests have no inode */
-		atomic_read(&sk->sk_refcnt),
+		0,
 		req);
 }
 
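get_openreq4() now derives the retransmit countdown from the per-request timer itself rather than from a dedicated req->expires field. The computation, modelled with plain integers standing in for jiffies and the kernel timer:

#include <stdio.h>

int main(void)
{
	unsigned long jiffies = 1000;			/* current tick */
	unsigned long rsk_timer_expires = 1250;		/* timer deadline */
	long delta = (long)(rsk_timer_expires - jiffies);

	printf("ticks until SYNACK retransmit: %ld\n", delta);	/* may go <= 0 */
	return 0;
}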
@@ -2291,9 +2243,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
 			       struct seq_file *f, int i)
 {
+	long delta = tw->tw_timer.expires - jiffies;
 	__be32 dest, src;
 	__u16 destp, srcp;
-	s32 delta = tw->tw_ttd - inet_tw_time_stamp();
 
 	dest = tw->tw_daddr;
 	src = tw->tw_rcv_saddr;
@@ -2332,7 +2284,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
 		get_tcp4_sock(v, seq, st->num);
 		break;
 	case TCP_SEQ_STATE_OPENREQ:
-		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
+		get_openreq4(v, seq, st->num, st->uid);
 		break;
 	}
 out:
@@ -2460,6 +2412,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	}
 	net->ipv4.sysctl_tcp_ecn = 2;
 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
 	return 0;
 
 fail:
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index e5f41bd5ec1b..a51d63a43e33 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -28,7 +28,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s
 
 struct tcp_fastopen_metrics {
 	u16	mss;
-	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */
+	u16	syn_loss:10,		/* Recurring Fast Open SYN losses */
+		try_exp:2;		/* Request w/ exp. option (once) */
 	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
 	struct	tcp_fastopen_cookie	cookie;
 };
@@ -40,6 +41,7 @@ struct tcp_fastopen_metrics {
 
 struct tcp_metrics_block {
 	struct tcp_metrics_block __rcu	*tcpm_next;
+	possible_net_t			tcpm_net;
 	struct inetpeer_addr		tcpm_saddr;
 	struct inetpeer_addr		tcpm_daddr;
 	unsigned long			tcpm_stamp;
@@ -52,6 +54,11 @@ struct tcp_metrics_block {
 	struct rcu_head			rcu_head;
 };
 
+static inline struct net *tm_net(struct tcp_metrics_block *tm)
+{
+	return read_pnet(&tm->tcpm_net);
+}
+
 static bool tcp_metric_locked(struct tcp_metrics_block *tm,
 			      enum tcp_metric_index idx)
 {
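tm_net() pairs with the write_pnet() call in tcpm_new() further down; possible_net_t compiles away entirely when CONFIG_NET_NS is off. A userspace sketch of the pattern (the types here are stand-ins for the kernel's):

#include <stdio.h>

#define CONFIG_NET_NS 1

struct net { int id; };

#if CONFIG_NET_NS
typedef struct { struct net *net; } possible_net_t;
static void write_pnet(possible_net_t *p, struct net *n) { p->net = n; }
static struct net *read_pnet(const possible_net_t *p)    { return p->net; }
#else
/* zero-size when namespaces are compiled out: everything is in init_net */
#endif

int main(void)
{
	struct net netns = { .id = 7 };
	possible_net_t slot;

	write_pnet(&slot, &netns);	/* like tcpm_new() stamping a block */
	printf("block belongs to netns %d\n", read_pnet(&slot)->id);
	return 0;
}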
@@ -74,23 +81,20 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
 static bool addr_same(const struct inetpeer_addr *a,
 		      const struct inetpeer_addr *b)
 {
-	const struct in6_addr *a6, *b6;
-
 	if (a->family != b->family)
 		return false;
 	if (a->family == AF_INET)
 		return a->addr.a4 == b->addr.a4;
-
-	a6 = (const struct in6_addr *) &a->addr.a6[0];
-	b6 = (const struct in6_addr *) &b->addr.a6[0];
-
-	return ipv6_addr_equal(a6, b6);
+	return ipv6_addr_equal(&a->addr.in6, &b->addr.in6);
 }
 
 struct tcpm_hash_bucket {
 	struct tcp_metrics_block __rcu	*chain;
 };
 
+static struct tcpm_hash_bucket	*tcp_metrics_hash __read_mostly;
+static unsigned int		tcp_metrics_hash_log __read_mostly;
+
 static DEFINE_SPINLOCK(tcp_metrics_lock);
 
 static void tcpm_suck_dst(struct tcp_metrics_block *tm,
@@ -128,6 +132,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
 	if (fastopen_clear) {
 		tm->tcpm_fastopen.mss = 0;
 		tm->tcpm_fastopen.syn_loss = 0;
+		tm->tcpm_fastopen.try_exp = 0;
+		tm->tcpm_fastopen.cookie.exp = false;
 		tm->tcpm_fastopen.cookie.len = 0;
 	}
 }
@@ -143,6 +149,9 @@ static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst
 #define TCP_METRICS_RECLAIM_DEPTH	5
 #define TCP_METRICS_RECLAIM_PTR	(struct tcp_metrics_block *) 0x1UL
 
+#define deref_locked(p)	\
+	rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))
+
 static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
 					  struct inetpeer_addr *saddr,
 					  struct inetpeer_addr *daddr,
@@ -171,9 +180,9 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
 	if (unlikely(reclaim)) {
 		struct tcp_metrics_block *oldest;
 
-		oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
-		for (tm = rcu_dereference(oldest->tcpm_next); tm;
-		     tm = rcu_dereference(tm->tcpm_next)) {
+		oldest = deref_locked(tcp_metrics_hash[hash].chain);
+		for (tm = deref_locked(oldest->tcpm_next); tm;
+		     tm = deref_locked(tm->tcpm_next)) {
 			if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
 				oldest = tm;
 		}
@@ -183,14 +192,15 @@
 		if (!tm)
 			goto out_unlock;
 	}
+	write_pnet(&tm->tcpm_net, net);
 	tm->tcpm_saddr = *saddr;
 	tm->tcpm_daddr = *daddr;
 
 	tcpm_suck_dst(tm, dst, true);
 
 	if (likely(!reclaim)) {
-		tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
-		rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+		tm->tcpm_next = tcp_metrics_hash[hash].chain;
+		rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
 	}
 
 out_unlock:
@@ -214,10 +224,11 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s
 	struct tcp_metrics_block *tm;
 	int depth = 0;
 
-	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
 	     tm = rcu_dereference(tm->tcpm_next)) {
 		if (addr_same(&tm->tcpm_saddr, saddr) &&
-		    addr_same(&tm->tcpm_daddr, daddr))
+		    addr_same(&tm->tcpm_daddr, daddr) &&
+		    net_eq(tm_net(tm), net))
 			break;
 		depth++;
 	}
@@ -242,8 +253,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 		break;
#if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		*(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr;
-		*(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
+		saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
+		daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr;
 		hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
 		break;
#endif
@@ -252,12 +263,14 @@
 	}
 
 	net = dev_net(dst->dev);
-	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+	hash ^= net_hash_mix(net);
+	hash = hash_32(hash, tcp_metrics_hash_log);
 
-	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
 	     tm = rcu_dereference(tm->tcpm_next)) {
 		if (addr_same(&tm->tcpm_saddr, &saddr) &&
-		    addr_same(&tm->tcpm_daddr, &daddr))
+		    addr_same(&tm->tcpm_daddr, &daddr) &&
+		    net_eq(tm_net(tm), net))
 			break;
 	}
 	tcpm_check_stamp(tm, dst);
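Because the metrics hash table is now global, the namespace has to contribute to bucket selection, otherwise identical peers in two namespaces would always land on the same chain before the net_eq() filter even runs. A userspace model of the new two-step hash (hash_32() and net_hash_mix() are simplified stand-ins):

#include <stdint.h>
#include <stdio.h>

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	return (val * 0x61C88647u) >> (32 - bits);	/* golden-ratio style hash */
}

static uint32_t net_hash_mix(const void *net)
{
	return (uint32_t)(uintptr_t)net;	/* per-namespace salt */
}

int main(void)
{
	int ns_a, ns_b;			/* two fake namespaces */
	uint32_t peer = 0x0a000001;	/* same destination in both */
	unsigned int log = 10;		/* tcp_metrics_hash_log stand-in */

	printf("bucket A=%u\n", hash_32(peer ^ net_hash_mix(&ns_a), log));
	printf("bucket B=%u\n", hash_32(peer ^ net_hash_mix(&ns_b), log));
	return 0;
}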
@@ -288,9 +301,9 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
 		hash = (__force unsigned int) daddr.addr.a4;
 	} else {
 		saddr.family = AF_INET6;
-		*(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr;
+		saddr.addr.in6 = tw->tw_v6_rcv_saddr;
 		daddr.family = AF_INET6;
-		*(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr;
+		daddr.addr.in6 = tw->tw_v6_daddr;
 		hash = ipv6_addr_hash(&tw->tw_v6_daddr);
 	}
 	}
@@ -299,12 +312,14 @@
 		return NULL;
 
 	net = twsk_net(tw);
-	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+	hash ^= net_hash_mix(net);
+	hash = hash_32(hash, tcp_metrics_hash_log);
 
-	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
 	     tm = rcu_dereference(tm->tcpm_next)) {
 		if (addr_same(&tm->tcpm_saddr, &saddr) &&
-		    addr_same(&tm->tcpm_daddr, &daddr))
+		    addr_same(&tm->tcpm_daddr, &daddr) &&
+		    net_eq(tm_net(tm), net))
 			break;
 	}
 	return tm;
@@ -336,9 +351,9 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 		hash = (__force unsigned int) daddr.addr.a4;
 	} else {
 		saddr.family = AF_INET6;
-		*(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr;
+		saddr.addr.in6 = sk->sk_v6_rcv_saddr;
 		daddr.family = AF_INET6;
-		*(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr;
+		daddr.addr.in6 = sk->sk_v6_daddr;
 		hash = ipv6_addr_hash(&sk->sk_v6_daddr);
 	}
 	}
@@ -347,7 +362,8 @@
 		return NULL;
 
 	net = dev_net(dst->dev);
-	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+	hash ^= net_hash_mix(net);
+	hash = hash_32(hash, tcp_metrics_hash_log);
 
 	tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
 	if (tm == TCP_METRICS_RECLAIM_PTR)
@@ -492,7 +508,7 @@ void tcp_init_metrics(struct sock *sk)
 	struct tcp_metrics_block *tm;
 	u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
-	if (dst == NULL)
+	if (!dst)
 		goto reset;
 
 	dst_confirm(dst);
@@ -700,6 +716,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 		if (tfom->mss)
 			*mss = tfom->mss;
 		*cookie = tfom->cookie;
+		if (cookie->len <= 0 && tfom->try_exp == 1)
+			cookie->exp = true;
 		*syn_loss = tfom->syn_loss;
 		*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
 	} while (read_seqretry(&fastopen_seqlock, seq));
@@ -708,7 +726,8 @@
 }
 
 void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
-			    struct tcp_fastopen_cookie *cookie, bool syn_lost)
+			    struct tcp_fastopen_cookie *cookie, bool syn_lost,
+			    u16 try_exp)
 {
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_metrics_block *tm;
@@ -725,6 +744,9 @@
 		tfom->mss = mss;
 		if (cookie && cookie->len > 0)
 			tfom->cookie = *cookie;
+		else if (try_exp > tfom->try_exp &&
+			 tfom->cookie.len <= 0 && !tfom->cookie.exp)
+			tfom->try_exp = try_exp;
 		if (syn_lost) {
 			++tfom->syn_loss;
 			tfom->last_syn_loss = jiffies;
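The new try_exp bits record, per destination, whether the next SYN should carry the experimental Fast Open option (1) or the RFC 7413 one (2); a cached cookie always takes precedence. A compact model of the update rule in tcp_fastopen_cache_set() above (field names simplified):

#include <stdbool.h>
#include <stdio.h>

struct fo_metrics { int cookie_len; bool cookie_exp; unsigned try_exp : 2; };

static void cache_set(struct fo_metrics *m, int new_cookie_len, unsigned try_exp)
{
	if (new_cookie_len > 0)
		m->cookie_len = new_cookie_len;		/* a real cookie wins */
	else if (try_exp > m->try_exp &&
		 m->cookie_len <= 0 && !m->cookie_exp)
		m->try_exp = try_exp;			/* remember what to request */
}

int main(void)
{
	struct fo_metrics m = {0};

	cache_set(&m, 0, 1);	/* no cookie yet: note "ask with exp. option" */
	printf("try_exp=%u\n", m.try_exp);
	cache_set(&m, 8, 0);	/* cookie learned: try_exp no longer matters */
	printf("cookie_len=%d\n", m.cookie_len);
	return 0;
}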
@@ -773,19 +795,19 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
 
 	switch (tm->tcpm_daddr.family) {
 	case AF_INET:
-		if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
-				tm->tcpm_daddr.addr.a4) < 0)
+		if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
+				    tm->tcpm_daddr.addr.a4) < 0)
 			goto nla_put_failure;
-		if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4,
-				tm->tcpm_saddr.addr.a4) < 0)
+		if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
+				    tm->tcpm_saddr.addr.a4) < 0)
 			goto nla_put_failure;
 		break;
 	case AF_INET6:
-		if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
-			    tm->tcpm_daddr.addr.a6) < 0)
+		if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
+				     &tm->tcpm_daddr.addr.in6) < 0)
 			goto nla_put_failure;
-		if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16,
-			    tm->tcpm_saddr.addr.a6) < 0)
+		if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
+				     &tm->tcpm_saddr.addr.in6) < 0)
 			goto nla_put_failure;
 		break;
 	default:
@@ -898,17 +920,19 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
 			  struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
+	unsigned int max_rows = 1U << tcp_metrics_hash_log;
 	unsigned int row, s_row = cb->args[0];
 	int s_col = cb->args[1], col = s_col;
 
 	for (row = s_row; row < max_rows; row++, s_col = 0) {
 		struct tcp_metrics_block *tm;
-		struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
+		struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;
 
 		rcu_read_lock();
 		for (col = 0, tm = rcu_dereference(hb->chain); tm;
 		     tm = rcu_dereference(tm->tcpm_next), col++) {
+			if (!net_eq(tm_net(tm), net))
+				continue;
 			if (col < s_col)
 				continue;
 			if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
@@ -933,7 +957,7 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
 	a = info->attrs[v4];
 	if (a) {
 		addr->family = AF_INET;
-		addr->addr.a4 = nla_get_be32(a);
+		addr->addr.a4 = nla_get_in_addr(a);
 		if (hash)
 			*hash = (__force unsigned int) addr->addr.a4;
 		return 0;
@@ -943,9 +967,9 @@
 		if (nla_len(a) != sizeof(struct in6_addr))
 			return -EINVAL;
 		addr->family = AF_INET6;
-		memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
+		addr->addr.in6 = nla_get_in6_addr(a);
 		if (hash)
-			*hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
+			*hash = ipv6_addr_hash(&addr->addr.in6);
 		return 0;
 	}
 	return optional ? 1 : -EAFNOSUPPORT;
@@ -994,13 +1018,15 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	if (!reply)
 		goto nla_put_failure;
 
-	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+	hash ^= net_hash_mix(net);
+	hash = hash_32(hash, tcp_metrics_hash_log);
 	ret = -ESRCH;
 	rcu_read_lock();
-	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
 	     tm = rcu_dereference(tm->tcpm_next)) {
 		if (addr_same(&tm->tcpm_daddr, &daddr) &&
-		    (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
+		    (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+		    net_eq(tm_net(tm), net)) {
 			ret = tcp_metrics_fill_info(msg, tm);
 			break;
 		}
@@ -1020,34 +1046,27 @@ out_free:
 	return ret;
 }
 
-#define deref_locked_genl(p)	\
-	rcu_dereference_protected(p, lockdep_genl_is_held() && \
-				     lockdep_is_held(&tcp_metrics_lock))
-
-#define deref_genl(p)	rcu_dereference_protected(p, lockdep_genl_is_held())
-
-static int tcp_metrics_flush_all(struct net *net)
+static void tcp_metrics_flush_all(struct net *net)
 {
-	unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
-	struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
+	unsigned int max_rows = 1U << tcp_metrics_hash_log;
+	struct tcpm_hash_bucket *hb = tcp_metrics_hash;
 	struct tcp_metrics_block *tm;
 	unsigned int row;
 
 	for (row = 0; row < max_rows; row++, hb++) {
+		struct tcp_metrics_block __rcu **pp;
 		spin_lock_bh(&tcp_metrics_lock);
-		tm = deref_locked_genl(hb->chain);
-		if (tm)
-			hb->chain = NULL;
-		spin_unlock_bh(&tcp_metrics_lock);
-		while (tm) {
-			struct tcp_metrics_block *next;
-
-			next = deref_genl(tm->tcpm_next);
-			kfree_rcu(tm, rcu_head);
-			tm = next;
+		pp = &hb->chain;
+		for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+			if (net_eq(tm_net(tm), net)) {
+				*pp = tm->tcpm_next;
+				kfree_rcu(tm, rcu_head);
+			} else {
+				pp = &tm->tcpm_next;
+			}
 		}
+		spin_unlock_bh(&tcp_metrics_lock);
 	}
-	return 0;
 }
 
 static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
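The rewritten tcp_metrics_flush_all() must leave entries belonging to other namespaces in place, so it walks each chain with a pointer-to-pointer instead of detaching the whole chain. The same unlink pattern on an ordinary singly-linked list (kfree_rcu() becomes a plain free() in this userspace model):

#include <stdio.h>
#include <stdlib.h>

struct node { int netns; struct node *next; };

static void flush_netns(struct node **pp, int netns)
{
	struct node *n;

	while ((n = *pp) != NULL) {
		if (n->netns == netns) {
			*pp = n->next;	/* unlink; predecessor keeps pointing on */
			free(n);	/* the kernel defers this with kfree_rcu() */
		} else {
			pp = &n->next;	/* advance past a survivor */
		}
	}
}

int main(void)
{
	struct node *head = NULL;
	int vals[] = {1, 2, 1, 3};

	for (int i = 3; i >= 0; i--) {
		struct node *n = malloc(sizeof(*n));
		n->netns = vals[i]; n->next = head; head = n;
	}
	flush_netns(&head, 1);
	for (struct node *n = head; n; n = n->next)
		printf("kept netns %d\n", n->netns);	/* 2 then 3 */
	return 0;
}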
@@ -1064,19 +1083,23 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	ret = parse_nl_addr(info, &daddr, &hash, 1);
 	if (ret < 0)
 		return ret;
-	if (ret > 0)
-		return tcp_metrics_flush_all(net);
+	if (ret > 0) {
+		tcp_metrics_flush_all(net);
+		return 0;
+	}
 	ret = parse_nl_saddr(info, &saddr);
 	if (ret < 0)
 		src = false;
 
-	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
-	hb = net->ipv4.tcp_metrics_hash + hash;
+	hash ^= net_hash_mix(net);
+	hash = hash_32(hash, tcp_metrics_hash_log);
+	hb = tcp_metrics_hash + hash;
 	pp = &hb->chain;
 	spin_lock_bh(&tcp_metrics_lock);
-	for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) {
+	for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
 		if (addr_same(&tm->tcpm_daddr, &daddr) &&
-		    (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
+		    (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+		    net_eq(tm_net(tm), net)) {
 			*pp = tm->tcpm_next;
 			kfree_rcu(tm, rcu_head);
 			found = true;
@@ -1126,6 +1149,9 @@ static int __net_init tcp_net_metrics_init(struct net *net)
 	size_t size;
 	unsigned int slots;
 
+	if (!net_eq(net, &init_net))
+		return 0;
+
 	slots = tcpmhash_entries;
 	if (!slots) {
 		if (totalram_pages >= 128 * 1024)
@@ -1134,14 +1160,14 @@
 			slots = 8 * 1024;
 	}
 
-	net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
-	size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
+	tcp_metrics_hash_log = order_base_2(slots);
+	size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
 
-	net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
-	if (!net->ipv4.tcp_metrics_hash)
-		net->ipv4.tcp_metrics_hash = vzalloc(size);
+	tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+	if (!tcp_metrics_hash)
+		tcp_metrics_hash = vzalloc(size);
 
-	if (!net->ipv4.tcp_metrics_hash)
+	if (!tcp_metrics_hash)
 		return -ENOMEM;
 
 	return 0;
@@ -1149,19 +1175,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)
 
 static void __net_exit tcp_net_metrics_exit(struct net *net)
 {
-	unsigned int i;
-
-	for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
-		struct tcp_metrics_block *tm, *next;
-
-		tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
-		while (tm) {
-			next = rcu_dereference_protected(tm->tcpm_next, 1);
-			kfree(tm);
-			tm = next;
-		}
-	}
-	kvfree(net->ipv4.tcp_metrics_hash);
+	tcp_metrics_flush_all(net);
 }
 
 static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
@@ -1175,16 +1189,10 @@ void __init tcp_metrics_init(void)
 
 	ret = register_pernet_subsys(&tcp_net_metrics_ops);
 	if (ret < 0)
-		goto cleanup;
+		panic("Could not allocate the tcp_metrics hash table\n");
+
 	ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
 					    tcp_metrics_nl_ops);
 	if (ret < 0)
-		goto cleanup_subsys;
-	return;
-
-cleanup_subsys:
-	unregister_pernet_subsys(&tcp_net_metrics_ops);
-
-cleanup:
-	return;
+		panic("Could not register tcp_metrics generic netlink\n");
 }
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index dd11ac7798c6..e5d7649136fc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly;
 
 struct inet_timewait_death_row tcp_death_row = {
 	.sysctl_max_tw_buckets = NR_FILE * 2,
-	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
-	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
 	.hashinfo	= &tcp_hashinfo,
-	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
-					    (unsigned long)&tcp_death_row),
-	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
-					     inet_twdr_twkill_work),
-/* Short-time timewait calendar */
-
-	.twcal_hand	= -1,
-	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
-					    (unsigned long)&tcp_death_row),
 };
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
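All of the removed initializers served the old shared timewait machinery: one hangman timer, a work item and a coarse calendar for the whole death row. They can go because each timewait socket now owns its timer. A minimal model of that ownership change (inet_twsk_schedule() here is a stand-in, not the kernel function):

#include <stdio.h>

struct tw_sock {
	unsigned long timer_expires;	/* was: a slot in a shared wheel */
};

static void inet_twsk_schedule(struct tw_sock *tw, unsigned long now,
			       unsigned long timeo)
{
	tw->timer_expires = now + timeo;	/* arm this socket's own timer */
}

int main(void)
{
	struct tw_sock tw;

	inet_twsk_schedule(&tw, 1000, 60);
	printf("fires at %lu\n", tw.timer_expires);
	return 0;
}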
@@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 		if (!th->fin ||
 		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
 kill_with_rst:
-			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_deschedule(tw);
 			inet_twsk_put(tw);
 			return TCP_TW_RST;
 		}
@@ -174,11 +163,9 @@ kill_with_rst:
 		if (tcp_death_row.sysctl_tw_recycle &&
 		    tcptw->tw_ts_recent_stamp &&
 		    tcp_tw_remember_stamp(tw))
-			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
-					   TCP_TIMEWAIT_LEN);
+			inet_twsk_schedule(tw, tw->tw_timeout);
 		else
-			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-					   TCP_TIMEWAIT_LEN);
+			inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
 		return TCP_TW_ACK;
 	}
 
@@ -211,13 +198,12 @@ kill_with_rst:
 		 */
 		if (sysctl_tcp_rfc1337 == 0) {
 kill:
-			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_deschedule(tw);
 			inet_twsk_put(tw);
 			return TCP_TW_SUCCESS;
 		}
 	}
-	inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-			   TCP_TIMEWAIT_LEN);
+	inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
 
 	if (tmp_opt.saw_tstamp) {
 		tcptw->tw_ts_recent	= tmp_opt.rcv_tsval;
@@ -267,8 +253,7 @@ kill:
 	 * Do not reschedule in the last case.
 	 */
 	if (paws_reject || th->ack)
-		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-				   TCP_TIMEWAIT_LEN);
+		inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
 
 	return tcp_timewait_check_oow_rate_limit(
 		tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
@@ -283,18 +268,17 @@ EXPORT_SYMBOL(tcp_timewait_state_process);
  */
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
-	struct inet_timewait_sock *tw = NULL;
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_timewait_sock *tw;
 	bool recycle_ok = false;
 
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
 
-	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
-		tw = inet_twsk_alloc(sk, state);
+	tw = inet_twsk_alloc(sk, &tcp_death_row, state);
 
-	if (tw != NULL) {
+	if (tw) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
 		struct inet_sock *inet = inet_sk(sk);
@@ -332,7 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 			struct tcp_md5sig_key *key;
 			tcptw->tw_md5_key = NULL;
 			key = tp->af_specific->md5_lookup(sk, sk);
-			if (key != NULL) {
+			if (key) {
 				tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
 				if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
 					BUG();
@@ -355,8 +339,7 @@
 			timeo = TCP_TIMEWAIT_LEN;
 		}
 
-		inet_twsk_schedule(tw, &tcp_death_row, timeo,
-				   TCP_TIMEWAIT_LEN);
+		inet_twsk_schedule(tw, timeo);
 		inet_twsk_put(tw);
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
@@ -454,7 +437,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 {
 	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
 
-	if (newsk != NULL) {
+	if (newsk) {
 		const struct inet_request_sock *ireq = inet_rsk(req);
 		struct tcp_request_sock *treq = tcp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -572,7 +555,6 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
 
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req,
-			   struct request_sock **prev,
 			   bool fastopen)
 {
 	struct tcp_options_received tmp_opt;
@@ -629,9 +611,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
 					  &tcp_rsk(req)->last_oow_ack_time) &&
 
-		    !inet_rtx_syn_ack(sk, req))
-			req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
-					   TCP_RTO_MAX) + jiffies;
+		    !inet_rtx_syn_ack(sk, req)) {
+			unsigned long expires = jiffies;
+
+			expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
+				       TCP_RTO_MAX);
+			if (!fastopen)
+				mod_timer_pending(&req->rsk_timer, expires);
+			else
+				req->rsk_timer.expires = expires;
+		}
 		return NULL;
 	}
 
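The branch above is subtle: for a normal request the SYNACK retransmit timer is already live, so mod_timer_pending() re-arms it only if it has not fired yet, while a Fast Open request has no pending timer and its expiry field can be written directly. A small model of that distinction (the timer struct is a stand-in):

#include <stdbool.h>
#include <stdio.h>

struct timer { bool pending; unsigned long expires; };

/* like mod_timer_pending(): only touch a timer that is still armed */
static void mod_timer_pending(struct timer *t, unsigned long when)
{
	if (t->pending)
		t->expires = when;
}

int main(void)
{
	struct timer armed = { .pending = true,  .expires = 100 };
	struct timer fired = { .pending = false, .expires = 100 };

	mod_timer_pending(&armed, 200);	/* re-armed to the new deadline */
	mod_timer_pending(&fired, 200);	/* too late: stays untouched */
	printf("%lu %lu\n", armed.expires, fired.expires);	/* 200 100 */
	return 0;
}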
@@ -763,13 +752,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
763 * socket is created, wait for troubles. 752 * socket is created, wait for troubles.
764 */ 753 */
765 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); 754 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
766 if (child == NULL) 755 if (!child)
767 goto listen_overflow; 756 goto listen_overflow;
768 757
769 inet_csk_reqsk_queue_unlink(sk, req, prev); 758 inet_csk_reqsk_queue_drop(sk, req);
770 inet_csk_reqsk_queue_removed(sk, req);
771
772 inet_csk_reqsk_queue_add(sk, req, child); 759 inet_csk_reqsk_queue_add(sk, req, child);
760 /* Warning: caller must not call reqsk_put(req);
761 * child stole last reference on it.
762 */
773 return child; 763 return child;
774 764
775listen_overflow: 765listen_overflow:
@@ -791,7 +781,7 @@ embryonic_reset:
791 tcp_reset(sk); 781 tcp_reset(sk);
792 } 782 }
793 if (!fastopen) { 783 if (!fastopen) {
794 inet_csk_reqsk_queue_drop(sk, req, prev); 784 inet_csk_reqsk_queue_drop(sk, req);
795 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 785 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
796 } 786 }
797 return NULL; 787 return NULL;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 9d7930ba8e0f..3f7c2fca5431 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -29,8 +29,8 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
29 } 29 }
30} 30}
31 31
32struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, 32static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
33 netdev_features_t features) 33 netdev_features_t features)
34{ 34{
35 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 35 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
36 return ERR_PTR(-EINVAL); 36 return ERR_PTR(-EINVAL);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a2a796c5536b..a369e8a70b2c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -518,17 +518,26 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
518 518
519 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { 519 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
520 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; 520 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
521 u8 *p = (u8 *)ptr;
522 u32 len; /* Fast Open option length */
523
524 if (foc->exp) {
525 len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
526 *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
527 TCPOPT_FASTOPEN_MAGIC);
528 p += TCPOLEN_EXP_FASTOPEN_BASE;
529 } else {
530 len = TCPOLEN_FASTOPEN_BASE + foc->len;
531 *p++ = TCPOPT_FASTOPEN;
532 *p++ = len;
533 }
521 534
522 *ptr++ = htonl((TCPOPT_EXP << 24) | 535 memcpy(p, foc->val, foc->len);
523 ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | 536 if ((len & 3) == 2) {
524 TCPOPT_FASTOPEN_MAGIC); 537 p[foc->len] = TCPOPT_NOP;
525 538 p[foc->len + 1] = TCPOPT_NOP;
526 memcpy(ptr, foc->val, foc->len);
527 if ((foc->len & 3) == 2) {
528 u8 *align = ((u8 *)ptr) + foc->len;
529 align[0] = align[1] = TCPOPT_NOP;
530 } 539 }
531 ptr += (foc->len + 3) >> 2; 540 ptr += (len + 3) >> 2;
532 } 541 }
533} 542}
534 543
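For context on the option-writing change above: the cookie can now go out either as the RFC 7413 Fast Open option (kind 34) or in the older experimental encoding (kind 254 plus the 0xF989 magic), and the (len & 3) == 2 test pads the two-byte remainder with NOPs so the option area stays 32-bit aligned. The following is a hedged userspace sketch of that byte layout, with the option constants written out literally; encode_tfo() is a made-up helper for illustration, not a kernel function.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define TCPOPT_NOP                1
#define TCPOPT_EXP                254
#define TCPOPT_FASTOPEN           34
#define TCPOPT_FASTOPEN_MAGIC     0xF989
#define TCPOLEN_FASTOPEN_BASE     2    /* kind + len */
#define TCPOLEN_EXP_FASTOPEN_BASE 4    /* kind + len + 2-byte magic */

/* Encode a TFO cookie into buf, returning the number of bytes written
 * (always a multiple of 4, NOP-padded like the hunk above). */
static size_t encode_tfo(uint8_t *buf, const uint8_t *cookie,
			 uint8_t cookie_len, int experimental)
{
	uint8_t *p = buf;
	uint32_t len;

	if (experimental) {
		len = TCPOLEN_EXP_FASTOPEN_BASE + cookie_len;
		*p++ = TCPOPT_EXP;
		*p++ = (uint8_t)len;
		*p++ = TCPOPT_FASTOPEN_MAGIC >> 8;   /* 0xF9 */
		*p++ = TCPOPT_FASTOPEN_MAGIC & 0xff; /* 0x89 */
	} else {
		len = TCPOLEN_FASTOPEN_BASE + cookie_len;
		*p++ = TCPOPT_FASTOPEN;
		*p++ = (uint8_t)len;
	}
	memcpy(p, cookie, cookie_len);
	p += cookie_len;
	if ((len & 3) == 2) {          /* two bytes short of a word */
		*p++ = TCPOPT_NOP;
		*p++ = TCPOPT_NOP;
	}
	return (size_t)(p - buf);
}

int main(void)
{
	uint8_t cookie[8] = {1, 2, 3, 4, 5, 6, 7, 8};
	uint8_t buf[24];
	size_t n = encode_tfo(buf, cookie, sizeof(cookie), 0);

	printf("standard TFO option: %zu bytes on the wire\n", n);
	return 0;
}
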
@@ -565,7 +574,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
565 opts->mss = tcp_advertise_mss(sk); 574 opts->mss = tcp_advertise_mss(sk);
566 remaining -= TCPOLEN_MSS_ALIGNED; 575 remaining -= TCPOLEN_MSS_ALIGNED;
567 576
568 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { 577 if (likely(sysctl_tcp_timestamps && !*md5)) {
569 opts->options |= OPTION_TS; 578 opts->options |= OPTION_TS;
570 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; 579 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
571 opts->tsecr = tp->rx_opt.ts_recent; 580 opts->tsecr = tp->rx_opt.ts_recent;
@@ -583,13 +592,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
583 } 592 }
584 593
585 if (fastopen && fastopen->cookie.len >= 0) { 594 if (fastopen && fastopen->cookie.len >= 0) {
586 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; 595 u32 need = fastopen->cookie.len;
596
597 need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
598 TCPOLEN_FASTOPEN_BASE;
587 need = (need + 3) & ~3U; /* Align to 32 bits */ 599 need = (need + 3) & ~3U; /* Align to 32 bits */
588 if (remaining >= need) { 600 if (remaining >= need) {
589 opts->options |= OPTION_FAST_OPEN_COOKIE; 601 opts->options |= OPTION_FAST_OPEN_COOKIE;
590 opts->fastopen_cookie = &fastopen->cookie; 602 opts->fastopen_cookie = &fastopen->cookie;
591 remaining -= need; 603 remaining -= need;
592 tp->syn_fastopen = 1; 604 tp->syn_fastopen = 1;
605 tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
593 } 606 }
594 } 607 }
595 608
@@ -601,15 +614,14 @@ static unsigned int tcp_synack_options(struct sock *sk,
601 struct request_sock *req, 614 struct request_sock *req,
602 unsigned int mss, struct sk_buff *skb, 615 unsigned int mss, struct sk_buff *skb,
603 struct tcp_out_options *opts, 616 struct tcp_out_options *opts,
604 struct tcp_md5sig_key **md5, 617 const struct tcp_md5sig_key *md5,
605 struct tcp_fastopen_cookie *foc) 618 struct tcp_fastopen_cookie *foc)
606{ 619{
607 struct inet_request_sock *ireq = inet_rsk(req); 620 struct inet_request_sock *ireq = inet_rsk(req);
608 unsigned int remaining = MAX_TCP_OPTION_SPACE; 621 unsigned int remaining = MAX_TCP_OPTION_SPACE;
609 622
610#ifdef CONFIG_TCP_MD5SIG 623#ifdef CONFIG_TCP_MD5SIG
611 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); 624 if (md5) {
612 if (*md5) {
613 opts->options |= OPTION_MD5; 625 opts->options |= OPTION_MD5;
614 remaining -= TCPOLEN_MD5SIG_ALIGNED; 626 remaining -= TCPOLEN_MD5SIG_ALIGNED;
615 627
@@ -620,8 +632,6 @@ static unsigned int tcp_synack_options(struct sock *sk,
620 */ 632 */
621 ireq->tstamp_ok &= !ireq->sack_ok; 633 ireq->tstamp_ok &= !ireq->sack_ok;
622 } 634 }
623#else
624 *md5 = NULL;
625#endif 635#endif
626 636
627 /* We always send an MSS option. */ 637 /* We always send an MSS option. */
@@ -645,7 +655,10 @@ static unsigned int tcp_synack_options(struct sock *sk,
645 remaining -= TCPOLEN_SACKPERM_ALIGNED; 655 remaining -= TCPOLEN_SACKPERM_ALIGNED;
646 } 656 }
647 if (foc != NULL && foc->len >= 0) { 657 if (foc != NULL && foc->len >= 0) {
648 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; 658 u32 need = foc->len;
659
660 need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
661 TCPOLEN_FASTOPEN_BASE;
649 need = (need + 3) & ~3U; /* Align to 32 bits */ 662 need = (need + 3) & ~3U; /* Align to 32 bits */
650 if (remaining >= need) { 663 if (remaining >= need) {
651 opts->options |= OPTION_FAST_OPEN_COOKIE; 664 opts->options |= OPTION_FAST_OPEN_COOKIE;
@@ -989,7 +1002,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
989 if (md5) { 1002 if (md5) {
990 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 1003 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
991 tp->af_specific->calc_md5_hash(opts.hash_location, 1004 tp->af_specific->calc_md5_hash(opts.hash_location,
992 md5, sk, NULL, skb); 1005 md5, sk, skb);
993 } 1006 }
994#endif 1007#endif
995 1008
@@ -1151,7 +1164,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1151 1164
1152 /* Get a new skb... force flag on. */ 1165 /* Get a new skb... force flag on. */
1153 buff = sk_stream_alloc_skb(sk, nsize, gfp); 1166 buff = sk_stream_alloc_skb(sk, nsize, gfp);
1154 if (buff == NULL) 1167 if (!buff)
1155 return -ENOMEM; /* We'll just try again later. */ 1168 return -ENOMEM; /* We'll just try again later. */
1156 1169
1157 sk->sk_wmem_queued += buff->truesize; 1170 sk->sk_wmem_queued += buff->truesize;
@@ -1354,6 +1367,8 @@ void tcp_mtup_init(struct sock *sk)
1354 icsk->icsk_af_ops->net_header_len; 1367 icsk->icsk_af_ops->net_header_len;
1355 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); 1368 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1356 icsk->icsk_mtup.probe_size = 0; 1369 icsk->icsk_mtup.probe_size = 0;
1370 if (icsk->icsk_mtup.enabled)
1371 icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
1357} 1372}
1358EXPORT_SYMBOL(tcp_mtup_init); 1373EXPORT_SYMBOL(tcp_mtup_init);
1359 1374
@@ -1708,7 +1723,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1708 return tcp_fragment(sk, skb, len, mss_now, gfp); 1723 return tcp_fragment(sk, skb, len, mss_now, gfp);
1709 1724
1710 buff = sk_stream_alloc_skb(sk, 0, gfp); 1725 buff = sk_stream_alloc_skb(sk, 0, gfp);
1711 if (unlikely(buff == NULL)) 1726 if (unlikely(!buff))
1712 return -ENOMEM; 1727 return -ENOMEM;
1713 1728
1714 sk->sk_wmem_queued += buff->truesize; 1729 sk->sk_wmem_queued += buff->truesize;
@@ -1752,20 +1767,23 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1752static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, 1767static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1753 bool *is_cwnd_limited, u32 max_segs) 1768 bool *is_cwnd_limited, u32 max_segs)
1754{ 1769{
1755 struct tcp_sock *tp = tcp_sk(sk);
1756 const struct inet_connection_sock *icsk = inet_csk(sk); 1770 const struct inet_connection_sock *icsk = inet_csk(sk);
1757 u32 send_win, cong_win, limit, in_flight; 1771 u32 age, send_win, cong_win, limit, in_flight;
1772 struct tcp_sock *tp = tcp_sk(sk);
1773 struct skb_mstamp now;
1774 struct sk_buff *head;
1758 int win_divisor; 1775 int win_divisor;
1759 1776
1760 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 1777 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1761 goto send_now; 1778 goto send_now;
1762 1779
1763 if (icsk->icsk_ca_state != TCP_CA_Open) 1780 if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
1764 goto send_now; 1781 goto send_now;
1765 1782
1766 /* Defer for less than two clock ticks. */ 1783 /* Avoid bursty behavior by allowing deferral
1767 if (tp->tso_deferred && 1784 * only if the last write was recent.
1768 (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) 1785 */
1786 if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
1769 goto send_now; 1787 goto send_now;
1770 1788
1771 in_flight = tcp_packets_in_flight(tp); 1789 in_flight = tcp_packets_in_flight(tp);
@@ -1807,11 +1825,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1807 goto send_now; 1825 goto send_now;
1808 } 1826 }
1809 1827
1810 /* Ok, it looks like it is advisable to defer. 1828 head = tcp_write_queue_head(sk);
1811 * Do not rearm the timer if already set to not break TCP ACK clocking. 1829 skb_mstamp_get(&now);
1812 */ 1830 age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
1813 if (!tp->tso_deferred) 1831 /* If next ACK is likely to come too late (half srtt), do not defer */
1814 tp->tso_deferred = 1 | (jiffies << 1); 1832 if (age < (tp->srtt_us >> 4))
1833 goto send_now;
1834
1835 /* Ok, it looks like it is advisable to defer. */
1815 1836
1816 if (cong_win < send_win && cong_win < skb->len) 1837 if (cong_win < send_win && cong_win < skb->len)
1817 *is_cwnd_limited = true; 1838 *is_cwnd_limited = true;
@@ -1819,10 +1840,34 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1819 return true; 1840 return true;
1820 1841
1821send_now: 1842send_now:
1822 tp->tso_deferred = 0;
1823 return false; 1843 return false;
1824} 1844}
1825 1845
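A subtlety in the new defer test: tp->srtt_us stores the smoothed RTT left-shifted by 3, so tp->srtt_us >> 4 is half an RTT in microseconds, and a head-of-queue skb older than that suggests the next ACK will arrive too late to be worth waiting for. A small standalone illustration of just that comparison follows; the helper name and sample values are invented, and the kernel derives the age from the head skb's timestamp instead.

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* srtt_us convention: smoothed RTT in microseconds, left-shifted by 3. */
static bool ack_likely_too_late(uint32_t head_age_us, uint32_t srtt_us_scaled)
{
	uint32_t half_rtt_us = srtt_us_scaled >> 4; /* (srtt << 3) >> 4 == srtt / 2 */

	return head_age_us < half_rtt_us; /* if so, send now instead of deferring */
}

int main(void)
{
	uint32_t srtt_scaled = 40000u << 3;  /* 40 ms smoothed RTT */

	printf("age 5ms  -> send now: %d\n",
	       ack_likely_too_late(5000, srtt_scaled));
	printf("age 30ms -> keep deferring: %d\n",
	       !ack_likely_too_late(30000, srtt_scaled));
	return 0;
}
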
1846static inline void tcp_mtu_check_reprobe(struct sock *sk)
1847{
1848 struct inet_connection_sock *icsk = inet_csk(sk);
1849 struct tcp_sock *tp = tcp_sk(sk);
1850 struct net *net = sock_net(sk);
1851 u32 interval;
1852 s32 delta;
1853
1854 interval = net->ipv4.sysctl_tcp_probe_interval;
1855 delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
1856 if (unlikely(delta >= interval * HZ)) {
1857 int mss = tcp_current_mss(sk);
1858
1859 /* Update current search range */
1860 icsk->icsk_mtup.probe_size = 0;
1861 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
1862 sizeof(struct tcphdr) +
1863 icsk->icsk_af_ops->net_header_len;
1864 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
1865
1866 /* Update probe time stamp */
1867 icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
1868 }
1869}
1870
1826/* Create a new MTU probe if we are ready. 1871/* Create a new MTU probe if we are ready.
1827 * MTU probe is regularly attempting to increase the path MTU by 1872 * MTU probe is regularly attempting to increase the path MTU by
1828 * deliberately sending larger packets. This discovers routing 1873 * deliberately sending larger packets. This discovers routing
@@ -1837,11 +1882,13 @@ static int tcp_mtu_probe(struct sock *sk)
1837 struct tcp_sock *tp = tcp_sk(sk); 1882 struct tcp_sock *tp = tcp_sk(sk);
1838 struct inet_connection_sock *icsk = inet_csk(sk); 1883 struct inet_connection_sock *icsk = inet_csk(sk);
1839 struct sk_buff *skb, *nskb, *next; 1884 struct sk_buff *skb, *nskb, *next;
1885 struct net *net = sock_net(sk);
1840 int len; 1886 int len;
1841 int probe_size; 1887 int probe_size;
1842 int size_needed; 1888 int size_needed;
1843 int copy; 1889 int copy;
1844 int mss_now; 1890 int mss_now;
1891 int interval;
1845 1892
1846 /* Not currently probing/verifying, 1893 /* Not currently probing/verifying,
1847 * not in recovery, 1894 * not in recovery,
@@ -1854,12 +1901,25 @@ static int tcp_mtu_probe(struct sock *sk)
1854 tp->rx_opt.num_sacks || tp->rx_opt.dsack) 1901 tp->rx_opt.num_sacks || tp->rx_opt.dsack)
1855 return -1; 1902 return -1;
1856 1903
1857 /* Very simple search strategy: just double the MSS. */ 1904 /* Use binary search for probe_size between tcp_base_mss
 1905 * and the current mss_clamp. If (search_high - search_low) is
 1906 * smaller than a threshold, back off from probing.
1907 */
1858 mss_now = tcp_current_mss(sk); 1908 mss_now = tcp_current_mss(sk);
1859 probe_size = 2 * tp->mss_cache; 1909 probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
1910 icsk->icsk_mtup.search_low) >> 1);
1860 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; 1911 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1861 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { 1912 interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
1862 /* TODO: set timer for probe_converge_event */ 1913 /* If we are actively reprobing when the reprobe timer
 1914 * expires, stick with the current probing process
 1915 * rather than resetting the search range to its original value.
1916 */
1917 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
1918 interval < net->ipv4.sysctl_tcp_probe_threshold) {
1919 /* Check whether enough time has elapsed for
1920 * another round of probing.
1921 */
1922 tcp_mtu_check_reprobe(sk);
1863 return -1; 1923 return -1;
1864 } 1924 }
1865 1925
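The replaced strategy is easiest to see in isolation: instead of doubling the MSS, each probe now targets the midpoint of [search_low, search_high], and probing backs off once the window narrows below the probe threshold. Below is a hedged simulation of that convergence, with plain integers standing in for the mtu/mss conversion helpers and the sysctl.

#include <stdio.h>

/* Simulate probe-size convergence: a delivered probe raises search_low,
 * a lost probe lowers search_high; stop below the threshold. */
int main(void)
{
	int search_low = 1024, search_high = 9000;
	int threshold = 160;     /* stand-in for sysctl_tcp_probe_threshold */
	int path_mtu = 4200;     /* the unknown value being hunted for */

	while (search_high - search_low >= threshold) {
		int probe = (search_high + search_low) / 2;

		if (probe <= path_mtu)
			search_low = probe;   /* probe got through */
		else
			search_high = probe;  /* probe was lost */
		printf("probe %d -> range [%d, %d]\n",
		       probe, search_low, search_high);
	}
	printf("converged near %d (true MTU %d)\n", search_low, path_mtu);
	return 0;
}
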
@@ -1881,7 +1941,8 @@ static int tcp_mtu_probe(struct sock *sk)
1881 } 1941 }
1882 1942
1883 /* We're allowed to probe. Build it now. */ 1943 /* We're allowed to probe. Build it now. */
1884 if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) 1944 nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
1945 if (!nskb)
1885 return -1; 1946 return -1;
1886 sk->sk_wmem_queued += nskb->truesize; 1947 sk->sk_wmem_queued += nskb->truesize;
1887 sk_mem_charge(sk, nskb->truesize); 1948 sk_mem_charge(sk, nskb->truesize);
@@ -2179,7 +2240,7 @@ void tcp_send_loss_probe(struct sock *sk)
2179 int mss = tcp_current_mss(sk); 2240 int mss = tcp_current_mss(sk);
2180 int err = -1; 2241 int err = -1;
2181 2242
2182 if (tcp_send_head(sk) != NULL) { 2243 if (tcp_send_head(sk)) {
2183 err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); 2244 err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2184 goto rearm_timer; 2245 goto rearm_timer;
2185 } 2246 }
@@ -2689,7 +2750,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2689 if (skb == tcp_send_head(sk)) 2750 if (skb == tcp_send_head(sk))
2690 break; 2751 break;
2691 /* we could do better than to assign each time */ 2752 /* we could do better than to assign each time */
2692 if (hole == NULL) 2753 if (!hole)
2693 tp->retransmit_skb_hint = skb; 2754 tp->retransmit_skb_hint = skb;
2694 2755
2695 /* Assume this retransmit will generate 2756 /* Assume this retransmit will generate
@@ -2713,7 +2774,7 @@ begin_fwd:
2713 if (!tcp_can_forward_retransmit(sk)) 2774 if (!tcp_can_forward_retransmit(sk))
2714 break; 2775 break;
2715 /* Backtrack if necessary to non-L'ed skb */ 2776 /* Backtrack if necessary to non-L'ed skb */
2716 if (hole != NULL) { 2777 if (hole) {
2717 skb = hole; 2778 skb = hole;
2718 hole = NULL; 2779 hole = NULL;
2719 } 2780 }
@@ -2721,7 +2782,7 @@ begin_fwd:
2721 goto begin_fwd; 2782 goto begin_fwd;
2722 2783
2723 } else if (!(sacked & TCPCB_LOST)) { 2784 } else if (!(sacked & TCPCB_LOST)) {
2724 if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) 2785 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
2725 hole = skb; 2786 hole = skb;
2726 continue; 2787 continue;
2727 2788
@@ -2751,43 +2812,65 @@ begin_fwd:
2751 } 2812 }
2752} 2813}
2753 2814
2754/* Send a fin. The caller locks the socket for us. This cannot be 2815/* We allow FIN packets to exceed memory limits to expedite
2755 * allowed to fail queueing a FIN frame under any circumstances. 2816 * connection tear down and (memory) recovery.
 2817 * Otherwise tcp_send_fin() could be tempted to either delay the FIN
 2818 * or even be forced to close the flow without any FIN.
2819 */
2820static void sk_forced_wmem_schedule(struct sock *sk, int size)
2821{
2822 int amt, status;
2823
2824 if (size <= sk->sk_forward_alloc)
2825 return;
2826 amt = sk_mem_pages(size);
2827 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2828 sk_memory_allocated_add(sk, amt, &status);
2829}
2830
2831/* Send a FIN. The caller locks the socket for us.
2832 * We should try to send a FIN packet really hard, but eventually give up.
2756 */ 2833 */
2757void tcp_send_fin(struct sock *sk) 2834void tcp_send_fin(struct sock *sk)
2758{ 2835{
2836 struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
2759 struct tcp_sock *tp = tcp_sk(sk); 2837 struct tcp_sock *tp = tcp_sk(sk);
2760 struct sk_buff *skb = tcp_write_queue_tail(sk);
2761 int mss_now;
2762 2838
2763 /* Optimization, tack on the FIN if we have a queue of 2839 /* Optimization: tack on the FIN if there is one skb in the write queue
2764 * unsent frames. But be careful about outgoing SACKS 2840 * that was not yet sent, or if we are under memory pressure.
2765 * and IP options. 2841 * Note: in the latter case, the FIN packet will be sent after a timeout,
 2842 * as the TCP stack thinks it has already been transmitted.
2766 */ 2843 */
2767 mss_now = tcp_current_mss(sk); 2844 if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
2768 2845coalesce:
2769 if (tcp_send_head(sk) != NULL) { 2846 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
2770 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; 2847 TCP_SKB_CB(tskb)->end_seq++;
2771 TCP_SKB_CB(skb)->end_seq++;
2772 tp->write_seq++; 2848 tp->write_seq++;
2849 if (!tcp_send_head(sk)) {
2850 /* This means tskb was already sent.
 2851 * Pretend we included the FIN on the previous transmit.
 2852 * We need to set tp->snd_nxt to the value it would have
 2853 * if the FIN had been sent. This is because the retransmit
 2854 * path does not change tp->snd_nxt.
2855 */
2856 tp->snd_nxt++;
2857 return;
2858 }
2773 } else { 2859 } else {
2774 /* Socket is locked, keep trying until memory is available. */ 2860 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
2775 for (;;) { 2861 if (unlikely(!skb)) {
2776 skb = alloc_skb_fclone(MAX_TCP_HEADER, 2862 if (tskb)
2777 sk->sk_allocation); 2863 goto coalesce;
2778 if (skb) 2864 return;
2779 break;
2780 yield();
2781 } 2865 }
2782
2783 /* Reserve space for headers and prepare control bits. */
2784 skb_reserve(skb, MAX_TCP_HEADER); 2866 skb_reserve(skb, MAX_TCP_HEADER);
2867 sk_forced_wmem_schedule(sk, skb->truesize);
2785 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ 2868 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2786 tcp_init_nondata_skb(skb, tp->write_seq, 2869 tcp_init_nondata_skb(skb, tp->write_seq,
2787 TCPHDR_ACK | TCPHDR_FIN); 2870 TCPHDR_ACK | TCPHDR_FIN);
2788 tcp_queue_skb(sk, skb); 2871 tcp_queue_skb(sk, skb);
2789 } 2872 }
2790 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); 2873 __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
2791} 2874}
2792 2875
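sk_forced_wmem_schedule() above deliberately has no failure path: it rounds the request up to whole accounting quanta and charges them unconditionally, so a FIN skb can always be queued even under memory pressure. A minimal userspace model of that rounding follows; SK_MEM_QUANTUM's value and the helper names are assumptions for illustration (the kernel's unit is page-sized).

#include <stdio.h>

#define SK_MEM_QUANTUM 4096 /* one accounting page, as on most arches */

/* Round a byte count up to whole quanta, like sk_mem_pages(). */
static int mem_pages(int bytes)
{
	return (bytes + SK_MEM_QUANTUM - 1) / SK_MEM_QUANTUM;
}

/* Forced variant: always grants, never fails (model of the new helper). */
static void forced_wmem_schedule(int *forward_alloc, int size)
{
	if (size <= *forward_alloc)
		return;                 /* already covered, nothing to charge */
	*forward_alloc += mem_pages(size) * SK_MEM_QUANTUM;
}

int main(void)
{
	int forward_alloc = 1000;

	forced_wmem_schedule(&forward_alloc, 6000);  /* needs two quanta */
	printf("forward_alloc now %d bytes\n", forward_alloc);
	return 0;
}
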
2793/* We get here when a process closes a file descriptor (either due to 2876/* We get here when a process closes a file descriptor (either due to
@@ -2828,14 +2911,14 @@ int tcp_send_synack(struct sock *sk)
2828 struct sk_buff *skb; 2911 struct sk_buff *skb;
2829 2912
2830 skb = tcp_write_queue_head(sk); 2913 skb = tcp_write_queue_head(sk);
2831 if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 2914 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2832 pr_debug("%s: wrong queue state\n", __func__); 2915 pr_debug("%s: wrong queue state\n", __func__);
2833 return -EFAULT; 2916 return -EFAULT;
2834 } 2917 }
2835 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { 2918 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
2836 if (skb_cloned(skb)) { 2919 if (skb_cloned(skb)) {
2837 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); 2920 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2838 if (nskb == NULL) 2921 if (!nskb)
2839 return -ENOMEM; 2922 return -ENOMEM;
2840 tcp_unlink_write_queue(skb, sk); 2923 tcp_unlink_write_queue(skb, sk);
2841 __skb_header_release(nskb); 2924 __skb_header_release(nskb);
@@ -2870,7 +2953,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2870 struct tcp_sock *tp = tcp_sk(sk); 2953 struct tcp_sock *tp = tcp_sk(sk);
2871 struct tcphdr *th; 2954 struct tcphdr *th;
2872 struct sk_buff *skb; 2955 struct sk_buff *skb;
2873 struct tcp_md5sig_key *md5; 2956 struct tcp_md5sig_key *md5 = NULL;
2874 int tcp_header_size; 2957 int tcp_header_size;
2875 int mss; 2958 int mss;
2876 2959
@@ -2883,7 +2966,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2883 skb_reserve(skb, MAX_TCP_HEADER); 2966 skb_reserve(skb, MAX_TCP_HEADER);
2884 2967
2885 skb_dst_set(skb, dst); 2968 skb_dst_set(skb, dst);
2886 security_skb_owned_by(skb, sk);
2887 2969
2888 mss = dst_metric_advmss(dst); 2970 mss = dst_metric_advmss(dst);
2889 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2971 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2896,7 +2978,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2896 else 2978 else
2897#endif 2979#endif
2898 skb_mstamp_get(&skb->skb_mstamp); 2980 skb_mstamp_get(&skb->skb_mstamp);
2899 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, 2981
2982#ifdef CONFIG_TCP_MD5SIG
2983 rcu_read_lock();
2984 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
2985#endif
2986 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
2900 foc) + sizeof(*th); 2987 foc) + sizeof(*th);
2901 2988
2902 skb_push(skb, tcp_header_size); 2989 skb_push(skb, tcp_header_size);
@@ -2927,12 +3014,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2927 3014
2928#ifdef CONFIG_TCP_MD5SIG 3015#ifdef CONFIG_TCP_MD5SIG
2929 /* Okay, we have all we need - do the md5 hash if needed */ 3016 /* Okay, we have all we need - do the md5 hash if needed */
2930 if (md5) { 3017 if (md5)
2931 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, 3018 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
2932 md5, NULL, req, skb); 3019 md5, req_to_sk(req), skb);
2933 } 3020 rcu_read_unlock();
2934#endif 3021#endif
2935 3022
3023 /* Do not fool tcpdump (if any), clean our debris */
3024 skb->tstamp.tv64 = 0;
2936 return skb; 3025 return skb;
2937} 3026}
2938EXPORT_SYMBOL(tcp_make_synack); 3027EXPORT_SYMBOL(tcp_make_synack);
@@ -2970,7 +3059,7 @@ static void tcp_connect_init(struct sock *sk)
2970 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); 3059 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
2971 3060
2972#ifdef CONFIG_TCP_MD5SIG 3061#ifdef CONFIG_TCP_MD5SIG
2973 if (tp->af_specific->md5_lookup(sk, sk) != NULL) 3062 if (tp->af_specific->md5_lookup(sk, sk))
2974 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; 3063 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
2975#endif 3064#endif
2976 3065
@@ -3256,7 +3345,7 @@ void tcp_send_ack(struct sock *sk)
3256 * sock. 3345 * sock.
3257 */ 3346 */
3258 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); 3347 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3259 if (buff == NULL) { 3348 if (!buff) {
3260 inet_csk_schedule_ack(sk); 3349 inet_csk_schedule_ack(sk);
3261 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; 3350 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3262 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 3351 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -3300,7 +3389,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3300 3389
3301 /* We don't queue it, tcp_transmit_skb() sets ownership. */ 3390 /* We don't queue it, tcp_transmit_skb() sets ownership. */
3302 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); 3391 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3303 if (skb == NULL) 3392 if (!skb)
3304 return -1; 3393 return -1;
3305 3394
3306 /* Reserve space for headers and set control bits. */ 3395 /* Reserve space for headers and set control bits. */
@@ -3331,8 +3420,8 @@ int tcp_write_wakeup(struct sock *sk)
3331 if (sk->sk_state == TCP_CLOSE) 3420 if (sk->sk_state == TCP_CLOSE)
3332 return -1; 3421 return -1;
3333 3422
3334 if ((skb = tcp_send_head(sk)) != NULL && 3423 skb = tcp_send_head(sk);
3335 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { 3424 if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
3336 int err; 3425 int err;
3337 unsigned int mss = tcp_current_mss(sk); 3426 unsigned int mss = tcp_current_mss(sk);
3338 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; 3427 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0732b787904e..8c65dc147d8b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
107 if (net->ipv4.sysctl_tcp_mtu_probing) { 107 if (net->ipv4.sysctl_tcp_mtu_probing) {
108 if (!icsk->icsk_mtup.enabled) { 108 if (!icsk->icsk_mtup.enabled) {
109 icsk->icsk_mtup.enabled = 1; 109 icsk->icsk_mtup.enabled = 1;
110 icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
110 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 111 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
111 } else { 112 } else {
112 struct net *net = sock_net(sk); 113 struct net *net = sock_net(sk);
@@ -166,7 +167,7 @@ static int tcp_write_timeout(struct sock *sk)
166 if (icsk->icsk_retransmits) { 167 if (icsk->icsk_retransmits) {
167 dst_negative_advice(sk); 168 dst_negative_advice(sk);
168 if (tp->syn_fastopen || tp->syn_data) 169 if (tp->syn_fastopen || tp->syn_data)
169 tcp_fastopen_cache_set(sk, 0, NULL, true); 170 tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
170 if (tp->syn_data) 171 if (tp->syn_data)
171 NET_INC_STATS_BH(sock_net(sk), 172 NET_INC_STATS_BH(sock_net(sk),
172 LINUX_MIB_TCPFASTOPENACTIVEFAIL); 173 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
@@ -326,7 +327,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
326 struct request_sock *req; 327 struct request_sock *req;
327 328
328 req = tcp_sk(sk)->fastopen_rsk; 329 req = tcp_sk(sk)->fastopen_rsk;
329 req->rsk_ops->syn_ack_timeout(sk, req); 330 req->rsk_ops->syn_ack_timeout(req);
330 331
331 if (req->num_timeout >= max_retries) { 332 if (req->num_timeout >= max_retries) {
332 tcp_write_err(sk); 333 tcp_write_err(sk);
@@ -538,19 +539,11 @@ static void tcp_write_timer(unsigned long data)
538 sock_put(sk); 539 sock_put(sk);
539} 540}
540 541
541/* 542void tcp_syn_ack_timeout(const struct request_sock *req)
542 * Timer for listening sockets
543 */
544
545static void tcp_synack_timer(struct sock *sk)
546{ 543{
547 inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, 544 struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
548 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
549}
550 545
551void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) 546 NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS);
552{
553 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
554} 547}
555EXPORT_SYMBOL(tcp_syn_ack_timeout); 548EXPORT_SYMBOL(tcp_syn_ack_timeout);
556 549
@@ -582,7 +575,7 @@ static void tcp_keepalive_timer (unsigned long data)
582 } 575 }
583 576
584 if (sk->sk_state == TCP_LISTEN) { 577 if (sk->sk_state == TCP_LISTEN) {
585 tcp_synack_timer(sk); 578 pr_err("Hmm... keepalive on a LISTEN ???\n");
586 goto out; 579 goto out;
587 } 580 }
588 581
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a6afde666ab1..c71a1b8f7bde 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -286,7 +286,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
286} 286}
287 287
288/* Extract info for Tcp socket info provided via netlink. */ 288/* Extract info for Tcp socket info provided via netlink. */
289void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) 289int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
290{ 290{
291 const struct vegas *ca = inet_csk_ca(sk); 291 const struct vegas *ca = inet_csk_ca(sk);
292 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 292 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
@@ -297,8 +297,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
297 .tcpv_minrtt = ca->minRTT, 297 .tcpv_minrtt = ca->minRTT,
298 }; 298 };
299 299
300 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); 300 return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
301 } 301 }
302 return 0;
302} 303}
303EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 304EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
304 305
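The void-to-int change here lets a failed nla_put() (typically -EMSGSIZE when the netlink skb runs out of tail room) propagate to the diag dump loop instead of being silently swallowed. A schematic of the pattern in standalone C, with invented names and a hypothetical 16-byte attribute standing in for the four-u32 struct tcpvegas_info:

#include <stdio.h>

#define EMSGSIZE 90

/* Hypothetical attribute writer: fails when the buffer is too small. */
static int put_attr(int room_left, int attr_size)
{
	return attr_size <= room_left ? 0 : -EMSGSIZE;
}

/* get_info-style callback: return the writer's result instead of
 * dropping it, so the caller can stop and signal a truncated dump. */
static int get_info(int room_left)
{
	return put_attr(room_left, 16); /* 16-byte vegas-style info blob */
}

int main(void)
{
	printf("plenty of room: %d\n", get_info(128));
	printf("skb full:       %d\n", get_info(8));
	return 0;
}
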
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 0531b99d8637..e8a6b33cc61d 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -19,6 +19,6 @@ void tcp_vegas_init(struct sock *sk);
19void tcp_vegas_state(struct sock *sk, u8 ca_state); 19void tcp_vegas_state(struct sock *sk, u8 ca_state);
20void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); 20void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
21void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); 21void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
22void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); 22int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
23 23
24#endif /* __TCP_VEGAS_H */ 24#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index bb63fba47d47..b3c57cceb990 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -256,8 +256,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
256} 256}
257 257
258/* Extract info for Tcp socket info provided via netlink. */ 258/* Extract info for Tcp socket info provided via netlink. */
259static void tcp_westwood_info(struct sock *sk, u32 ext, 259static int tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb)
260 struct sk_buff *skb)
261{ 260{
262 const struct westwood *ca = inet_csk_ca(sk); 261 const struct westwood *ca = inet_csk_ca(sk);
263 262
@@ -268,8 +267,9 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
268 .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), 267 .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
269 }; 268 };
270 269
271 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); 270 return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
272 } 271 }
272 return 0;
273} 273}
274 274
275static struct tcp_congestion_ops tcp_westwood __read_mostly = { 275static struct tcp_congestion_ops tcp_westwood __read_mostly = {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 97ef1f8b7be8..d10b7e0112eb 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -318,8 +318,8 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
318 inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); 318 inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
319} 319}
320 320
321static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, 321static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
322 unsigned int port) 322 unsigned int port)
323{ 323{
324 return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; 324 return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
325} 325}
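The tightened u32 return type matches what jhash actually yields: the hash mixes the bound address with a per-namespace salt and XORs in the port, so equal addresses in different netns land in different slots. A toy standalone rendition follows; mix32() is an arbitrary stand-in mixer, since jhash_1word() itself lives in the kernel.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for jhash_1word(): any decent 32-bit mixer works for the demo. */
static uint32_t mix32(uint32_t v, uint32_t seed)
{
	v ^= seed;
	v *= 0x9e3779b1u;   /* golden-ratio multiplier */
	v ^= v >> 16;
	return v;
}

/* Same shape as the hashed-port-address computation above. */
static uint32_t portaddr_hash(uint32_t saddr, uint32_t net_salt, uint16_t port)
{
	return mix32(saddr, net_salt) ^ port;
}

int main(void)
{
	printf("%08x\n", portaddr_hash(0x7f000001u, 0xdeadbeefu, 8080));
	return 0;
}
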
@@ -421,9 +421,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
421 return score; 421 return score;
422} 422}
423 423
424static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, 424static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
425 const __u16 lport, const __be32 faddr, 425 const __u16 lport, const __be32 faddr,
426 const __be16 fport) 426 const __be16 fport)
427{ 427{
428 static u32 udp_ehash_secret __read_mostly; 428 static u32 udp_ehash_secret __read_mostly;
429 429
@@ -433,7 +433,6 @@ static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
433 udp_ehash_secret + net_hash_mix(net)); 433 udp_ehash_secret + net_hash_mix(net));
434} 434}
435 435
436
437/* called with read_rcu_lock() */ 436/* called with read_rcu_lock() */
438static struct sock *udp4_lib_lookup2(struct net *net, 437static struct sock *udp4_lib_lookup2(struct net *net,
439 __be32 saddr, __be16 sport, 438 __be32 saddr, __be16 sport,
@@ -633,7 +632,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
633 632
634 sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, 633 sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
635 iph->saddr, uh->source, skb->dev->ifindex, udptable); 634 iph->saddr, uh->source, skb->dev->ifindex, udptable);
636 if (sk == NULL) { 635 if (!sk) {
637 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); 636 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
638 return; /* No socket for error */ 637 return; /* No socket for error */
639 } 638 }
@@ -873,8 +872,7 @@ out:
873} 872}
874EXPORT_SYMBOL(udp_push_pending_frames); 873EXPORT_SYMBOL(udp_push_pending_frames);
875 874
876int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 875int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
877 size_t len)
878{ 876{
879 struct inet_sock *inet = inet_sk(sk); 877 struct inet_sock *inet = inet_sk(sk);
880 struct udp_sock *up = udp_sk(sk); 878 struct udp_sock *up = udp_sk(sk);
@@ -1012,7 +1010,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1012 if (connected) 1010 if (connected)
1013 rt = (struct rtable *)sk_dst_check(sk, 0); 1011 rt = (struct rtable *)sk_dst_check(sk, 0);
1014 1012
1015 if (rt == NULL) { 1013 if (!rt) {
1016 struct net *net = sock_net(sk); 1014 struct net *net = sock_net(sk);
1017 1015
1018 fl4 = &fl4_stack; 1016 fl4 = &fl4_stack;
@@ -1136,7 +1134,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
1136 * sendpage interface can't pass. 1134 * sendpage interface can't pass.
1137 * This will succeed only when the socket is connected. 1135 * This will succeed only when the socket is connected.
1138 */ 1136 */
1139 ret = udp_sendmsg(NULL, sk, &msg, 0); 1137 ret = udp_sendmsg(sk, &msg, 0);
1140 if (ret < 0) 1138 if (ret < 0)
1141 return ret; 1139 return ret;
1142 } 1140 }
@@ -1172,7 +1170,6 @@ out:
1172 return ret; 1170 return ret;
1173} 1171}
1174 1172
1175
1176/** 1173/**
1177 * first_packet_length - return length of first packet in receive queue 1174 * first_packet_length - return length of first packet in receive queue
1178 * @sk: socket 1175 * @sk: socket
@@ -1254,8 +1251,8 @@ EXPORT_SYMBOL(udp_ioctl);
1254 * return it, otherwise we block. 1251 * return it, otherwise we block.
1255 */ 1252 */
1256 1253
1257int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 1254int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
1258 size_t len, int noblock, int flags, int *addr_len) 1255 int flags, int *addr_len)
1259{ 1256{
1260 struct inet_sock *inet = inet_sk(sk); 1257 struct inet_sock *inet = inet_sk(sk);
1261 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); 1258 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
@@ -1356,7 +1353,6 @@ csum_copy_err:
1356 goto try_again; 1353 goto try_again;
1357} 1354}
1358 1355
1359
1360int udp_disconnect(struct sock *sk, int flags) 1356int udp_disconnect(struct sock *sk, int flags)
1361{ 1357{
1362 struct inet_sock *inet = inet_sk(sk); 1358 struct inet_sock *inet = inet_sk(sk);
@@ -1523,7 +1519,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1523 1519
1524 /* if we're overly short, let UDP handle it */ 1520 /* if we're overly short, let UDP handle it */
1525 encap_rcv = ACCESS_ONCE(up->encap_rcv); 1521 encap_rcv = ACCESS_ONCE(up->encap_rcv);
1526 if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { 1522 if (skb->len > sizeof(struct udphdr) && encap_rcv) {
1527 int ret; 1523 int ret;
1528 1524
1529 /* Verify checksum before giving to encap */ 1525 /* Verify checksum before giving to encap */
@@ -1580,7 +1576,6 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1580 udp_lib_checksum_complete(skb)) 1576 udp_lib_checksum_complete(skb))
1581 goto csum_error; 1577 goto csum_error;
1582 1578
1583
1584 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 1579 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
1585 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, 1580 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1586 is_udplite); 1581 is_udplite);
@@ -1610,7 +1605,6 @@ drop:
1610 return -1; 1605 return -1;
1611} 1606}
1612 1607
1613
1614static void flush_stack(struct sock **stack, unsigned int count, 1608static void flush_stack(struct sock **stack, unsigned int count,
1615 struct sk_buff *skb, unsigned int final) 1609 struct sk_buff *skb, unsigned int final)
1616{ 1610{
@@ -1620,7 +1614,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
1620 1614
1621 for (i = 0; i < count; i++) { 1615 for (i = 0; i < count; i++) {
1622 sk = stack[i]; 1616 sk = stack[i];
1623 if (likely(skb1 == NULL)) 1617 if (likely(!skb1))
1624 skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); 1618 skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
1625 1619
1626 if (!skb1) { 1620 if (!skb1) {
@@ -1803,7 +1797,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1803 saddr, daddr, udptable, proto); 1797 saddr, daddr, udptable, proto);
1804 1798
1805 sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); 1799 sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
1806 if (sk != NULL) { 1800 if (sk) {
1807 int ret; 1801 int ret;
1808 1802
1809 if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) 1803 if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
@@ -2525,6 +2519,16 @@ void __init udp_table_init(struct udp_table *table, const char *name)
2525 } 2519 }
2526} 2520}
2527 2521
2522u32 udp_flow_hashrnd(void)
2523{
2524 static u32 hashrnd __read_mostly;
2525
2526 net_get_random_once(&hashrnd, sizeof(hashrnd));
2527
2528 return hashrnd;
2529}
2530EXPORT_SYMBOL(udp_flow_hashrnd);
2531
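udp_flow_hashrnd() uses the net_get_random_once() idiom: the seed is drawn lazily on first use and then shared by every caller for the lifetime of the boot. A userspace analogue of the same once-only pattern, built on pthread_once (illustrative only; the kernel version draws from get_random_bytes() behind a one-shot branch):

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

static uint32_t hashrnd;
static pthread_once_t hashrnd_once = PTHREAD_ONCE_INIT;

static void hashrnd_init(void)
{
	/* Illustrative entropy source; the kernel uses real randomness. */
	srand((unsigned int)time(NULL));
	hashrnd = (uint32_t)rand();
}

static uint32_t flow_hashrnd(void)
{
	pthread_once(&hashrnd_once, hashrnd_init); /* runs exactly once */
	return hashrnd;
}

int main(void)
{
	printf("seed: %08x (stable: %08x)\n", flow_hashrnd(), flow_hashrnd());
	return 0;
}
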
2528void __init udp_init(void) 2532void __init udp_init(void)
2529{ 2533{
2530 unsigned long limit; 2534 unsigned long limit;
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 4a000f1dd757..b763c39ae1d7 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -18,8 +18,9 @@
18#include <linux/sock_diag.h> 18#include <linux/sock_diag.h>
19 19
20static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, 20static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
21 struct netlink_callback *cb, struct inet_diag_req_v2 *req, 21 struct netlink_callback *cb,
22 struct nlattr *bc) 22 const struct inet_diag_req_v2 *req,
23 struct nlattr *bc)
23{ 24{
24 if (!inet_diag_bc_sk(bc, sk)) 25 if (!inet_diag_bc_sk(bc, sk))
25 return 0; 26 return 0;
@@ -31,7 +32,8 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
31} 32}
32 33
33static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, 34static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
34 const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) 35 const struct nlmsghdr *nlh,
36 const struct inet_diag_req_v2 *req)
35{ 37{
36 int err = -EINVAL; 38 int err = -EINVAL;
37 struct sock *sk; 39 struct sock *sk;
@@ -56,7 +58,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
56 goto out_nosk; 58 goto out_nosk;
57 59
58 err = -ENOENT; 60 err = -ENOENT;
59 if (sk == NULL) 61 if (!sk)
60 goto out_nosk; 62 goto out_nosk;
61 63
62 err = sock_diag_check_cookie(sk, req->id.idiag_cookie); 64 err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
@@ -90,8 +92,9 @@ out_nosk:
90 return err; 92 return err;
91} 93}
92 94
93static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, 95static void udp_dump(struct udp_table *table, struct sk_buff *skb,
94 struct inet_diag_req_v2 *r, struct nlattr *bc) 96 struct netlink_callback *cb,
97 const struct inet_diag_req_v2 *r, struct nlattr *bc)
95{ 98{
96 int num, s_num, slot, s_slot; 99 int num, s_num, slot, s_slot;
97 struct net *net = sock_net(skb->sk); 100 struct net *net = sock_net(skb->sk);
@@ -144,13 +147,13 @@ done:
144} 147}
145 148
146static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, 149static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
147 struct inet_diag_req_v2 *r, struct nlattr *bc) 150 const struct inet_diag_req_v2 *r, struct nlattr *bc)
148{ 151{
149 udp_dump(&udp_table, skb, cb, r, bc); 152 udp_dump(&udp_table, skb, cb, r, bc);
150} 153}
151 154
152static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, 155static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
153 struct inet_diag_req_v2 *req) 156 const struct inet_diag_req_v2 *req)
154{ 157{
155 return udp_dump_one(&udp_table, in_skb, nlh, req); 158 return udp_dump_one(&udp_table, in_skb, nlh, req);
156} 159}
@@ -170,13 +173,14 @@ static const struct inet_diag_handler udp_diag_handler = {
170}; 173};
171 174
172static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, 175static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
173 struct inet_diag_req_v2 *r, struct nlattr *bc) 176 const struct inet_diag_req_v2 *r,
177 struct nlattr *bc)
174{ 178{
175 udp_dump(&udplite_table, skb, cb, r, bc); 179 udp_dump(&udplite_table, skb, cb, r, bc);
176} 180}
177 181
178static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, 182static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
179 struct inet_diag_req_v2 *req) 183 const struct inet_diag_req_v2 *req)
180{ 184{
181 return udp_dump_one(&udplite_table, in_skb, nlh, req); 185 return udp_dump_one(&udplite_table, in_skb, nlh, req);
182} 186}
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index f3c27899f62b..7e0fe4bdd967 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -21,8 +21,8 @@ int compat_udp_setsockopt(struct sock *sk, int level, int optname,
21int compat_udp_getsockopt(struct sock *sk, int level, int optname, 21int compat_udp_getsockopt(struct sock *sk, int level, int optname,
22 char __user *optval, int __user *optlen); 22 char __user *optval, int __user *optlen);
23#endif 23#endif
24int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 24int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
25 size_t len, int noblock, int flags, int *addr_len); 25 int flags, int *addr_len);
26int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, 26int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
27 int flags); 27 int flags);
28int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 28int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 4915d8284a86..f9386160cbee 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -285,7 +285,7 @@ void udp_del_offload(struct udp_offload *uo)
285 pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); 285 pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
286unlock: 286unlock:
287 spin_unlock(&udp_offload_lock); 287 spin_unlock(&udp_offload_lock);
288 if (uo_priv != NULL) 288 if (uo_priv)
289 call_rcu(&uo_priv->rcu, udp_offload_free_routine); 289 call_rcu(&uo_priv->rcu, udp_offload_free_routine);
290} 290}
291EXPORT_SYMBOL(udp_del_offload); 291EXPORT_SYMBOL(udp_del_offload);
@@ -394,7 +394,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
394 break; 394 break;
395 } 395 }
396 396
397 if (uo_priv != NULL) { 397 if (uo_priv) {
398 NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; 398 NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
399 err = uo_priv->offload->callbacks.gro_complete(skb, 399 err = uo_priv->offload->callbacks.gro_complete(skb,
400 nhoff + sizeof(struct udphdr), 400 nhoff + sizeof(struct udphdr),
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index c83b35485056..6bb98cc193c9 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -75,7 +75,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
75} 75}
76EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); 76EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
77 77
78int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, 78int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
79 __be32 src, __be32 dst, __u8 tos, __u8 ttl, 79 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
80 __be16 df, __be16 src_port, __be16 dst_port, 80 __be16 df, __be16 src_port, __be16 dst_port,
81 bool xnet, bool nocheck) 81 bool xnet, bool nocheck)
@@ -92,7 +92,7 @@ int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb,
92 92
93 udp_set_csum(nocheck, skb, src, dst, skb->len); 93 udp_set_csum(nocheck, skb, src, dst, skb->len);
94 94
95 return iptunnel_xmit(skb->sk, rt, skb, src, dst, IPPROTO_UDP, 95 return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,
96 tos, ttl, df, xnet); 96 tos, ttl, df, xnet);
97} 97}
98EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); 98EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index aac6197b7a71..60b032f58ccc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -22,9 +22,9 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
22 return xfrm4_extract_header(skb); 22 return xfrm4_extract_header(skb);
23} 23}
24 24
25static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) 25static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb)
26{ 26{
27 if (skb_dst(skb) == NULL) { 27 if (!skb_dst(skb)) {
28 const struct iphdr *iph = ip_hdr(skb); 28 const struct iphdr *iph = ip_hdr(skb);
29 29
30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr, 30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
@@ -52,7 +52,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
52 iph->tot_len = htons(skb->len); 52 iph->tot_len = htons(skb->len);
53 ip_send_check(iph); 53 ip_send_check(iph);
54 54
55 NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, 55 NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
56 skb->dev, NULL,
56 xfrm4_rcv_encap_finish); 57 xfrm4_rcv_encap_finish);
57 return 0; 58 return 0;
58} 59}
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 91771a7c802f..35feda676464 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -63,7 +63,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
63 63
64 top_iph->saddr = x->props.saddr.a4; 64 top_iph->saddr = x->props.saddr.a4;
65 top_iph->daddr = x->id.daddr.a4; 65 top_iph->daddr = x->id.daddr.a4;
66 ip_select_ident(skb, NULL); 66 ip_select_ident(dev_net(dst->dev), skb, NULL);
67 67
68 return 0; 68 return 0;
69} 69}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index d5f6bd9a210a..2878dbfffeb7 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -63,40 +63,40 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
63 return err; 63 return err;
64 64
65 IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; 65 IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
66 skb->protocol = htons(ETH_P_IP);
66 67
67 return x->outer_mode->output2(x, skb); 68 return x->outer_mode->output2(x, skb);
68} 69}
69EXPORT_SYMBOL(xfrm4_prepare_output); 70EXPORT_SYMBOL(xfrm4_prepare_output);
70 71
71int xfrm4_output_finish(struct sk_buff *skb) 72int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
72{ 73{
73 memset(IPCB(skb), 0, sizeof(*IPCB(skb))); 74 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
74 skb->protocol = htons(ETH_P_IP);
75 75
76#ifdef CONFIG_NETFILTER 76#ifdef CONFIG_NETFILTER
77 IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; 77 IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
78#endif 78#endif
79 79
80 return xfrm_output(skb); 80 return xfrm_output(sk, skb);
81} 81}
82 82
83static int __xfrm4_output(struct sk_buff *skb) 83static int __xfrm4_output(struct sock *sk, struct sk_buff *skb)
84{ 84{
85 struct xfrm_state *x = skb_dst(skb)->xfrm; 85 struct xfrm_state *x = skb_dst(skb)->xfrm;
86 86
87#ifdef CONFIG_NETFILTER 87#ifdef CONFIG_NETFILTER
88 if (!x) { 88 if (!x) {
89 IPCB(skb)->flags |= IPSKB_REROUTED; 89 IPCB(skb)->flags |= IPSKB_REROUTED;
90 return dst_output(skb); 90 return dst_output_sk(sk, skb);
91 } 91 }
92#endif 92#endif
93 93
94 return x->outer_mode->afinfo->output_finish(skb); 94 return x->outer_mode->afinfo->output_finish(sk, skb);
95} 95}
96 96
97int xfrm4_output(struct sock *sk, struct sk_buff *skb) 97int xfrm4_output(struct sock *sk, struct sk_buff *skb)
98{ 98{
99 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, 99 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
100 NULL, skb_dst(skb)->dev, __xfrm4_output, 100 NULL, skb_dst(skb)->dev, __xfrm4_output,
101 !(IPCB(skb)->flags & IPSKB_REROUTED)); 101 !(IPCB(skb)->flags & IPSKB_REROUTED));
102} 102}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6156f68a1e90..bff69746e05f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -232,7 +232,6 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
232 232
233static struct dst_ops xfrm4_dst_ops = { 233static struct dst_ops xfrm4_dst_ops = {
234 .family = AF_INET, 234 .family = AF_INET,
235 .protocol = cpu_to_be16(ETH_P_IP),
236 .gc = xfrm4_garbage_collect, 235 .gc = xfrm4_garbage_collect,
237 .update_pmtu = xfrm4_update_pmtu, 236 .update_pmtu = xfrm4_update_pmtu,
238 .redirect = xfrm4_redirect, 237 .redirect = xfrm4_redirect,
@@ -299,7 +298,7 @@ static void __net_exit xfrm4_net_exit(struct net *net)
299{ 298{
300 struct ctl_table *table; 299 struct ctl_table *table;
301 300
302 if (net->ipv4.xfrm4_hdr == NULL) 301 if (!net->ipv4.xfrm4_hdr)
303 return; 302 return;
304 303
305 table = net->ipv4.xfrm4_hdr->ctl_table_arg; 304 table = net->ipv4.xfrm4_hdr->ctl_table_arg;