Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 10
-rw-r--r--  net/ipv4/Makefile | 2
-rw-r--r--  net/ipv4/af_inet.c | 13
-rw-r--r--  net/ipv4/arp.c | 43
-rw-r--r--  net/ipv4/devinet.c | 2
-rw-r--r--  net/ipv4/fib_rules.c | 1
-rw-r--r--  net/ipv4/fib_trie.c | 11
-rw-r--r--  net/ipv4/igmp.c | 27
-rw-r--r--  net/ipv4/inet_connection_sock.c | 26
-rw-r--r--  net/ipv4/inet_diag.c | 470
-rw-r--r--  net/ipv4/inetpeer.c | 82
-rw-r--r--  net/ipv4/ip_fragment.c | 2
-rw-r--r--  net/ipv4/ip_gre.c | 28
-rw-r--r--  net/ipv4/ip_options.c | 2
-rw-r--r--  net/ipv4/ip_output.c | 23
-rw-r--r--  net/ipv4/ip_sockglue.c | 41
-rw-r--r--  net/ipv4/ipconfig.c | 23
-rw-r--r--  net/ipv4/ipip.c | 10
-rw-r--r--  net/ipv4/ipmr.c | 6
-rw-r--r--  net/ipv4/netfilter/Kconfig | 18
-rw-r--r--  net/ipv4/netfilter/Makefile | 2
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 8
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 16
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c | 14
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c | 16
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c | 127
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c | 141
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 98
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 20
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c | 16
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c | 14
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_common.c | 36
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_dccp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 10
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_sctp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_tcp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udplite.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_unknown.c | 3
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c | 22
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c | 10
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 2
-rw-r--r--  net/ipv4/ping.c | 28
-rw-r--r--  net/ipv4/proc.c | 16
-rw-r--r--  net/ipv4/raw.c | 10
-rw-r--r--  net/ipv4/route.c | 24
-rw-r--r--  net/ipv4/syncookies.c | 32
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 64
-rw-r--r--  net/ipv4/tcp.c | 91
-rw-r--r--  net/ipv4/tcp_bic.c | 11
-rw-r--r--  net/ipv4/tcp_cong.c | 2
-rw-r--r--  net/ipv4/tcp_cubic.c | 10
-rw-r--r--  net/ipv4/tcp_diag.c | 20
-rw-r--r--  net/ipv4/tcp_input.c | 159
-rw-r--r--  net/ipv4/tcp_ipv4.c | 32
-rw-r--r--  net/ipv4/tcp_memcontrol.c | 272
-rw-r--r--  net/ipv4/tcp_minisocks.c | 12
-rw-r--r--  net/ipv4/tcp_output.c | 33
-rw-r--r--  net/ipv4/tcp_timer.c | 13
-rw-r--r--  net/ipv4/tunnel4.c | 10
-rw-r--r--  net/ipv4/udp.c | 9
-rw-r--r--  net/ipv4/udp_diag.c | 200
-rw-r--r--  net/ipv4/xfrm4_mode_beet.c | 5
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 6
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 6
68 files changed, 1650 insertions(+), 850 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index cbb505ba9324..d183262943d9 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -163,8 +163,6 @@ config IP_PNP_RARP
 	  operating on your network.  Read
 	  <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
 
-# not yet ready..
-#   bool '    IP: ARP support' CONFIG_IP_PNP_ARP
 config NET_IPIP
 	tristate "IP: tunneling"
 	select INET_TUNNEL
@@ -409,6 +407,14 @@ config INET_TCP_DIAG
 	depends on INET_DIAG
 	def_tristate INET_DIAG
 
+config INET_UDP_DIAG
+	tristate "UDP: socket monitoring interface"
+	depends on INET_DIAG && (IPV6 || IPV6=n)
+	default n
+	---help---
+	  Support for UDP socket monitoring interface used by the ss tool.
+	  If unsure, say Y.
+
 menuconfig TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
 	---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f2dc69cffb57..ff75d3bbcd6a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
@@ -47,6 +48,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1b5096a9875a..f7b5670744f0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1250,7 +1250,8 @@ out:
 	return err;
 }
 
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+	netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct iphdr *iph;
@@ -1572,9 +1573,9 @@ static __net_init int ipv4_mib_init_net(struct net *net)
 			  sizeof(struct icmp_mib),
 			  __alignof__(struct icmp_mib)) < 0)
 		goto err_icmp_mib;
-	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
-			  sizeof(struct icmpmsg_mib),
-			  __alignof__(struct icmpmsg_mib)) < 0)
+	net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+					      GFP_KERNEL);
+	if (!net->mib.icmpmsg_statistics)
 		goto err_icmpmsg_mib;
 
 	tcp_mib_init(net);
@@ -1598,7 +1599,7 @@ err_tcp_mib:
 
 static __net_exit void ipv4_mib_exit_net(struct net *net)
 {
-	snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
+	kfree(net->mib.icmpmsg_statistics);
 	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
 	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
 	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
@@ -1671,6 +1672,8 @@ static int __init inet_init(void)
 	ip_static_sysctl_init();
 #endif
 
+	tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
+
 	/*
 	 *	Add all the base protocols.
 	 */
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96a164aa1367..63e49890ad31 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -112,11 +112,6 @@
 #include <net/arp.h>
 #include <net/ax25.h>
 #include <net/netrom.h>
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-#include <net/atmclip.h>
-struct neigh_table *clip_tbl_hook;
-EXPORT_SYMBOL(clip_tbl_hook);
-#endif
 
 #include <asm/system.h>
 #include <linux/uaccess.h>
@@ -126,7 +121,7 @@ EXPORT_SYMBOL(clip_tbl_hook);
 /*
  *	Interface to generic neighbour cache.
  */
-static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
 static int arp_constructor(struct neighbour *neigh);
 static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
 static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -164,7 +159,6 @@ static const struct neigh_ops arp_broken_ops = {
 
 struct neigh_table arp_tbl = {
 	.family		= AF_INET,
-	.entry_size	= sizeof(struct neighbour) + 4,
 	.key_len	= 4,
 	.hash		= arp_hash,
 	.constructor	= arp_constructor,
@@ -177,7 +171,7 @@ struct neigh_table arp_tbl = {
 	.gc_staletime	= 60 * HZ,
 	.reachable_time	= 30 * HZ,
 	.delay_probe_time = 5 * HZ,
-	.queue_len	= 3,
+	.queue_len_bytes = 64*1024,
 	.ucast_probes	= 3,
 	.mcast_probes	= 3,
 	.anycast_delay	= 1 * HZ,
@@ -221,9 +215,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 
 static u32 arp_hash(const void *pkey,
 		    const struct net_device *dev,
-		    __u32 hash_rnd)
+		    __u32 *hash_rnd)
 {
-	return arp_hashfn(*(u32 *)pkey, dev, hash_rnd);
+	return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd);
 }
 
 static int arp_constructor(struct neighbour *neigh)
@@ -283,9 +277,9 @@ static int arp_constructor(struct neighbour *neigh)
 	default:
 		break;
 	case ARPHRD_ROSE:
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
 	case ARPHRD_AX25:
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#if IS_ENABLED(CONFIG_NETROM)
 	case ARPHRD_NETROM:
 #endif
 		neigh->ops = &arp_broken_ops;
@@ -592,16 +586,18 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	struct sk_buff *skb;
 	struct arphdr *arp;
 	unsigned char *arp_ptr;
+	int hlen = LL_RESERVED_SPACE(dev);
+	int tlen = dev->needed_tailroom;
 
 	/*
 	 *	Allocate a buffer
 	 */
 
-	skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
+	skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
 	if (skb == NULL)
 		return NULL;
 
-	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
 	arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
 	skb->dev = dev;
@@ -633,13 +629,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 		arp->ar_pro = htons(ETH_P_IP);
 		break;
 
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
 	case ARPHRD_AX25:
 		arp->ar_hrd = htons(ARPHRD_AX25);
 		arp->ar_pro = htons(AX25_P_IP);
 		break;
 
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#if IS_ENABLED(CONFIG_NETROM)
 	case ARPHRD_NETROM:
 		arp->ar_hrd = htons(ARPHRD_NETROM);
 		arp->ar_pro = htons(AX25_P_IP);
@@ -647,13 +643,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 #endif
 #endif
 
-#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
+#if IS_ENABLED(CONFIG_FDDI)
 	case ARPHRD_FDDI:
 		arp->ar_hrd = htons(ARPHRD_ETHER);
 		arp->ar_pro = htons(ETH_P_IP);
 		break;
 #endif
-#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
+#if IS_ENABLED(CONFIG_TR)
 	case ARPHRD_IEEE802_TR:
 		arp->ar_hrd = htons(ARPHRD_IEEE802);
 		arp->ar_pro = htons(ETH_P_IP);
@@ -867,7 +863,8 @@ static int arp_process(struct sk_buff *skb)
 	if (addr_type == RTN_UNICAST  &&
 	    (arp_fwd_proxy(in_dev, dev, rt) ||
 	     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
-	     pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
+	     (rt->dst.dev != dev &&
+	      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
 		n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 		if (n)
 			neigh_release(n);
@@ -1040,7 +1037,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 		return -EINVAL;
 	}
 	switch (dev->type) {
-#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
+#if IS_ENABLED(CONFIG_FDDI)
 	case ARPHRD_FDDI:
 		/*
 		 * According to RFC 1390, FDDI devices should accept ARP
@@ -1286,7 +1283,7 @@ void __init arp_init(void)
 }
 
 #ifdef CONFIG_PROC_FS
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
 
 /* ------------------------------------------------------------------------ */
 /*
@@ -1334,7 +1331,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
 
 	read_lock(&n->lock);
 	/* Convert hardware address to XX:XX:XX:XX ... form. */
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
 	if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
 		ax2asc2((ax25_address *)n->ha, hbuffer);
 	else {
@@ -1347,7 +1344,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
 		if (k != 0)
 			--k;
 		hbuffer[k] = 0;
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
 	}
 #endif
 	sprintf(tbuf, "%pI4", n->primary_key);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 65f01dc47565..e41c40f48cfe 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -258,7 +258,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
 	ip_mc_up(in_dev);
 
 	/* we can receive as soon as ip_ptr is set -- do this last */
-	RCU_INIT_POINTER(dev->ip_ptr, in_dev);
+	rcu_assign_pointer(dev->ip_ptr, in_dev);
 out:
 	return in_dev;
 out_kfree:
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 46339ba7a2d3..799fc790b3cf 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -67,6 +67,7 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(fib_lookup);
 
 static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 37b671185c81..2b555a5521e0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -205,7 +205,7 @@ static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
 	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
 }
 
-/* Same as RCU_INIT_POINTER
+/* Same as rcu_assign_pointer
  * but that macro() assumes that value is a pointer.
  */
 static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
@@ -529,7 +529,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *
 	if (n)
 		node_set_parent(n, tn);
 
-	RCU_INIT_POINTER(tn->child[i], n);
+	rcu_assign_pointer(tn->child[i], n);
 }
 
 #define MAX_WORK 10
@@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 
 	tp = node_parent((struct rt_trie_node *) tn);
 	if (!tp)
-		RCU_INIT_POINTER(t->trie, (struct rt_trie_node *)tn);
+		rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 
 	tnode_free_flush();
 	if (!tp)
@@ -1027,7 +1027,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
 	if (IS_TNODE(tn))
 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
 
-	RCU_INIT_POINTER(t->trie, (struct rt_trie_node *)tn);
+	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 	tnode_free_flush();
 }
 
@@ -1164,7 +1164,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 			put_child(t, (struct tnode *)tp, cindex,
 				  (struct rt_trie_node *)tn);
 		} else {
-			RCU_INIT_POINTER(t->trie, (struct rt_trie_node *)tn);
+			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
 			tp = tn;
 		}
 	}
@@ -1607,6 +1607,7 @@ found:
 	rcu_read_unlock();
 	return ret;
 }
+EXPORT_SYMBOL_GPL(fib_table_lookup);
 
 /*
  * Remove the leaf and return parent.
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b2ca095cb9da..450e5d21ed2a 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -304,9 +304,11 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	struct igmpv3_report *pig;
 	struct net *net = dev_net(dev);
 	struct flowi4 fl4;
+	int hlen = LL_RESERVED_SPACE(dev);
+	int tlen = dev->needed_tailroom;
 
 	while (1) {
-		skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
+		skb = alloc_skb(size + hlen + tlen,
 				GFP_ATOMIC | __GFP_NOWARN);
 		if (skb)
 			break;
@@ -327,7 +329,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	skb_dst_set(skb, &rt->dst);
 	skb->dev = dev;
 
-	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	skb_reserve(skb, hlen);
 
 	skb_reset_network_header(skb);
 	pip = ip_hdr(skb);
@@ -647,6 +649,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	__be32	group = pmc ? pmc->multiaddr : 0;
 	struct flowi4 fl4;
 	__be32	dst;
+	int hlen, tlen;
 
 	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
 		return igmpv3_send_report(in_dev, pmc);
@@ -661,7 +664,9 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	if (IS_ERR(rt))
 		return -1;
 
-	skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
+	hlen = LL_RESERVED_SPACE(dev);
+	tlen = dev->needed_tailroom;
+	skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
 	if (skb == NULL) {
 		ip_rt_put(rt);
 		return -1;
@@ -669,7 +674,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 
 	skb_dst_set(skb, &rt->dst);
 
-	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	skb_reserve(skb, hlen);
 
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
@@ -875,6 +880,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		 * to be intended in a v3 query.
 		 */
 		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+		if (!max_delay)
+			max_delay = 1;	/* can't mod w/ 0 */
 	} else { /* v3 */
 		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
 			return;
@@ -1242,7 +1249,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 
 	im->next_rcu = in_dev->mc_list;
 	in_dev->mc_count++;
-	RCU_INIT_POINTER(in_dev->mc_list, im);
+	rcu_assign_pointer(in_dev->mc_list, im);
 
 #ifdef CONFIG_IP_MULTICAST
 	igmpv3_del_delrec(in_dev, im->multiaddr);
@@ -1574,7 +1581,7 @@ out_unlock:
  * Add multicast single-source filter to the interface list
  */
 static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
-	__be32 *psfsrc, int delta)
+	__be32 *psfsrc)
 {
 	struct ip_sf_list *psf, *psf_prev;
 
@@ -1709,7 +1716,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
 	pmc->sfcount[sfmode]++;
 	err = 0;
 	for (i=0; i<sfcount; i++) {
-		err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+		err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
 		if (err)
 			break;
 	}
@@ -1814,7 +1821,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 	iml->next_rcu = inet->mc_list;
 	iml->sflist = NULL;
 	iml->sfmode = MCAST_EXCLUDE;
-	RCU_INIT_POINTER(inet->mc_list, iml);
+	rcu_assign_pointer(inet->mc_list, iml);
 	ip_mc_inc_group(in_dev, addr);
 	err = 0;
 done:
@@ -2001,7 +2008,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 			atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
 			kfree_rcu(psl, rcu);
 		}
-		RCU_INIT_POINTER(pmc->sflist, newpsl);
+		rcu_assign_pointer(pmc->sflist, newpsl);
 		psl = newpsl;
 	}
 	rv = 1;	/* > 0 for insert logic below if sl_count is 0 */
@@ -2104,7 +2111,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 	} else
 		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
 				     0, NULL, 0);
-	RCU_INIT_POINTER(pmc->sflist, newpsl);
+	rcu_assign_pointer(pmc->sflist, newpsl);
 	pmc->sfmode = msf->imsf_fmode;
 	err = 0;
 done:
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c14d88ad348d..19d66cefd7d3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -123,11 +123,14 @@ again:
 				smallest_size = tb->num_owners;
 				smallest_rover = rover;
 				if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
-					spin_unlock(&head->lock);
 					snum = smallest_rover;
-					goto have_snum;
+					goto tb_found;
 				}
 			}
+			if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+				snum = rover;
+				goto tb_found;
+			}
 			goto next;
 		}
 		break;
@@ -418,7 +421,7 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
 	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
 }
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 #define AF_INET_FAMILY(fam) ((fam) == AF_INET)
 #else
 #define AF_INET_FAMILY(fam) 1
@@ -588,10 +591,19 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 }
 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
 
-struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
-			    const gfp_t priority)
+/**
+ *	inet_csk_clone_lock - clone an inet socket, and lock its clone
+ *	@sk: the socket to clone
+ *	@req: request_sock
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_clone_lock(const struct sock *sk,
+				 const struct request_sock *req,
+				 const gfp_t priority)
 {
-	struct sock *newsk = sk_clone(sk, priority);
+	struct sock *newsk = sk_clone_lock(sk, priority);
 
 	if (newsk != NULL) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -615,7 +627,7 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 	}
 	return newsk;
 }
-EXPORT_SYMBOL_GPL(inet_csk_clone);
+EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
 
 /*
  * At this point, there should be no process reference to this
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ccee270a9b65..fcf281819cd4 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -33,6 +33,7 @@
 #include <linux/stddef.h>
 
 #include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
 
 static const struct inet_diag_handler **inet_diag_table;
 
@@ -45,24 +46,22 @@ struct inet_diag_entry {
 	u16	userlocks;
 };
 
-static struct sock *idiagnl;
-
 #define INET_DIAG_PUT(skb, attrtype, attrlen) \
 	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
 
 static DEFINE_MUTEX(inet_diag_table_mutex);
 
-static const struct inet_diag_handler *inet_diag_lock_handler(int type)
+static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
 {
-	if (!inet_diag_table[type])
-		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
-			       NETLINK_INET_DIAG, type);
+	if (!inet_diag_table[proto])
+		request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+			       NETLINK_SOCK_DIAG, AF_INET, proto);
 
 	mutex_lock(&inet_diag_table_mutex);
-	if (!inet_diag_table[type])
+	if (!inet_diag_table[proto])
 		return ERR_PTR(-ENOENT);
 
-	return inet_diag_table[type];
+	return inet_diag_table[proto];
 }
 
 static inline void inet_diag_unlock_handler(
@@ -71,21 +70,21 @@ static inline void inet_diag_unlock_handler(
 	mutex_unlock(&inet_diag_table_mutex);
 }
 
-static int inet_csk_diag_fill(struct sock *sk,
-			      struct sk_buff *skb,
-			      int ext, u32 pid, u32 seq, u16 nlmsg_flags,
-			      const struct nlmsghdr *unlh)
+int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
+		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+		      u32 pid, u32 seq, u16 nlmsg_flags,
+		      const struct nlmsghdr *unlh)
 {
 	const struct inet_sock *inet = inet_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_diag_msg *r;
 	struct nlmsghdr  *nlh;
 	void *info = NULL;
 	struct inet_diag_meminfo  *minfo = NULL;
 	unsigned char	 *b = skb_tail_pointer(skb);
 	const struct inet_diag_handler *handler;
+	int ext = req->idiag_ext;
 
-	handler = inet_diag_table[unlh->nlmsg_type];
+	handler = inet_diag_table[req->sdiag_protocol];
 	BUG_ON(handler == NULL);
 
 	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
@@ -97,25 +96,13 @@ static int inet_csk_diag_fill(struct sock *sk,
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
 		minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
 
-	if (ext & (1 << (INET_DIAG_INFO - 1)))
-		info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
-				     handler->idiag_info_size);
-
-	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
-		const size_t len = strlen(icsk->icsk_ca_ops->name);
-
-		strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
-		       icsk->icsk_ca_ops->name);
-	}
-
 	r->idiag_family = sk->sk_family;
 	r->idiag_state = sk->sk_state;
 	r->idiag_timer = 0;
 	r->idiag_retrans = 0;
 
 	r->id.idiag_if = sk->sk_bound_dev_if;
-	r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
-	r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
 
 	r->id.idiag_sport = inet->inet_sport;
 	r->id.idiag_dport = inet->inet_dport;
@@ -128,20 +115,36 @@ static int inet_csk_diag_fill(struct sock *sk,
 	if (ext & (1 << (INET_DIAG_TOS - 1)))
 		RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos);
 
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
 		const struct ipv6_pinfo *np = inet6_sk(sk);
 
+		*(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
+		*(struct in6_addr *)r->id.idiag_dst = np->daddr;
 		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
 			RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass);
-
-		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-			       &np->rcv_saddr);
-		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-			       &np->daddr);
 	}
 #endif
 
+	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_inode = sock_i_ino(sk);
+
+	if (minfo) {
+		minfo->idiag_rmem = sk_rmem_alloc_get(sk);
+		minfo->idiag_wmem = sk->sk_wmem_queued;
+		minfo->idiag_fmem = sk->sk_forward_alloc;
+		minfo->idiag_tmem = sk_wmem_alloc_get(sk);
+	}
+
+	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+			goto rtattr_failure;
+
+	if (icsk == NULL) {
+		r->idiag_rqueue = r->idiag_wqueue = 0;
+		goto out;
+	}
+
 #define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
@@ -162,14 +165,14 @@ static int inet_csk_diag_fill(struct sock *sk,
 	}
 #undef EXPIRES_IN_MS
 
-	r->idiag_uid = sock_i_uid(sk);
-	r->idiag_inode = sock_i_ino(sk);
+	if (ext & (1 << (INET_DIAG_INFO - 1)))
+		info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info));
 
-	if (minfo) {
-		minfo->idiag_rmem = sk_rmem_alloc_get(sk);
-		minfo->idiag_wmem = sk->sk_wmem_queued;
-		minfo->idiag_fmem = sk->sk_forward_alloc;
-		minfo->idiag_tmem = sk_wmem_alloc_get(sk);
-	}
+	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
+		const size_t len = strlen(icsk->icsk_ca_ops->name);
+
+		strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
+		       icsk->icsk_ca_ops->name);
+	}
 
 	handler->idiag_get_info(sk, r, info);
@@ -178,6 +181,7 @@ static int inet_csk_diag_fill(struct sock *sk,
 	    icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
 		icsk->icsk_ca_ops->get_info(sk, ext, skb);
 
+out:
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
@@ -186,10 +190,20 @@ nlmsg_failure:
 	nlmsg_trim(skb, b);
 	return -EMSGSIZE;
 }
+EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
+
+static int inet_csk_diag_fill(struct sock *sk,
+			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+			      u32 pid, u32 seq, u16 nlmsg_flags,
+			      const struct nlmsghdr *unlh)
+{
+	return inet_sk_diag_fill(sk, inet_csk(sk),
+			skb, req, pid, seq, nlmsg_flags, unlh);
+}
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
-			       struct sk_buff *skb, int ext, u32 pid,
-			       u32 seq, u16 nlmsg_flags,
+			       struct sk_buff *skb, struct inet_diag_req_v2 *req,
+			       u32 pid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
 	long tmo;
@@ -210,8 +224,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	r->idiag_family	      = tw->tw_family;
 	r->idiag_retrans      = 0;
 	r->id.idiag_if	      = tw->tw_bound_dev_if;
-	r->id.idiag_cookie[0] = (u32)(unsigned long)tw;
-	r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1);
+	sock_diag_save_cookie(tw, r->id.idiag_cookie);
 	r->id.idiag_sport     = tw->tw_sport;
 	r->id.idiag_dport     = tw->tw_dport;
 	r->id.idiag_src[0]    = tw->tw_rcv_saddr;
@@ -223,15 +236,13 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 	r->idiag_wqueue	      = 0;
 	r->idiag_uid	      = 0;
 	r->idiag_inode	      = 0;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 	if (tw->tw_family == AF_INET6) {
 		const struct inet6_timewait_sock *tw6 =
 						inet6_twsk((struct sock *)tw);
 
-		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-			       &tw6->tw_v6_rcv_saddr);
-		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-			       &tw6->tw_v6_daddr);
+		*(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr;
+		*(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
 	}
 #endif
 	nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
@@ -242,42 +253,31 @@ nlmsg_failure:
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
-			int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+			struct inet_diag_req_v2 *r, u32 pid, u32 seq, u16 nlmsg_flags,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
-					   skb, ext, pid, seq, nlmsg_flags,
+					   skb, r, pid, seq, nlmsg_flags,
 					   unlh);
-	return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh);
+	return inet_csk_diag_fill(sk, skb, r, pid, seq, nlmsg_flags, unlh);
 }
 
-static int inet_diag_get_exact(struct sk_buff *in_skb,
-			       const struct nlmsghdr *nlh)
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
+		const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
 {
 	int err;
 	struct sock *sk;
-	struct inet_diag_req *req = NLMSG_DATA(nlh);
 	struct sk_buff *rep;
-	struct inet_hashinfo *hashinfo;
-	const struct inet_diag_handler *handler;
 
-	handler = inet_diag_lock_handler(nlh->nlmsg_type);
-	if (IS_ERR(handler)) {
-		err = PTR_ERR(handler);
-		goto unlock;
-	}
-
-	hashinfo = handler->idiag_hashinfo;
 	err = -EINVAL;
-
-	if (req->idiag_family == AF_INET) {
+	if (req->sdiag_family == AF_INET) {
 		sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0],
 				 req->id.idiag_dport, req->id.idiag_src[0],
 				 req->id.idiag_sport, req->id.idiag_if);
 	}
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
-	else if (req->idiag_family == AF_INET6) {
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (req->sdiag_family == AF_INET6) {
 		sk = inet6_lookup(&init_net, hashinfo,
 				  (struct in6_addr *)req->id.idiag_dst,
 				  req->id.idiag_dport,
@@ -287,29 +287,26 @@ static int inet_diag_get_exact(struct sk_buff *in_skb,
 	}
 #endif
 	else {
-		goto unlock;
+		goto out_nosk;
 	}
 
 	err = -ENOENT;
 	if (sk == NULL)
-		goto unlock;
+		goto out_nosk;
 
-	err = -ESTALE;
-	if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
-	     req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
-	    ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
-	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+	err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+	if (err)
 		goto out;
 
 	err = -ENOMEM;
 	rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
 				     sizeof(struct inet_diag_meminfo) +
-				     handler->idiag_info_size + 64)),
+				     sizeof(struct tcp_info) + 64)),
 			GFP_KERNEL);
 	if (!rep)
 		goto out;
 
-	err = sk_diag_fill(sk, rep, req->idiag_ext,
+	err = sk_diag_fill(sk, rep, req,
 			   NETLINK_CB(in_skb).pid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
@@ -317,7 +314,7 @@ static int inet_diag_get_exact(struct sk_buff *in_skb,
 		kfree_skb(rep);
 		goto out;
 	}
-	err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -329,8 +326,25 @@ out:
 		else
 			sock_put(sk);
 	}
-unlock:
+out_nosk:
+	return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
+
+static int inet_diag_get_exact(struct sk_buff *in_skb,
+			       const struct nlmsghdr *nlh,
+			       struct inet_diag_req_v2 *req)
+{
+	const struct inet_diag_handler *handler;
+	int err;
+
+	handler = inet_diag_lock_handler(req->sdiag_protocol);
+	if (IS_ERR(handler))
+		err = PTR_ERR(handler);
+	else
+		err = handler->dump_one(in_skb, nlh, req);
 	inet_diag_unlock_handler(handler);
+
 	return err;
 }
 
@@ -361,9 +375,12 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
 }
 
 
-static int inet_diag_bc_run(const void *bc, int len,
+static int inet_diag_bc_run(const struct nlattr *_bc,
 			    const struct inet_diag_entry *entry)
 {
+	const void *bc = nla_data(_bc);
+	int len = nla_len(_bc);
+
 	while (len > 0) {
 		int yes = 1;
 		const struct inet_diag_bc_op *op = bc;
@@ -437,6 +454,35 @@ static int inet_diag_bc_run(const void *bc, int len,
 	return len == 0;
 }
 
+int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+{
+	struct inet_diag_entry entry;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (bc == NULL)
+		return 1;
+
+	entry.family = sk->sk_family;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (entry.family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		entry.saddr = np->rcv_saddr.s6_addr32;
+		entry.daddr = np->daddr.s6_addr32;
+	} else
+#endif
+	{
+		entry.saddr = &inet->inet_rcv_saddr;
+		entry.daddr = &inet->inet_daddr;
+	}
+	entry.sport = inet->inet_num;
+	entry.dport = ntohs(inet->inet_dport);
+	entry.userlocks = sk->sk_userlocks;
+
+	return inet_diag_bc_run(bc, &entry);
+}
+EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
+
 static int valid_cc(const void *bc, int len, int cc)
 {
 	while (len >= 0) {
@@ -493,57 +539,29 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
 
 static int inet_csk_diag_dump(struct sock *sk,
 			      struct sk_buff *skb,
-			      struct netlink_callback *cb)
+			      struct netlink_callback *cb,
+			      struct inet_diag_req_v2 *r,
+			      const struct nlattr *bc)
 {
-	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
-
-	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
-		struct inet_diag_entry entry;
-		const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
-							  sizeof(*r),
-							  INET_DIAG_REQ_BYTECODE);
-		struct inet_sock *inet = inet_sk(sk);
-
-		entry.family = sk->sk_family;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
-		if (entry.family == AF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
-
-			entry.saddr = np->rcv_saddr.s6_addr32;
-			entry.daddr = np->daddr.s6_addr32;
-		} else
-#endif
-		{
-			entry.saddr = &inet->inet_rcv_saddr;
-			entry.daddr = &inet->inet_daddr;
-		}
-		entry.sport = inet->inet_num;
-		entry.dport = ntohs(inet->inet_dport);
-		entry.userlocks = sk->sk_userlocks;
+	if (!inet_diag_bc_sk(bc, sk))
+		return 0;
 
-		if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
-			return 0;
-	}
-
-	return inet_csk_diag_fill(sk, skb, r->idiag_ext,
+	return inet_csk_diag_fill(sk, skb, r,
 				  NETLINK_CB(cb->skb).pid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
 static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 			       struct sk_buff *skb,
-			       struct netlink_callback *cb)
+			       struct netlink_callback *cb,
+			       struct inet_diag_req_v2 *r,
+			       const struct nlattr *bc)
 {
-	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
-
-	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
+	if (bc != NULL) {
 		struct inet_diag_entry entry;
-		const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
-							  sizeof(*r),
-							  INET_DIAG_REQ_BYTECODE);
 
 		entry.family = tw->tw_family;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == AF_INET6) {
 			struct inet6_timewait_sock *tw6 =
 					inet6_twsk((struct sock *)tw);
@@ -559,11 +577,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 		entry.dport = ntohs(tw->tw_dport);
 		entry.userlocks = 0;
 
-		if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
+		if (!inet_diag_bc_run(bc, &entry))
 			return 0;
 	}
 
-	return inet_twsk_diag_fill(tw, skb, r->idiag_ext,
+	return inet_twsk_diag_fill(tw, skb, r,
 				   NETLINK_CB(cb->skb).pid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -589,8 +607,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_retrans = req->retrans;
 
 	r->id.idiag_if = sk->sk_bound_dev_if;
-	r->id.idiag_cookie[0] = (u32)(unsigned long)req;
-	r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+	sock_diag_save_cookie(req, r->id.idiag_cookie);
 
 	tmo = req->expires - jiffies;
 	if (tmo < 0)
@@ -605,12 +622,10 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_wqueue = 0;
 	r->idiag_uid = sock_i_uid(sk);
 	r->idiag_inode = 0;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
-		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-			       &inet6_rsk(req)->loc_addr);
-		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-			       &inet6_rsk(req)->rmt_addr);
+		*(struct in6_addr *)r->id.idiag_src = inet6_rsk(req)->loc_addr;
+		*(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr;
 	}
 #endif
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
@@ -623,13 +638,13 @@ nlmsg_failure:
 }
 
 static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
-			       struct netlink_callback *cb)
+			       struct netlink_callback *cb,
+			       struct inet_diag_req_v2 *r,
+			       const struct nlattr *bc)
 {
 	struct inet_diag_entry entry;
-	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct listen_sock *lopt;
-	const struct nlattr *bc = NULL;
 	struct inet_sock *inet = inet_sk(sk);
 	int j, s_j;
 	int reqnum, s_reqnum;
@@ -649,9 +664,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	if (!lopt || !lopt->qlen)
 		goto out;
 
-	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
-		bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
-				     INET_DIAG_REQ_BYTECODE);
+	if (bc != NULL) {
 		entry.sport = inet->inet_num;
 		entry.userlocks = sk->sk_userlocks;
 	}
@@ -671,21 +684,20 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 		if (bc) {
 			entry.saddr =
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 				(entry.family == AF_INET6) ?
 				inet6_rsk(req)->loc_addr.s6_addr32 :
 #endif
 				&ireq->loc_addr;
 			entry.daddr =
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
 				(entry.family == AF_INET6) ?
 				inet6_rsk(req)->rmt_addr.s6_addr32 :
 #endif
 				&ireq->rmt_addr;
 			entry.dport = ntohs(ireq->rmt_port);
 
-			if (!inet_diag_bc_run(nla_data(bc),
-					      nla_len(bc), &entry))
+			if (!inet_diag_bc_run(bc, &entry))
 				continue;
 		}
 
@@ -708,19 +720,11 @@ out:
 	return err;
 }
 
-static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
+		struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
 	int i, num;
 	int s_i, s_num;
-	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
-	const struct inet_diag_handler *handler;
-	struct inet_hashinfo *hashinfo;
-
-	handler = inet_diag_lock_handler(cb->nlh->nlmsg_type);
-	if (IS_ERR(handler))
-		goto unlock;
-
-	hashinfo = handler->idiag_hashinfo;
 
 	s_i = cb->args[1];
 	s_num = num = cb->args[2];
@@ -745,6 +749,10 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 				continue;
 			}
 
+			if (r->sdiag_family != AF_UNSPEC &&
+					sk->sk_family != r->sdiag_family)
+				goto next_listen;
+
 			if (r->id.idiag_sport != inet->inet_sport &&
 			    r->id.idiag_sport)
 				goto next_listen;
@@ -754,7 +762,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			    cb->args[3] > 0)
 				goto syn_recv;
 
-			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
+			if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
 				spin_unlock_bh(&ilb->lock);
 				goto done;
 			}
@@ -763,7 +771,7 @@ syn_recv:
 			if (!(r->idiag_states & TCPF_SYN_RECV))
 				goto next_listen;
 
-			if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
+			if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
 				spin_unlock_bh(&ilb->lock);
 				goto done;
 			}
@@ -785,7 +793,7 @@ skip_listen_ht:
 	}
 
 	if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
-		goto unlock;
+		goto out;
 
 	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
@@ -810,13 +818,16 @@ skip_listen_ht:
 				goto next_normal;
 			if (!(r->idiag_states & (1 << sk->sk_state)))
 				goto next_normal;
+			if (r->sdiag_family != AF_UNSPEC &&
+					sk->sk_family != r->sdiag_family)
+				goto next_normal;
 			if (r->id.idiag_sport != inet->inet_sport &&
 			    r->id.idiag_sport)
 				goto next_normal;
 			if (r->id.idiag_dport != inet->inet_dport &&
 			    r->id.idiag_dport)
 				goto next_normal;
-			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
+			if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
 				spin_unlock_bh(lock);
 				goto done;
 			}
@@ -832,13 +843,16 @@ next_normal:
 
 			if (num < s_num)
 				goto next_dying;
+			if (r->sdiag_family != AF_UNSPEC &&
+					tw->tw_family != r->sdiag_family)
+				goto next_dying;
 			if (r->id.idiag_sport != tw->tw_sport &&
 			    r->id.idiag_sport)
 				goto next_dying;
 			if (r->id.idiag_dport != tw->tw_dport &&
 			    r->id.idiag_dport)
 				goto next_dying;
-			if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
+			if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) {
 				spin_unlock_bh(lock);
 				goto done;
 			}
@@ -852,12 +866,82 @@ next_dying:
852done: 866done:
853 cb->args[1] = i; 867 cb->args[1] = i;
854 cb->args[2] = num; 868 cb->args[2] = num;
855unlock: 869out:
870 ;
871}
872EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
873
874static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
875 struct inet_diag_req_v2 *r, struct nlattr *bc)
876{
877 const struct inet_diag_handler *handler;
878
879 handler = inet_diag_lock_handler(r->sdiag_protocol);
880 if (!IS_ERR(handler))
881 handler->dump(skb, cb, r, bc);
856 inet_diag_unlock_handler(handler); 882 inet_diag_unlock_handler(handler);
883
857 return skb->len; 884 return skb->len;
858} 885}
859 886
860static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 887static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
888{
889 struct nlattr *bc = NULL;
890 int hdrlen = sizeof(struct inet_diag_req_v2);
891
892 if (nlmsg_attrlen(cb->nlh, hdrlen))
893 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
894
895 return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc);
896}
897
898static inline int inet_diag_type2proto(int type)
899{
900 switch (type) {
901 case TCPDIAG_GETSOCK:
902 return IPPROTO_TCP;
903 case DCCPDIAG_GETSOCK:
904 return IPPROTO_DCCP;
905 default:
906 return 0;
907 }
908}
909
910static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
911{
912 struct inet_diag_req *rc = NLMSG_DATA(cb->nlh);
913 struct inet_diag_req_v2 req;
914 struct nlattr *bc = NULL;
915 int hdrlen = sizeof(struct inet_diag_req);
916
917 req.sdiag_family = AF_UNSPEC; /* compatibility */
918 req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
919 req.idiag_ext = rc->idiag_ext;
920 req.idiag_states = rc->idiag_states;
921 req.id = rc->id;
922
923 if (nlmsg_attrlen(cb->nlh, hdrlen))
924 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
925
926 return __inet_diag_dump(skb, cb, &req, bc);
927}
928
929static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
930 const struct nlmsghdr *nlh)
931{
932 struct inet_diag_req *rc = NLMSG_DATA(nlh);
933 struct inet_diag_req_v2 req;
934
935 req.sdiag_family = rc->idiag_family;
936 req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
937 req.idiag_ext = rc->idiag_ext;
938 req.idiag_states = rc->idiag_states;
939 req.id = rc->id;
940
941 return inet_diag_get_exact(in_skb, nlh, &req);
942}
943
944static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
861{ 945{
862 int hdrlen = sizeof(struct inet_diag_req); 946 int hdrlen = sizeof(struct inet_diag_req);
863 947
@@ -877,28 +961,54 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
877 return -EINVAL; 961 return -EINVAL;
878 } 962 }
879 963
880 return netlink_dump_start(idiagnl, skb, nlh, 964 return netlink_dump_start(sock_diag_nlsk, skb, nlh,
881 inet_diag_dump, NULL, 0); 965 inet_diag_dump_compat, NULL, 0);
882 } 966 }
883 967
884 return inet_diag_get_exact(skb, nlh); 968 return inet_diag_get_exact_compat(skb, nlh);
885} 969}
886 970
887static DEFINE_MUTEX(inet_diag_mutex); 971static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
888
889static void inet_diag_rcv(struct sk_buff *skb)
890{ 972{
891 mutex_lock(&inet_diag_mutex); 973 int hdrlen = sizeof(struct inet_diag_req_v2);
892 netlink_rcv_skb(skb, &inet_diag_rcv_msg); 974
893 mutex_unlock(&inet_diag_mutex); 975 if (nlmsg_len(h) < hdrlen)
976 return -EINVAL;
977
978 if (h->nlmsg_flags & NLM_F_DUMP) {
979 if (nlmsg_attrlen(h, hdrlen)) {
980 struct nlattr *attr;
981 attr = nlmsg_find_attr(h, hdrlen,
982 INET_DIAG_REQ_BYTECODE);
983 if (attr == NULL ||
984 nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
985 inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
986 return -EINVAL;
987 }
988
989 return netlink_dump_start(sock_diag_nlsk, skb, h,
990 inet_diag_dump, NULL, 0);
991 }
992
993 return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h));
894} 994}
895 995
996static struct sock_diag_handler inet_diag_handler = {
997 .family = AF_INET,
998 .dump = inet_diag_handler_dump,
999};
1000
1001static struct sock_diag_handler inet6_diag_handler = {
1002 .family = AF_INET6,
1003 .dump = inet_diag_handler_dump,
1004};
1005
896int inet_diag_register(const struct inet_diag_handler *h) 1006int inet_diag_register(const struct inet_diag_handler *h)
897{ 1007{
898 const __u16 type = h->idiag_type; 1008 const __u16 type = h->idiag_type;
899 int err = -EINVAL; 1009 int err = -EINVAL;
900 1010
901 if (type >= INET_DIAG_GETSOCK_MAX) 1011 if (type >= IPPROTO_MAX)
902 goto out; 1012 goto out;
903 1013
904 mutex_lock(&inet_diag_table_mutex); 1014 mutex_lock(&inet_diag_table_mutex);
@@ -917,7 +1027,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
917{ 1027{
918 const __u16 type = h->idiag_type; 1028 const __u16 type = h->idiag_type;
919 1029
920 if (type >= INET_DIAG_GETSOCK_MAX) 1030 if (type >= IPPROTO_MAX)
921 return; 1031 return;
922 1032
923 mutex_lock(&inet_diag_table_mutex); 1033 mutex_lock(&inet_diag_table_mutex);
@@ -928,7 +1038,7 @@ EXPORT_SYMBOL_GPL(inet_diag_unregister);
928 1038
929static int __init inet_diag_init(void) 1039static int __init inet_diag_init(void)
930{ 1040{
931 const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * 1041 const int inet_diag_table_size = (IPPROTO_MAX *
932 sizeof(struct inet_diag_handler *)); 1042 sizeof(struct inet_diag_handler *));
933 int err = -ENOMEM; 1043 int err = -ENOMEM;
934 1044
@@ -936,25 +1046,35 @@ static int __init inet_diag_init(void)
936 if (!inet_diag_table) 1046 if (!inet_diag_table)
937 goto out; 1047 goto out;
938 1048
939 idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0, 1049 err = sock_diag_register(&inet_diag_handler);
940 inet_diag_rcv, NULL, THIS_MODULE); 1050 if (err)
941 if (idiagnl == NULL) 1051 goto out_free_nl;
942 goto out_free_table; 1052
943 err = 0; 1053 err = sock_diag_register(&inet6_diag_handler);
1054 if (err)
1055 goto out_free_inet;
1056
1057 sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
944out: 1058out:
945 return err; 1059 return err;
946out_free_table: 1060
1061out_free_inet:
1062 sock_diag_unregister(&inet_diag_handler);
1063out_free_nl:
947 kfree(inet_diag_table); 1064 kfree(inet_diag_table);
948 goto out; 1065 goto out;
949} 1066}
950 1067
951static void __exit inet_diag_exit(void) 1068static void __exit inet_diag_exit(void)
952{ 1069{
953 netlink_kernel_release(idiagnl); 1070 sock_diag_unregister(&inet6_diag_handler);
1071 sock_diag_unregister(&inet_diag_handler);
1072 sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
954 kfree(inet_diag_table); 1073 kfree(inet_diag_table);
955} 1074}
956 1075
957module_init(inet_diag_init); 1076module_init(inet_diag_init);
958module_exit(inet_diag_exit); 1077module_exit(inet_diag_exit);
959MODULE_LICENSE("GPL"); 1078MODULE_LICENSE("GPL");
960MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_INET_DIAG); 1079MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
1080MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
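The sdiag_family/sdiag_protocol pair in struct inet_diag_req_v2 is what userspace now fills in to address a dump. A minimal sketch of a TCP socket dump over the new NETLINK_SOCK_DIAG channel, assuming the uapi headers from this series (error handling trimmed, output format illustrative):

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg = {
		.nlh = {
			.nlmsg_len   = sizeof(msg),
			.nlmsg_type  = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family   = AF_INET,
			.sdiag_protocol = IPPROTO_TCP,
			.idiag_states   = -1,	/* every TCP state */
		},
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	char buf[8192];

	if (fd < 0 || send(fd, &msg, sizeof(msg), 0) < 0)
		return 1;

	for (;;) {
		int len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		if (len <= 0)
			return 1;
		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			struct inet_diag_msg *r = NLMSG_DATA(h);

			if (h->nlmsg_type == NLMSG_DONE)
				return 0;
			if (h->nlmsg_type == NLMSG_ERROR)
				return 1;
			printf("state %u sport %u dport %u\n",
			       r->idiag_state,
			       ntohs(r->id.idiag_sport),
			       ntohs(r->id.idiag_dport));
		}
	}
}
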
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 86f13c67ea85..d4d61b694fab 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -17,6 +17,7 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/net.h> 19#include <linux/net.h>
20#include <linux/workqueue.h>
20#include <net/ip.h> 21#include <net/ip.h>
21#include <net/inetpeer.h> 22#include <net/inetpeer.h>
22#include <net/secure_seq.h> 23#include <net/secure_seq.h>
@@ -66,6 +67,11 @@
66 67
67static struct kmem_cache *peer_cachep __read_mostly; 68static struct kmem_cache *peer_cachep __read_mostly;
68 69
70static LIST_HEAD(gc_list);
71static const int gc_delay = 60 * HZ;
72static struct delayed_work gc_work;
73static DEFINE_SPINLOCK(gc_lock);
74
69#define node_height(x) x->avl_height 75#define node_height(x) x->avl_height
70 76
71#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) 77#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
@@ -102,6 +108,50 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
102int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ 108int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
103int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ 109int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
104 110
111static void inetpeer_gc_worker(struct work_struct *work)
112{
113 struct inet_peer *p, *n;
114 LIST_HEAD(list);
115
116 spin_lock_bh(&gc_lock);
117 list_replace_init(&gc_list, &list);
118 spin_unlock_bh(&gc_lock);
119
120 if (list_empty(&list))
121 return;
122
123 list_for_each_entry_safe(p, n, &list, gc_list) {
124
125 if(need_resched())
126 cond_resched();
127
128 if (p->avl_left != peer_avl_empty) {
129 list_add_tail(&p->avl_left->gc_list, &list);
130 p->avl_left = peer_avl_empty;
131 }
132
133 if (p->avl_right != peer_avl_empty) {
134 list_add_tail(&p->avl_right->gc_list, &list);
135 p->avl_right = peer_avl_empty;
136 }
137
138 n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
139
140 if (!atomic_read(&p->refcnt)) {
141 list_del(&p->gc_list);
142 kmem_cache_free(peer_cachep, p);
143 }
144 }
145
146 if (list_empty(&list))
147 return;
148
149 spin_lock_bh(&gc_lock);
150 list_splice(&list, &gc_list);
151 spin_unlock_bh(&gc_lock);
152
153 schedule_delayed_work(&gc_work, gc_delay);
154}
105 155
106/* Called from ip_output.c:ip_init */ 156/* Called from ip_output.c:ip_init */
107void __init inet_initpeers(void) 157void __init inet_initpeers(void)
@@ -126,6 +176,7 @@ void __init inet_initpeers(void)
126 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, 176 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
127 NULL); 177 NULL);
128 178
179 INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker);
129} 180}
130 181
131static int addr_compare(const struct inetpeer_addr *a, 182static int addr_compare(const struct inetpeer_addr *a,
@@ -136,7 +187,7 @@ static int addr_compare(const struct inetpeer_addr *a,
136 for (i = 0; i < n; i++) { 187 for (i = 0; i < n; i++) {
137 if (a->addr.a6[i] == b->addr.a6[i]) 188 if (a->addr.a6[i] == b->addr.a6[i])
138 continue; 189 continue;
139 if (a->addr.a6[i] < b->addr.a6[i]) 190 if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
140 return -1; 191 return -1;
141 return 1; 192 return 1;
142 } 193 }
@@ -448,7 +499,7 @@ relookup:
448 p->pmtu_expires = 0; 499 p->pmtu_expires = 0;
449 p->pmtu_orig = 0; 500 p->pmtu_orig = 0;
450 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); 501 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
451 502 INIT_LIST_HEAD(&p->gc_list);
452 503
453 /* Link the node. */ 504 /* Link the node. */
454 link_to_pool(p, base); 505 link_to_pool(p, base);
@@ -508,3 +559,30 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
508 return rc; 559 return rc;
509} 560}
510EXPORT_SYMBOL(inet_peer_xrlim_allow); 561EXPORT_SYMBOL(inet_peer_xrlim_allow);
562
563void inetpeer_invalidate_tree(int family)
564{
565 struct inet_peer *old, *new, *prev;
566 struct inet_peer_base *base = family_to_base(family);
567
568 write_seqlock_bh(&base->lock);
569
570 old = base->root;
571 if (old == peer_avl_empty_rcu)
572 goto out;
573
574 new = peer_avl_empty_rcu;
575
576 prev = cmpxchg(&base->root, old, new);
577 if (prev == old) {
578 base->total = 0;
579 spin_lock(&gc_lock);
580 list_add_tail(&prev->gc_list, &gc_list);
581 spin_unlock(&gc_lock);
582 schedule_delayed_work(&gc_work, gc_delay);
583 }
584
585out:
586 write_sequnlock_bh(&base->lock);
587}
588EXPORT_SYMBOL(inetpeer_invalidate_tree);
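The invalidation path unhooks the whole AVL tree with one cmpxchg() and hands it to the worker above, which flattens it onto the list, frees the nodes whose refcount has already dropped to zero, and requeues itself for the stragglers. A stripped-down, single-threaded userspace analogue of that retry loop (names illustrative; the kernel version additionally serializes the list with gc_lock and defers to a workqueue):

#include <stdio.h>
#include <stdlib.h>

struct peer {
	int refcnt;		/* 0 means no remaining users */
	struct peer *next;	/* gc list linkage */
};

static struct peer *gc_list;

static void gc_pass(void)
{
	struct peer **pp = &gc_list;

	while (*pp) {
		struct peer *p = *pp;

		if (p->refcnt == 0) {	/* last user gone: safe to free */
			*pp = p->next;
			free(p);
		} else {		/* still in use: retry next pass */
			pp = &p->next;
		}
	}
}

int main(void)
{
	struct peer *a = calloc(1, sizeof(*a));
	struct peer *b = calloc(1, sizeof(*b));

	b->refcnt = 1;		/* pretend someone still holds b */
	a->next = b;
	gc_list = a;

	gc_pass();		/* frees a, keeps b */
	printf("after pass 1: list %sempty\n", gc_list ? "not " : "");
	b->refcnt = 0;
	gc_pass();		/* now b goes too */
	printf("after pass 2: list %sempty\n", gc_list ? "not " : "");
	return 0;
}
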
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fdaabf2f2b68..1f23a57aa9e6 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -392,7 +392,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
392 /* Is this the final fragment? */ 392 /* Is this the final fragment? */
393 if ((flags & IP_MF) == 0) { 393 if ((flags & IP_MF) == 0) {
394 /* If we already have some bits beyond end 394 /* If we already have some bits beyond end
395 * or have different end, the segment is corrrupted. 395 * or have different end, the segment is corrupted.
396 */ 396 */
397 if (end < qp->q.len || 397 if (end < qp->q.len ||
398 ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) 398 ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d55110e93120..38673d2860e2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -46,7 +46,7 @@
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/gre.h> 47#include <net/gre.h>
48 48
49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 49#if IS_ENABLED(CONFIG_IPV6)
50#include <net/ipv6.h> 50#include <net/ipv6.h>
51#include <net/ip6_fib.h> 51#include <net/ip6_fib.h>
52#include <net/ip6_route.h> 52#include <net/ip6_route.h>
@@ -65,7 +65,7 @@
65 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
66 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
67 and silently drop packet when it expires. It is a good 67 and silently drop packet when it expires. It is a good
68 solution, but it supposes maintaing new variable in ALL 68 solution, but it supposes maintaining new variable in ALL
69 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
70 70
71 Current solution: xmit_recursion breaks dead loops. This is a percpu 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
@@ -91,14 +91,14 @@
91 91
92 One of them is to parse packet trying to detect inner encapsulation 92 One of them is to parse packet trying to detect inner encapsulation
93 made by our node. It is difficult or even impossible, especially, 93 made by our node. It is difficult or even impossible, especially,
94 taking into account fragmentation. TO be short, tt is not solution at all. 94 taking into account fragmentation. TO be short, ttl is not solution at all.
95 95
96 Current solution: The solution was UNEXPECTEDLY SIMPLE. 96 Current solution: The solution was UNEXPECTEDLY SIMPLE.
97 We force DF flag on tunnels with preconfigured hop limit, 97 We force DF flag on tunnels with preconfigured hop limit,
98 that is ALL. :-) Well, it does not remove the problem completely, 98 that is ALL. :-) Well, it does not remove the problem completely,
99 but exponential growth of network traffic is changed to linear 99 but exponential growth of network traffic is changed to linear
100 (branches, that exceed pmtu are pruned) and tunnel mtu 100 (branches, that exceed pmtu are pruned) and tunnel mtu
101 fastly degrades to value <68, where looping stops. 101 rapidly degrades to value <68, where looping stops.
102 Yes, it is not good if there exists a router in the loop, 102 Yes, it is not good if there exists a router in the loop,
103 which does not force DF, even when encapsulating packets have DF set. 103 which does not force DF, even when encapsulating packets have DF set.
104 But it is not our problem! Nobody could accuse us, we made 104 But it is not our problem! Nobody could accuse us, we made
@@ -171,7 +171,7 @@ struct pcpu_tstats {
171 unsigned long rx_bytes; 171 unsigned long rx_bytes;
172 unsigned long tx_packets; 172 unsigned long tx_packets;
173 unsigned long tx_bytes; 173 unsigned long tx_bytes;
174}; 174} __attribute__((aligned(4*sizeof(unsigned long))));
175 175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev) 176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177{ 177{
@@ -422,6 +422,10 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
422 if (register_netdevice(dev) < 0) 422 if (register_netdevice(dev) < 0)
423 goto failed_free; 423 goto failed_free;
424 424
425 /* Can use a lockless transmit, unless we generate output sequences */
426 if (!(nt->parms.o_flags & GRE_SEQ))
427 dev->features |= NETIF_F_LLTX;
428
425 dev_hold(dev); 429 dev_hold(dev);
426 ipgre_tunnel_link(ign, nt); 430 ipgre_tunnel_link(ign, nt);
427 return nt; 431 return nt;
@@ -453,8 +457,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
453 GRE tunnels with enabled checksum. Tell them "thank you". 457 GRE tunnels with enabled checksum. Tell them "thank you".
454 458
455 Well, I wonder, rfc1812 was written by Cisco employee, 459 Well, I wonder, rfc1812 was written by Cisco employee,
456 what the hell these idiots break standrads established 460 what the hell these idiots break standards established
457 by themself??? 461 by themselves???
458 */ 462 */
459 463
460 const struct iphdr *iph = (const struct iphdr *)skb->data; 464 const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -729,9 +733,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
729 if ((dst = rt->rt_gateway) == 0) 733 if ((dst = rt->rt_gateway) == 0)
730 goto tx_error_icmp; 734 goto tx_error_icmp;
731 } 735 }
732#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 736#if IS_ENABLED(CONFIG_IPV6)
733 else if (skb->protocol == htons(ETH_P_IPV6)) { 737 else if (skb->protocol == htons(ETH_P_IPV6)) {
734 struct neighbour *neigh = dst_get_neighbour(skb_dst(skb)); 738 struct neighbour *neigh = dst_get_neighbour_noref(skb_dst(skb));
735 const struct in6_addr *addr6; 739 const struct in6_addr *addr6;
736 int addr_type; 740 int addr_type;
737 741
@@ -799,7 +803,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
799 goto tx_error; 803 goto tx_error;
800 } 804 }
801 } 805 }
802#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 806#if IS_ENABLED(CONFIG_IPV6)
803 else if (skb->protocol == htons(ETH_P_IPV6)) { 807 else if (skb->protocol == htons(ETH_P_IPV6)) {
804 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 808 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
805 809
@@ -835,6 +839,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
835 if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| 839 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
836 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { 840 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
837 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 841 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
842 if (max_headroom > dev->needed_headroom)
843 dev->needed_headroom = max_headroom;
838 if (!new_skb) { 844 if (!new_skb) {
839 ip_rt_put(rt); 845 ip_rt_put(rt);
840 dev->stats.tx_dropped++; 846 dev->stats.tx_dropped++;
@@ -873,7 +879,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
873 if ((iph->ttl = tiph->ttl) == 0) { 879 if ((iph->ttl = tiph->ttl) == 0) {
874 if (skb->protocol == htons(ETH_P_IP)) 880 if (skb->protocol == htons(ETH_P_IP))
875 iph->ttl = old_iph->ttl; 881 iph->ttl = old_iph->ttl;
876#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 882#if IS_ENABLED(CONFIG_IPV6)
877 else if (skb->protocol == htons(ETH_P_IPV6)) 883 else if (skb->protocol == htons(ETH_P_IPV6))
878 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; 884 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
879#endif 885#endif
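A note on the pcpu_tstats change earlier in this file: __attribute__((aligned(4 * sizeof(unsigned long)))) raises the structure's alignment to its own natural size, so each per-cpu instance starts on its own four-counter boundary and two CPUs' counters never straddle the same block. The effect is easy to check in a standalone sketch (not kernel code):

#include <stdio.h>

struct pcpu_tstats {
	unsigned long rx_packets;
	unsigned long rx_bytes;
	unsigned long tx_packets;
	unsigned long tx_bytes;
} __attribute__((aligned(4 * sizeof(unsigned long))));

int main(void)
{
	/* on LP64: size 32, align 32, instead of the default align 8 */
	printf("size %zu align %zu\n",
	       sizeof(struct pcpu_tstats), _Alignof(struct pcpu_tstats));
	return 0;
}
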
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1e60f7679075..42dd1a90edea 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -573,8 +573,8 @@ void ip_forward_options(struct sk_buff *skb)
573 } 573 }
574 if (srrptr + 3 <= srrspace) { 574 if (srrptr + 3 <= srrspace) {
575 opt->is_changed = 1; 575 opt->is_changed = 1;
576 ip_rt_get_source(&optptr[srrptr-1], skb, rt);
577 ip_hdr(skb)->daddr = opt->nexthop; 576 ip_hdr(skb)->daddr = opt->nexthop;
577 ip_rt_get_source(&optptr[srrptr-1], skb, rt);
578 optptr[2] = srrptr+4; 578 optptr[2] = srrptr+4;
579 } else if (net_ratelimit()) 579 } else if (net_ratelimit())
580 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); 580 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0bc95f3977d2..ff302bde8890 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -206,7 +206,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
206 } 206 }
207 207
208 rcu_read_lock(); 208 rcu_read_lock();
209 neigh = dst_get_neighbour(dst); 209 neigh = dst_get_neighbour_noref(dst);
210 if (neigh) { 210 if (neigh) {
211 int res = neigh_output(neigh, skb); 211 int res = neigh_output(neigh, skb);
212 212
@@ -319,6 +319,20 @@ int ip_output(struct sk_buff *skb)
319 !(IPCB(skb)->flags & IPSKB_REROUTED)); 319 !(IPCB(skb)->flags & IPSKB_REROUTED));
320} 320}
321 321
322/*
323 * copy saddr and daddr, possibly using 64bit load/stores
324 * Equivalent to :
325 * iph->saddr = fl4->saddr;
326 * iph->daddr = fl4->daddr;
327 */
328static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
329{
330 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
331 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
332 memcpy(&iph->saddr, &fl4->saddr,
333 sizeof(fl4->saddr) + sizeof(fl4->daddr));
334}
335
322int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) 336int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
323{ 337{
324 struct sock *sk = skb->sk; 338 struct sock *sk = skb->sk;
@@ -381,8 +395,8 @@ packet_routed:
381 iph->frag_off = 0; 395 iph->frag_off = 0;
382 iph->ttl = ip_select_ttl(inet, &rt->dst); 396 iph->ttl = ip_select_ttl(inet, &rt->dst);
383 iph->protocol = sk->sk_protocol; 397 iph->protocol = sk->sk_protocol;
384 iph->saddr = fl4->saddr; 398 ip_copy_addrs(iph, fl4);
385 iph->daddr = fl4->daddr; 399
386 /* Transport layer set skb->h.foo itself. */ 400 /* Transport layer set skb->h.foo itself. */
387 401
388 if (inet_opt && inet_opt->opt.optlen) { 402 if (inet_opt && inet_opt->opt.optlen) {
@@ -1337,8 +1351,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1337 ip_select_ident(iph, &rt->dst, sk); 1351 ip_select_ident(iph, &rt->dst, sk);
1338 iph->ttl = ttl; 1352 iph->ttl = ttl;
1339 iph->protocol = sk->sk_protocol; 1353 iph->protocol = sk->sk_protocol;
1340 iph->saddr = fl4->saddr; 1354 ip_copy_addrs(iph, fl4);
1341 iph->daddr = fl4->daddr;
1342 1355
1343 if (opt) { 1356 if (opt) {
1344 iph->ihl += opt->optlen>>2; 1357 iph->ihl += opt->optlen>>2;
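The BUILD_BUG_ON() in ip_copy_addrs() is what makes the memcpy() safe: it proves at compile time that daddr immediately follows saddr, so a single copy of sizeof(saddr) + sizeof(daddr) bytes covers both fields and the compiler is free to fuse it into one 64-bit store. The same shape in plain C11, with illustrative struct names and _Static_assert standing in for BUILD_BUG_ON:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct flow_addrs { uint32_t saddr, daddr; };
struct hdr_addrs  { uint32_t saddr, daddr; };

static void copy_addrs(struct hdr_addrs *h, const struct flow_addrs *f)
{
	/* compile-time proof that the two fields are adjacent */
	_Static_assert(offsetof(struct flow_addrs, daddr) ==
		       offsetof(struct flow_addrs, saddr) + sizeof(uint32_t),
		       "saddr/daddr must be adjacent");
	memcpy(&h->saddr, &f->saddr, sizeof(f->saddr) + sizeof(f->daddr));
}

int main(void)
{
	struct flow_addrs f = { 0x01020304, 0x05060708 };
	struct hdr_addrs h;

	copy_addrs(&h, &f);
	printf("%x %x\n", (unsigned)h.saddr, (unsigned)h.daddr);
	return 0;
}
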
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 09ff51bf16a4..8aa87c19fa00 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -37,7 +37,7 @@
37#include <net/route.h> 37#include <net/route.h>
38#include <net/xfrm.h> 38#include <net/xfrm.h>
39#include <net/compat.h> 39#include <net/compat.h>
40#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 40#if IS_ENABLED(CONFIG_IPV6)
41#include <net/transp_v6.h> 41#include <net/transp_v6.h>
42#endif 42#endif
43 43
@@ -55,20 +55,13 @@
55/* 55/*
56 * SOL_IP control messages. 56 * SOL_IP control messages.
57 */ 57 */
58#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb))
58 59
59static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) 60static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
60{ 61{
61 struct in_pktinfo info; 62 struct in_pktinfo info = *PKTINFO_SKB_CB(skb);
62 struct rtable *rt = skb_rtable(skb);
63 63
64 info.ipi_addr.s_addr = ip_hdr(skb)->daddr; 64 info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
65 if (rt) {
66 info.ipi_ifindex = rt->rt_iif;
67 info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
68 } else {
69 info.ipi_ifindex = 0;
70 info.ipi_spec_dst.s_addr = 0;
71 }
72 65
73 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); 66 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
74} 67}
@@ -515,7 +508,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
515 sock_owned_by_user(sk)); 508 sock_owned_by_user(sk));
516 if (inet->is_icsk) { 509 if (inet->is_icsk) {
517 struct inet_connection_sock *icsk = inet_csk(sk); 510 struct inet_connection_sock *icsk = inet_csk(sk);
518#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 511#if IS_ENABLED(CONFIG_IPV6)
519 if (sk->sk_family == PF_INET || 512 if (sk->sk_family == PF_INET ||
520 (!((1 << sk->sk_state) & 513 (!((1 << sk->sk_state) &
521 (TCPF_LISTEN | TCPF_CLOSE)) && 514 (TCPF_LISTEN | TCPF_CLOSE)) &&
@@ -526,7 +519,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
526 if (opt) 519 if (opt)
527 icsk->icsk_ext_hdr_len += opt->opt.optlen; 520 icsk->icsk_ext_hdr_len += opt->opt.optlen;
528 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); 521 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
529#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 522#if IS_ENABLED(CONFIG_IPV6)
530 } 523 }
531#endif 524#endif
532 } 525 }
@@ -992,20 +985,28 @@ e_inval:
992} 985}
993 986
994/** 987/**
995 * ip_queue_rcv_skb - Queue an skb into sock receive queue 988 * ipv4_pktinfo_prepare - transfer some info from rtable to skb
996 * @sk: socket 989 * @sk: socket
997 * @skb: buffer 990 * @skb: buffer
998 * 991 *
999 * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option 992 * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst
1000 * is not set, we drop skb dst entry now, while dst cache line is hot. 993 * in skb->cb[] before dst drop.
994 * This way, the receiver doesn't take cache line misses to read rtable.
1001 */ 995 */
1002int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 996void ipv4_pktinfo_prepare(struct sk_buff *skb)
1003{ 997{
1004 if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)) 998 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
1005 skb_dst_drop(skb); 999 const struct rtable *rt = skb_rtable(skb);
1006 return sock_queue_rcv_skb(sk, skb); 1000
1001 if (rt) {
1002 pktinfo->ipi_ifindex = rt->rt_iif;
1003 pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst;
1004 } else {
1005 pktinfo->ipi_ifindex = 0;
1006 pktinfo->ipi_spec_dst.s_addr = 0;
1007 }
1008 skb_dst_drop(skb);
1007} 1009}
1008EXPORT_SYMBOL(ip_queue_rcv_skb);
1009 1010
1010int ip_setsockopt(struct sock *sk, int level, 1011int ip_setsockopt(struct sock *sk, int level,
1011 int optname, char __user *optval, unsigned int optlen) 1012 int optname, char __user *optval, unsigned int optlen)
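For reference, the consumer this feeds is the classic IP_PKTINFO receive loop; after this patch the cached ipi_ifindex/ipi_spec_dst come out of skb->cb[] instead of a cold rtable cache line. A minimal receiver sketch (error handling trimmed; _GNU_SOURCE is for struct in_pktinfo on older glibc):

#define _GNU_SOURCE
#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0), one = 1;
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_port   = htons(5555),
	};
	char data[1500];
	union {			/* keep the control buffer aligned */
		char buf[CMSG_SPACE(sizeof(struct in_pktinfo))];
		struct cmsghdr align;
	} u;
	struct iovec iov = { data, sizeof(data) };
	struct msghdr msg = {
		.msg_iov        = &iov,
		.msg_iovlen     = 1,
		.msg_control    = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;

	setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
	if (recvmsg(fd, &msg, 0) < 0)
		return 1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type == IP_PKTINFO) {
			struct in_pktinfo *pi = (struct in_pktinfo *)CMSG_DATA(cmsg);

			printf("ifindex %d\n", pi->ipi_ifindex);
		}
	}
	return 0;
}
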
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 99ec116bef14..6e412a60a91f 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -141,7 +141,7 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */
141__be32 root_server_addr = NONE; /* Address of NFS server */ 141__be32 root_server_addr = NONE; /* Address of NFS server */
142u8 root_server_path[256] = { 0, }; /* Path to mount as root */ 142u8 root_server_path[256] = { 0, }; /* Path to mount as root */
143 143
144u32 ic_dev_xid; /* Device under configuration */ 144__be32 ic_dev_xid; /* Device under configuration */
145 145
146/* vendor class identifier */ 146/* vendor class identifier */
147static char vendor_class_identifier[253] __initdata; 147static char vendor_class_identifier[253] __initdata;
@@ -767,13 +767,15 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
767 struct sk_buff *skb; 767 struct sk_buff *skb;
768 struct bootp_pkt *b; 768 struct bootp_pkt *b;
769 struct iphdr *h; 769 struct iphdr *h;
770 int hlen = LL_RESERVED_SPACE(dev);
771 int tlen = dev->needed_tailroom;
770 772
771 /* Allocate packet */ 773 /* Allocate packet */
772 skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15, 774 skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15,
773 GFP_KERNEL); 775 GFP_KERNEL);
774 if (!skb) 776 if (!skb)
775 return; 777 return;
776 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 778 skb_reserve(skb, hlen);
777 b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); 779 b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
778 memset(b, 0, sizeof(struct bootp_pkt)); 780 memset(b, 0, sizeof(struct bootp_pkt));
779 781
@@ -826,8 +828,13 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
826 skb->dev = dev; 828 skb->dev = dev;
827 skb->protocol = htons(ETH_P_IP); 829 skb->protocol = htons(ETH_P_IP);
828 if (dev_hard_header(skb, dev, ntohs(skb->protocol), 830 if (dev_hard_header(skb, dev, ntohs(skb->protocol),
829 dev->broadcast, dev->dev_addr, skb->len) < 0 || 831 dev->broadcast, dev->dev_addr, skb->len) < 0) {
830 dev_queue_xmit(skb) < 0) 832 kfree_skb(skb);
833 printk("E");
834 return;
835 }
836
837 if (dev_queue_xmit(skb) < 0)
831 printk("E"); 838 printk("E");
832} 839}
833 840
@@ -852,9 +859,9 @@ static int __init ic_bootp_string(char *dest, char *src, int len, int max)
852 */ 859 */
853static void __init ic_do_bootp_ext(u8 *ext) 860static void __init ic_do_bootp_ext(u8 *ext)
854{ 861{
855 u8 servers; 862 u8 servers;
856 int i; 863 int i;
857 u16 mtu; 864 __be16 mtu;
858 865
859#ifdef IPCONFIG_DEBUG 866#ifdef IPCONFIG_DEBUG
860 u8 *c; 867 u8 *c;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 0b2e7329abda..22a199315309 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -148,7 +148,7 @@ struct pcpu_tstats {
148 unsigned long rx_bytes; 148 unsigned long rx_bytes;
149 unsigned long tx_packets; 149 unsigned long tx_packets;
150 unsigned long tx_bytes; 150 unsigned long tx_bytes;
151}; 151} __attribute__((aligned(4*sizeof(unsigned long))));
152 152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev) 153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
154{ 154{
@@ -231,7 +231,7 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
231 (iter = rtnl_dereference(*tp)) != NULL; 231 (iter = rtnl_dereference(*tp)) != NULL;
232 tp = &iter->next) { 232 tp = &iter->next) {
233 if (t == iter) { 233 if (t == iter) {
234 RCU_INIT_POINTER(*tp, t->next); 234 rcu_assign_pointer(*tp, t->next);
235 break; 235 break;
236 } 236 }
237 } 237 }
@@ -241,8 +241,8 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
241{ 241{
242 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); 242 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
243 243
244 RCU_INIT_POINTER(t->next, rtnl_dereference(*tp)); 244 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
245 RCU_INIT_POINTER(*tp, t); 245 rcu_assign_pointer(*tp, t);
246} 246}
247 247
248static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -792,7 +792,7 @@ static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
792 return -ENOMEM; 792 return -ENOMEM;
793 793
794 dev_hold(dev); 794 dev_hold(dev);
795 RCU_INIT_POINTER(ipn->tunnels_wc[0], tunnel); 795 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
796 return 0; 796 return 0;
797} 797}
798 798
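The rcu_assign_pointer() conversions in this file (and in ipmr.c below) are about ordering, not style: publishing a freshly initialized tunnel must carry a release barrier so that a concurrent RCU reader who sees the new pointer also sees the initialized fields, while RCU_INIT_POINTER() is the barrier-free store reserved for NULL or not-yet-visible pointers. A userspace analogue of the safe variant in C11 atomics (not the kernel primitives themselves):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct tunnel { int key; };

static _Atomic(struct tunnel *) head;

static void publish(int key)
{
	struct tunnel *t = malloc(sizeof(*t));

	t->key = key;		/* initialize first... */
	/* ...then publish with release semantics, as rcu_assign_pointer()
	 * does; a plain relaxed store here would be the misuse these
	 * hunks are reverting */
	atomic_store_explicit(&head, t, memory_order_release);
}

int main(void)
{
	publish(42);
	/* a reader pairs this acquire load with the release store above */
	struct tunnel *t = atomic_load_explicit(&head, memory_order_acquire);

	printf("%d\n", t->key);
	return 0;
}
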
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 76a7f07b38b6..7bc2db6db8d4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1225,7 +1225,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1225 1225
1226 ret = ip_ra_control(sk, 1, mrtsock_destruct); 1226 ret = ip_ra_control(sk, 1, mrtsock_destruct);
1227 if (ret == 0) { 1227 if (ret == 0) {
1228 RCU_INIT_POINTER(mrt->mroute_sk, sk); 1228 rcu_assign_pointer(mrt->mroute_sk, sk);
1229 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1229 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1230 } 1230 }
1231 rtnl_unlock(); 1231 rtnl_unlock();
@@ -1520,7 +1520,6 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1520 struct mr_table *mrt; 1520 struct mr_table *mrt;
1521 struct vif_device *v; 1521 struct vif_device *v;
1522 int ct; 1522 int ct;
1523 LIST_HEAD(list);
1524 1523
1525 if (event != NETDEV_UNREGISTER) 1524 if (event != NETDEV_UNREGISTER)
1526 return NOTIFY_DONE; 1525 return NOTIFY_DONE;
@@ -1529,10 +1528,9 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1529 v = &mrt->vif_table[0]; 1528 v = &mrt->vif_table[0];
1530 for (ct = 0; ct < mrt->maxvif; ct++, v++) { 1529 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1531 if (v->dev == dev) 1530 if (v->dev == dev)
1532 vif_delete(mrt, ct, 1, &list); 1531 vif_delete(mrt, ct, 1, NULL);
1533 } 1532 }
1534 } 1533 }
1535 unregister_netdevice_many(&list);
1536 return NOTIFY_DONE; 1534 return NOTIFY_DONE;
1537} 1535}
1538 1536
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index f19f2182894c..74dfc9e5211f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -27,7 +27,7 @@ config NF_CONNTRACK_IPV4
27 27
28config NF_CONNTRACK_PROC_COMPAT 28config NF_CONNTRACK_PROC_COMPAT
29 bool "proc/sysctl compatibility with old connection tracking" 29 bool "proc/sysctl compatibility with old connection tracking"
30 depends on NF_CONNTRACK_IPV4 30 depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
31 default y 31 default y
32 help 32 help
33 This option enables /proc and sysctl compatibility with the old 33 This option enables /proc and sysctl compatibility with the old
@@ -76,11 +76,21 @@ config IP_NF_MATCH_AH
76config IP_NF_MATCH_ECN 76config IP_NF_MATCH_ECN
77 tristate '"ecn" match support' 77 tristate '"ecn" match support'
78 depends on NETFILTER_ADVANCED 78 depends on NETFILTER_ADVANCED
79 help 79 select NETFILTER_XT_MATCH_ECN
80 This option adds a `ECN' match, which allows you to match against 80 ---help---
81 the IPv4 and TCP header ECN fields. 81 This is a backwards-compat option for the user's convenience
82 (e.g. when running oldconfig). It selects
83 CONFIG_NETFILTER_XT_MATCH_ECN.
84
85config IP_NF_MATCH_RPFILTER
86 tristate '"rpfilter" reverse path filter match support'
87 depends on NETFILTER_ADVANCED
88 ---help---
89 This option allows you to match packets whose replies would
90 go out via the interface the packet came in.
82 91
83 To compile it as a module, choose M here. If unsure, say N. 92 To compile it as a module, choose M here. If unsure, say N.
93 The module will be called ipt_rpfilter.
84 94
85config IP_NF_MATCH_TTL 95config IP_NF_MATCH_TTL
86 tristate '"ttl" match support' 96 tristate '"ttl" match support'
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index dca2082ec683..213a462b739b 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 49
50# matches 50# matches
51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
52obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o 52obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
53 53
54# targets 54# targets
55obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 55obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index e59aabd0eae4..94d45e1f8882 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -404,6 +404,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
404 int status, type, pid, flags; 404 int status, type, pid, flags;
405 unsigned int nlmsglen, skblen; 405 unsigned int nlmsglen, skblen;
406 struct nlmsghdr *nlh; 406 struct nlmsghdr *nlh;
407 bool enable_timestamp = false;
407 408
408 skblen = skb->len; 409 skblen = skb->len;
409 if (skblen < sizeof(*nlh)) 410 if (skblen < sizeof(*nlh))
@@ -430,7 +431,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
430 if (type <= IPQM_BASE) 431 if (type <= IPQM_BASE)
431 return; 432 return;
432 433
433 if (security_netlink_recv(skb, CAP_NET_ADMIN)) 434 if (!capable(CAP_NET_ADMIN))
434 RCV_SKB_FAIL(-EPERM); 435 RCV_SKB_FAIL(-EPERM);
435 436
436 spin_lock_bh(&queue_lock); 437 spin_lock_bh(&queue_lock);
@@ -441,12 +442,13 @@ __ipq_rcv_skb(struct sk_buff *skb)
441 RCV_SKB_FAIL(-EBUSY); 442 RCV_SKB_FAIL(-EBUSY);
442 } 443 }
443 } else { 444 } else {
444 net_enable_timestamp(); 445 enable_timestamp = true;
445 peer_pid = pid; 446 peer_pid = pid;
446 } 447 }
447 448
448 spin_unlock_bh(&queue_lock); 449 spin_unlock_bh(&queue_lock);
449 450 if (enable_timestamp)
451 net_enable_timestamp();
450 status = ipq_receive_peer(NLMSG_DATA(nlh), type, 452 status = ipq_receive_peer(NLMSG_DATA(nlh), type,
451 nlmsglen - NLMSG_LENGTH(0)); 453 nlmsglen - NLMSG_LENGTH(0));
452 if (status < 0) 454 if (status < 0)
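The ip_queue change above is the usual shape for moving a side effect out of a locked region: record the decision while the lock is held, act on it after the unlock, so net_enable_timestamp() never runs under queue_lock. A userspace sketch of the same pattern with a mutex (names illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static int peer_pid;

static void slow_side_effect(void)
{
	puts("net_enable_timestamp() stand-in");
}

static void register_peer(int pid)
{
	bool enable = false;

	pthread_mutex_lock(&queue_lock);
	if (peer_pid != pid) {		/* new peer: remember the work... */
		peer_pid = pid;
		enable = true;
	}
	pthread_mutex_unlock(&queue_lock);

	if (enable)			/* ...but do it with the lock dropped */
		slow_side_effect();
}

int main(void)
{
	register_peer(42);
	return 0;
}
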
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 9931152a78b5..2f210c79dc87 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -30,9 +30,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
30/* FIXME: Multiple targets. --RR */ 30/* FIXME: Multiple targets. --RR */
31static int masquerade_tg_check(const struct xt_tgchk_param *par) 31static int masquerade_tg_check(const struct xt_tgchk_param *par)
32{ 32{
33 const struct nf_nat_multi_range_compat *mr = par->targinfo; 33 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
34 34
35 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 35 if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
36 pr_debug("bad MAP_IPS.\n"); 36 pr_debug("bad MAP_IPS.\n");
37 return -EINVAL; 37 return -EINVAL;
38 } 38 }
@@ -49,8 +49,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
49 struct nf_conn *ct; 49 struct nf_conn *ct;
50 struct nf_conn_nat *nat; 50 struct nf_conn_nat *nat;
51 enum ip_conntrack_info ctinfo; 51 enum ip_conntrack_info ctinfo;
52 struct nf_nat_range newrange; 52 struct nf_nat_ipv4_range newrange;
53 const struct nf_nat_multi_range_compat *mr; 53 const struct nf_nat_ipv4_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc; 55 __be32 newsrc;
56 56
@@ -79,13 +79,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
79 nat->masq_index = par->out->ifindex; 79 nat->masq_index = par->out->ifindex;
80 80
81 /* Transfer from original range. */ 81 /* Transfer from original range. */
82 newrange = ((struct nf_nat_range) 82 newrange = ((struct nf_nat_ipv4_range)
83 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, 83 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
84 newsrc, newsrc, 84 newsrc, newsrc,
85 mr->range[0].min, mr->range[0].max }); 85 mr->range[0].min, mr->range[0].max });
86 86
87 /* Hand modified range to generic setup. */ 87 /* Hand modified range to generic setup. */
88 return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); 88 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
89} 89}
90 90
91static int 91static int
@@ -139,7 +139,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
139 .name = "MASQUERADE", 139 .name = "MASQUERADE",
140 .family = NFPROTO_IPV4, 140 .family = NFPROTO_IPV4,
141 .target = masquerade_tg, 141 .target = masquerade_tg,
142 .targetsize = sizeof(struct nf_nat_multi_range_compat), 142 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
143 .table = "nat", 143 .table = "nat",
144 .hooks = 1 << NF_INET_POST_ROUTING, 144 .hooks = 1 << NF_INET_POST_ROUTING,
145 .checkentry = masquerade_tg_check, 145 .checkentry = masquerade_tg_check,
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 6cdb298f1035..b5bfbbabf70d 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -24,9 +24,9 @@ MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
24 24
25static int netmap_tg_check(const struct xt_tgchk_param *par) 25static int netmap_tg_check(const struct xt_tgchk_param *par)
26{ 26{
27 const struct nf_nat_multi_range_compat *mr = par->targinfo; 27 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
28 28
29 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { 29 if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) {
30 pr_debug("bad MAP_IPS.\n"); 30 pr_debug("bad MAP_IPS.\n");
31 return -EINVAL; 31 return -EINVAL;
32 } 32 }
@@ -43,8 +43,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
43 struct nf_conn *ct; 43 struct nf_conn *ct;
44 enum ip_conntrack_info ctinfo; 44 enum ip_conntrack_info ctinfo;
45 __be32 new_ip, netmask; 45 __be32 new_ip, netmask;
46 const struct nf_nat_multi_range_compat *mr = par->targinfo; 46 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
47 struct nf_nat_range newrange; 47 struct nf_nat_ipv4_range newrange;
48 48
49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || 49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
50 par->hooknum == NF_INET_POST_ROUTING || 50 par->hooknum == NF_INET_POST_ROUTING ||
@@ -61,8 +61,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
61 new_ip = ip_hdr(skb)->saddr & ~netmask; 61 new_ip = ip_hdr(skb)->saddr & ~netmask;
62 new_ip |= mr->range[0].min_ip & netmask; 62 new_ip |= mr->range[0].min_ip & netmask;
63 63
64 newrange = ((struct nf_nat_range) 64 newrange = ((struct nf_nat_ipv4_range)
65 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, 65 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
66 new_ip, new_ip, 66 new_ip, new_ip,
67 mr->range[0].min, mr->range[0].max }); 67 mr->range[0].min, mr->range[0].max });
68 68
@@ -74,7 +74,7 @@ static struct xt_target netmap_tg_reg __read_mostly = {
74 .name = "NETMAP", 74 .name = "NETMAP",
75 .family = NFPROTO_IPV4, 75 .family = NFPROTO_IPV4,
76 .target = netmap_tg, 76 .target = netmap_tg,
77 .targetsize = sizeof(struct nf_nat_multi_range_compat), 77 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
78 .table = "nat", 78 .table = "nat",
79 .hooks = (1 << NF_INET_PRE_ROUTING) | 79 .hooks = (1 << NF_INET_PRE_ROUTING) |
80 (1 << NF_INET_POST_ROUTING) | 80 (1 << NF_INET_POST_ROUTING) |
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 18a0656505a0..7c0103a5203e 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -28,9 +28,9 @@ MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
28/* FIXME: Take multiple ranges --RR */ 28/* FIXME: Take multiple ranges --RR */
29static int redirect_tg_check(const struct xt_tgchk_param *par) 29static int redirect_tg_check(const struct xt_tgchk_param *par)
30{ 30{
31 const struct nf_nat_multi_range_compat *mr = par->targinfo; 31 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
32 32
33 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 33 if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
34 pr_debug("bad MAP_IPS.\n"); 34 pr_debug("bad MAP_IPS.\n");
35 return -EINVAL; 35 return -EINVAL;
36 } 36 }
@@ -47,8 +47,8 @@ redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
47 struct nf_conn *ct; 47 struct nf_conn *ct;
48 enum ip_conntrack_info ctinfo; 48 enum ip_conntrack_info ctinfo;
49 __be32 newdst; 49 __be32 newdst;
50 const struct nf_nat_multi_range_compat *mr = par->targinfo; 50 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
51 struct nf_nat_range newrange; 51 struct nf_nat_ipv4_range newrange;
52 52
53 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || 53 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
54 par->hooknum == NF_INET_LOCAL_OUT); 54 par->hooknum == NF_INET_LOCAL_OUT);
@@ -76,20 +76,20 @@ redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
76 } 76 }
77 77
78 /* Transfer from original range. */ 78 /* Transfer from original range. */
79 newrange = ((struct nf_nat_range) 79 newrange = ((struct nf_nat_ipv4_range)
80 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, 80 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
81 newdst, newdst, 81 newdst, newdst,
82 mr->range[0].min, mr->range[0].max }); 82 mr->range[0].min, mr->range[0].max });
83 83
84 /* Hand modified range to generic setup. */ 84 /* Hand modified range to generic setup. */
85 return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST); 85 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
86} 86}
87 87
88static struct xt_target redirect_tg_reg __read_mostly = { 88static struct xt_target redirect_tg_reg __read_mostly = {
89 .name = "REDIRECT", 89 .name = "REDIRECT",
90 .family = NFPROTO_IPV4, 90 .family = NFPROTO_IPV4,
91 .target = redirect_tg, 91 .target = redirect_tg,
92 .targetsize = sizeof(struct nf_nat_multi_range_compat), 92 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
93 .table = "nat", 93 .table = "nat",
94 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), 94 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
95 .checkentry = redirect_tg_check, 95 .checkentry = redirect_tg_check,
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index b5508151e547..ba5756d20165 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -65,7 +65,7 @@ static unsigned int flushtimeout = 10;
65module_param(flushtimeout, uint, 0600); 65module_param(flushtimeout, uint, 0600);
66MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); 66MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
67 67
68static int nflog = 1; 68static bool nflog = true;
69module_param(nflog, bool, 0400); 69module_param(nflog, bool, 0400);
70MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); 70MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
71 71
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
deleted file mode 100644
index 2b57e52c746c..000000000000
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/* IP tables module for matching the value of the IPv4 and TCP ECN bits
2 *
3 * (C) 2002 by Harald Welte <laforge@gnumonks.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/in.h>
11#include <linux/ip.h>
12#include <net/ip.h>
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/tcp.h>
16
17#include <linux/netfilter/x_tables.h>
18#include <linux/netfilter_ipv4/ip_tables.h>
19#include <linux/netfilter_ipv4/ipt_ecn.h>
20
21MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
22MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match for IPv4");
23MODULE_LICENSE("GPL");
24
25static inline bool match_ip(const struct sk_buff *skb,
26 const struct ipt_ecn_info *einfo)
27{
28 return ((ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect) ^
29 !!(einfo->invert & IPT_ECN_OP_MATCH_IP);
30}
31
32static inline bool match_tcp(const struct sk_buff *skb,
33 const struct ipt_ecn_info *einfo,
34 bool *hotdrop)
35{
36 struct tcphdr _tcph;
37 const struct tcphdr *th;
38
39 /* In practice, TCP match does this, so can't fail. But let's
40 * be good citizens.
41 */
42 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
43 if (th == NULL) {
44 *hotdrop = false;
45 return false;
46 }
47
48 if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
49 if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
50 if (th->ece == 1)
51 return false;
52 } else {
53 if (th->ece == 0)
54 return false;
55 }
56 }
57
58 if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
59 if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
60 if (th->cwr == 1)
61 return false;
62 } else {
63 if (th->cwr == 0)
64 return false;
65 }
66 }
67
68 return true;
69}
70
71static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par)
72{
73 const struct ipt_ecn_info *info = par->matchinfo;
74
75 if (info->operation & IPT_ECN_OP_MATCH_IP)
76 if (!match_ip(skb, info))
77 return false;
78
79 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
80 if (!match_tcp(skb, info, &par->hotdrop))
81 return false;
82 }
83
84 return true;
85}
86
87static int ecn_mt_check(const struct xt_mtchk_param *par)
88{
89 const struct ipt_ecn_info *info = par->matchinfo;
90 const struct ipt_ip *ip = par->entryinfo;
91
92 if (info->operation & IPT_ECN_OP_MATCH_MASK)
93 return -EINVAL;
94
95 if (info->invert & IPT_ECN_OP_MATCH_MASK)
96 return -EINVAL;
97
98 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
99 (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) {
100 pr_info("cannot match TCP bits in rule for non-tcp packets\n");
101 return -EINVAL;
102 }
103
104 return 0;
105}
106
107static struct xt_match ecn_mt_reg __read_mostly = {
108 .name = "ecn",
109 .family = NFPROTO_IPV4,
110 .match = ecn_mt,
111 .matchsize = sizeof(struct ipt_ecn_info),
112 .checkentry = ecn_mt_check,
113 .me = THIS_MODULE,
114};
115
116static int __init ecn_mt_init(void)
117{
118 return xt_register_match(&ecn_mt_reg);
119}
120
121static void __exit ecn_mt_exit(void)
122{
123 xt_unregister_match(&ecn_mt_reg);
124}
125
126module_init(ecn_mt_init);
127module_exit(ecn_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
new file mode 100644
index 000000000000..31371be8174b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -0,0 +1,141 @@
1/*
2 * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netdevice.h>
14#include <linux/ip.h>
15#include <net/ip.h>
16#include <net/ip_fib.h>
17#include <net/route.h>
18
19#include <linux/netfilter/xt_rpfilter.h>
20#include <linux/netfilter/x_tables.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
24MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
25
26/* don't try to find route from mcast/bcast/zeronet */
27static __be32 rpfilter_get_saddr(__be32 addr)
28{
29 if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
30 ipv4_is_zeronet(addr))
31 return 0;
32 return addr;
33}
34
35static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
36 const struct net_device *dev, u8 flags)
37{
38 struct fib_result res;
39 bool dev_match;
40 struct net *net = dev_net(dev);
41 int ret __maybe_unused;
42
43 if (fib_lookup(net, fl4, &res))
44 return false;
45
46 if (res.type != RTN_UNICAST) {
47 if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
48 return false;
49 }
50 dev_match = false;
51#ifdef CONFIG_IP_ROUTE_MULTIPATH
52 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
53 struct fib_nh *nh = &res.fi->fib_nh[ret];
54
55 if (nh->nh_dev == dev) {
56 dev_match = true;
57 break;
58 }
59 }
60#else
61 if (FIB_RES_DEV(res) == dev)
62 dev_match = true;
63#endif
64 if (dev_match || flags & XT_RPFILTER_LOOSE)
65 return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
66 return dev_match;
67}
68
69static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
70{
71 const struct xt_rpfilter_info *info;
72 const struct iphdr *iph;
73 struct flowi4 flow;
74 bool invert;
75
76 info = par->matchinfo;
77 invert = info->flags & XT_RPFILTER_INVERT;
78
79 if (par->in->flags & IFF_LOOPBACK)
80 return true ^ invert;
81
82 iph = ip_hdr(skb);
83 if (ipv4_is_multicast(iph->daddr)) {
84 if (ipv4_is_zeronet(iph->saddr))
85 return ipv4_is_local_multicast(iph->daddr) ^ invert;
86 flow.flowi4_iif = 0;
87 } else {
88 flow.flowi4_iif = dev_net(par->in)->loopback_dev->ifindex;
89 }
90
91 flow.daddr = iph->saddr;
92 flow.saddr = rpfilter_get_saddr(iph->daddr);
93 flow.flowi4_oif = 0;
94 flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
95 flow.flowi4_tos = RT_TOS(iph->tos);
96 flow.flowi4_scope = RT_SCOPE_UNIVERSE;
97
98 return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
99}
100
101static int rpfilter_check(const struct xt_mtchk_param *par)
102{
103 const struct xt_rpfilter_info *info = par->matchinfo;
104 unsigned int options = ~XT_RPFILTER_OPTION_MASK;
105 if (info->flags & options) {
106 pr_info("unknown options encountered");
107 return -EINVAL;
108 }
109
110 if (strcmp(par->table, "mangle") != 0 &&
111 strcmp(par->table, "raw") != 0) {
112 pr_info("match only valid in the \'raw\' "
113 "or \'mangle\' tables, not \'%s\'.\n", par->table);
114 return -EINVAL;
115 }
116
117 return 0;
118}
119
120static struct xt_match rpfilter_mt_reg __read_mostly = {
121 .name = "rpfilter",
122 .family = NFPROTO_IPV4,
123 .checkentry = rpfilter_check,
124 .match = rpfilter_mt,
125 .matchsize = sizeof(struct xt_rpfilter_info),
126 .hooks = (1 << NF_INET_PRE_ROUTING),
127 .me = THIS_MODULE
128};
129
130static int __init rpfilter_mt_init(void)
131{
132 return xt_register_match(&rpfilter_mt_reg);
133}
134
135static void __exit rpfilter_mt_exit(void)
136{
137 xt_unregister_match(&rpfilter_mt_reg);
138}
139
140module_init(rpfilter_mt_init);
141module_exit(rpfilter_mt_exit);
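In practice the match is loaded from the raw or mangle table, per the check above; a typical strict-mode rule would be along the lines of "iptables -t raw -A PREROUTING -m rpfilter --invert -j DROP", discarding packets that would not be routed back out the interface they arrived on.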
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index c37641e819f2..0e58f09e59fb 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -52,7 +52,7 @@ iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
52static struct nf_hook_ops *filter_ops __read_mostly; 52static struct nf_hook_ops *filter_ops __read_mostly;
53 53
54/* Default to forward because I got too much mail already. */ 54/* Default to forward because I got too much mail already. */
55static int forward = NF_ACCEPT; 55static bool forward = NF_ACCEPT;
56module_param(forward, bool, 0000); 56module_param(forward, bool, 0000);
57 57
58static int __net_init iptable_filter_net_init(struct net *net) 58static int __net_init iptable_filter_net_init(struct net *net)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 447bc5cfdc6c..a708933dc230 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -30,7 +30,6 @@
30#include <net/netfilter/nf_nat_helper.h> 30#include <net/netfilter/nf_nat_helper.h>
31#include <net/netfilter/nf_conntrack_helper.h> 31#include <net/netfilter/nf_conntrack_helper.h>
32#include <net/netfilter/nf_conntrack_l3proto.h> 32#include <net/netfilter/nf_conntrack_l3proto.h>
33#include <net/netfilter/nf_conntrack_l4proto.h>
34#include <net/netfilter/nf_conntrack_zones.h> 33#include <net/netfilter/nf_conntrack_zones.h>
35 34
36static DEFINE_SPINLOCK(nf_nat_lock); 35static DEFINE_SPINLOCK(nf_nat_lock);
@@ -57,7 +56,7 @@ hash_by_src(const struct net *net, u16 zone,
57 /* Original src, to ensure we map it consistently if poss. */ 56 /* Original src, to ensure we map it consistently if poss. */
58 hash = jhash_3words((__force u32)tuple->src.u3.ip, 57 hash = jhash_3words((__force u32)tuple->src.u3.ip,
59 (__force u32)tuple->src.u.all ^ zone, 58 (__force u32)tuple->src.u.all ^ zone,
60 tuple->dst.protonum, 0); 59 tuple->dst.protonum, nf_conntrack_hash_rnd);
61 return ((u64)hash * net->ipv4.nat_htable_size) >> 32; 60 return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
62} 61}
63 62
@@ -82,14 +81,14 @@ EXPORT_SYMBOL(nf_nat_used_tuple);
82 * that meet the constraints of range. */ 81 * that meet the constraints of range. */
83static int 82static int
84in_range(const struct nf_conntrack_tuple *tuple, 83in_range(const struct nf_conntrack_tuple *tuple,
85 const struct nf_nat_range *range) 84 const struct nf_nat_ipv4_range *range)
86{ 85{
87 const struct nf_nat_protocol *proto; 86 const struct nf_nat_protocol *proto;
88 int ret = 0; 87 int ret = 0;
89 88
90 /* If we are supposed to map IPs, then we must be in the 89 /* If we are supposed to map IPs, then we must be in the
91 range specified, otherwise let this drag us onto a new src IP. */ 90 range specified, otherwise let this drag us onto a new src IP. */
92 if (range->flags & IP_NAT_RANGE_MAP_IPS) { 91 if (range->flags & NF_NAT_RANGE_MAP_IPS) {
93 if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) || 92 if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
94 ntohl(tuple->src.u3.ip) > ntohl(range->max_ip)) 93 ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
95 return 0; 94 return 0;
@@ -97,8 +96,8 @@ in_range(const struct nf_conntrack_tuple *tuple,
97 96
98 rcu_read_lock(); 97 rcu_read_lock();
99 proto = __nf_nat_proto_find(tuple->dst.protonum); 98 proto = __nf_nat_proto_find(tuple->dst.protonum);
100 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || 99 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
101 proto->in_range(tuple, IP_NAT_MANIP_SRC, 100 proto->in_range(tuple, NF_NAT_MANIP_SRC,
102 &range->min, &range->max)) 101 &range->min, &range->max))
103 ret = 1; 102 ret = 1;
104 rcu_read_unlock(); 103 rcu_read_unlock();
@@ -123,7 +122,7 @@ static int
123find_appropriate_src(struct net *net, u16 zone, 122find_appropriate_src(struct net *net, u16 zone,
124 const struct nf_conntrack_tuple *tuple, 123 const struct nf_conntrack_tuple *tuple,
125 struct nf_conntrack_tuple *result, 124 struct nf_conntrack_tuple *result,
126 const struct nf_nat_range *range) 125 const struct nf_nat_ipv4_range *range)
127{ 126{
128 unsigned int h = hash_by_src(net, zone, tuple); 127 unsigned int h = hash_by_src(net, zone, tuple);
129 const struct nf_conn_nat *nat; 128 const struct nf_conn_nat *nat;
@@ -157,7 +156,7 @@ find_appropriate_src(struct net *net, u16 zone,
157*/ 156*/
158static void 157static void
159find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, 158find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
160 const struct nf_nat_range *range, 159 const struct nf_nat_ipv4_range *range,
161 const struct nf_conn *ct, 160 const struct nf_conn *ct,
162 enum nf_nat_manip_type maniptype) 161 enum nf_nat_manip_type maniptype)
163{ 162{
@@ -166,10 +165,10 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
166 u_int32_t minip, maxip, j; 165 u_int32_t minip, maxip, j;
167 166
168 /* No IP mapping? Do nothing. */ 167 /* No IP mapping? Do nothing. */
169 if (!(range->flags & IP_NAT_RANGE_MAP_IPS)) 168 if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
170 return; 169 return;
171 170
172 if (maniptype == IP_NAT_MANIP_SRC) 171 if (maniptype == NF_NAT_MANIP_SRC)
173 var_ipp = &tuple->src.u3.ip; 172 var_ipp = &tuple->src.u3.ip;
174 else 173 else
175 var_ipp = &tuple->dst.u3.ip; 174 var_ipp = &tuple->dst.u3.ip;
@@ -189,7 +188,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
189 minip = ntohl(range->min_ip); 188 minip = ntohl(range->min_ip);
190 maxip = ntohl(range->max_ip); 189 maxip = ntohl(range->max_ip);
191 j = jhash_2words((__force u32)tuple->src.u3.ip, 190 j = jhash_2words((__force u32)tuple->src.u3.ip,
192 range->flags & IP_NAT_RANGE_PERSISTENT ? 191 range->flags & NF_NAT_RANGE_PERSISTENT ?
193 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0); 192 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
194 j = ((u64)j * (maxip - minip + 1)) >> 32; 193 j = ((u64)j * (maxip - minip + 1)) >> 32;
195 *var_ipp = htonl(minip + j); 194 *var_ipp = htonl(minip + j);
@@ -204,7 +203,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
204static void 203static void
205get_unique_tuple(struct nf_conntrack_tuple *tuple, 204get_unique_tuple(struct nf_conntrack_tuple *tuple,
206 const struct nf_conntrack_tuple *orig_tuple, 205 const struct nf_conntrack_tuple *orig_tuple,
207 const struct nf_nat_range *range, 206 const struct nf_nat_ipv4_range *range,
208 struct nf_conn *ct, 207 struct nf_conn *ct,
209 enum nf_nat_manip_type maniptype) 208 enum nf_nat_manip_type maniptype)
210{ 209{
@@ -219,8 +218,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
219 This is only required for source (ie. NAT/masq) mappings. 218 This is only required for source (ie. NAT/masq) mappings.
220 So far, we don't do local source mappings, so multiple 219 So far, we don't do local source mappings, so multiple
221 manips not an issue. */ 220 manips not an issue. */
222 if (maniptype == IP_NAT_MANIP_SRC && 221 if (maniptype == NF_NAT_MANIP_SRC &&
223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 222 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
224 /* try the original tuple first */ 223 /* try the original tuple first */
225 if (in_range(orig_tuple, range)) { 224 if (in_range(orig_tuple, range)) {
226 if (!nf_nat_used_tuple(orig_tuple, ct)) { 225 if (!nf_nat_used_tuple(orig_tuple, ct)) {
@@ -247,8 +246,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
247 proto = __nf_nat_proto_find(orig_tuple->dst.protonum); 246 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
248 247
249 /* Only bother mapping if it's not already in range and unique */ 248 /* Only bother mapping if it's not already in range and unique */
250 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 249 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
251 if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) { 250 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
252 if (proto->in_range(tuple, maniptype, &range->min, 251 if (proto->in_range(tuple, maniptype, &range->min,
253 &range->max) && 252 &range->max) &&
254 (range->min.all == range->max.all || 253 (range->min.all == range->max.all ||
@@ -267,7 +266,7 @@ out:
267 266
268unsigned int 267unsigned int
269nf_nat_setup_info(struct nf_conn *ct, 268nf_nat_setup_info(struct nf_conn *ct,
270 const struct nf_nat_range *range, 269 const struct nf_nat_ipv4_range *range,
271 enum nf_nat_manip_type maniptype) 270 enum nf_nat_manip_type maniptype)
272{ 271{
273 struct net *net = nf_ct_net(ct); 272 struct net *net = nf_ct_net(ct);
@@ -284,8 +283,8 @@ nf_nat_setup_info(struct nf_conn *ct,
284 } 283 }
285 } 284 }
286 285
287 NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC || 286 NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
288 maniptype == IP_NAT_MANIP_DST); 287 maniptype == NF_NAT_MANIP_DST);
289 BUG_ON(nf_nat_initialized(ct, maniptype)); 288 BUG_ON(nf_nat_initialized(ct, maniptype));
290 289
291 /* What we've got will look like inverse of reply. Normally 290 /* What we've got will look like inverse of reply. Normally
@@ -306,19 +305,19 @@ nf_nat_setup_info(struct nf_conn *ct,
306 nf_conntrack_alter_reply(ct, &reply); 305 nf_conntrack_alter_reply(ct, &reply);
307 306
308 /* Non-atomic: we own this at the moment. */ 307 /* Non-atomic: we own this at the moment. */
309 if (maniptype == IP_NAT_MANIP_SRC) 308 if (maniptype == NF_NAT_MANIP_SRC)
310 ct->status |= IPS_SRC_NAT; 309 ct->status |= IPS_SRC_NAT;
311 else 310 else
312 ct->status |= IPS_DST_NAT; 311 ct->status |= IPS_DST_NAT;
313 } 312 }
314 313
315 if (maniptype == IP_NAT_MANIP_SRC) { 314 if (maniptype == NF_NAT_MANIP_SRC) {
316 unsigned int srchash; 315 unsigned int srchash;
317 316
318 srchash = hash_by_src(net, nf_ct_zone(ct), 317 srchash = hash_by_src(net, nf_ct_zone(ct),
319 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 318 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
320 spin_lock_bh(&nf_nat_lock); 319 spin_lock_bh(&nf_nat_lock);
321 /* nf_conntrack_alter_reply might re-allocate exntension aera */ 320 /* nf_conntrack_alter_reply might re-allocate extension area */
322 nat = nfct_nat(ct); 321 nat = nfct_nat(ct);
323 nat->ct = ct; 322 nat->ct = ct;
324 hlist_add_head_rcu(&nat->bysource, 323 hlist_add_head_rcu(&nat->bysource,
@@ -327,7 +326,7 @@ nf_nat_setup_info(struct nf_conn *ct,
327 } 326 }
328 327
329 /* It's done. */ 328 /* It's done. */
330 if (maniptype == IP_NAT_MANIP_DST) 329 if (maniptype == NF_NAT_MANIP_DST)
331 ct->status |= IPS_DST_NAT_DONE; 330 ct->status |= IPS_DST_NAT_DONE;
332 else 331 else
333 ct->status |= IPS_SRC_NAT_DONE; 332 ct->status |= IPS_SRC_NAT_DONE;
@@ -361,7 +360,7 @@ manip_pkt(u_int16_t proto,
361 360
362 iph = (void *)skb->data + iphdroff; 361 iph = (void *)skb->data + iphdroff;
363 362
364 if (maniptype == IP_NAT_MANIP_SRC) { 363 if (maniptype == NF_NAT_MANIP_SRC) {
365 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); 364 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
366 iph->saddr = target->src.u3.ip; 365 iph->saddr = target->src.u3.ip;
367 } else { 366 } else {
@@ -381,7 +380,7 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
381 unsigned long statusbit; 380 unsigned long statusbit;
382 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); 381 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
383 382
384 if (mtype == IP_NAT_MANIP_SRC) 383 if (mtype == NF_NAT_MANIP_SRC)
385 statusbit = IPS_SRC_NAT; 384 statusbit = IPS_SRC_NAT;
386 else 385 else
387 statusbit = IPS_DST_NAT; 386 statusbit = IPS_DST_NAT;
@@ -414,8 +413,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
414 struct icmphdr icmp; 413 struct icmphdr icmp;
415 struct iphdr ip; 414 struct iphdr ip;
416 } *inside; 415 } *inside;
417 const struct nf_conntrack_l4proto *l4proto; 416 struct nf_conntrack_tuple target;
418 struct nf_conntrack_tuple inner, target;
419 int hdrlen = ip_hdrlen(skb); 417 int hdrlen = ip_hdrlen(skb);
420 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 418 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
421 unsigned long statusbit; 419 unsigned long statusbit;
@@ -447,7 +445,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
447 return 0; 445 return 0;
448 } 446 }
449 447
450 if (manip == IP_NAT_MANIP_SRC) 448 if (manip == NF_NAT_MANIP_SRC)
451 statusbit = IPS_SRC_NAT; 449 statusbit = IPS_SRC_NAT;
452 else 450 else
453 statusbit = IPS_DST_NAT; 451 statusbit = IPS_DST_NAT;
@@ -463,16 +461,6 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
463 "dir %s\n", skb, manip, 461 "dir %s\n", skb, manip,
464 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); 462 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
465 463
466 /* rcu_read_lock()ed by nf_hook_slow */
467 l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
468
469 if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
470 (hdrlen +
471 sizeof(struct icmphdr) + inside->ip.ihl * 4),
472 (u_int16_t)AF_INET, inside->ip.protocol,
473 &inner, l3proto, l4proto))
474 return 0;
475
476 /* Change inner back to look like incoming packet. We do the 464 /* Change inner back to look like incoming packet. We do the
477 opposite manip on this hook to normal, because it might not 465 opposite manip on this hook to normal, because it might not
478 pass all hooks (locally-generated ICMP). Consider incoming 466 pass all hooks (locally-generated ICMP). Consider incoming
@@ -575,26 +563,6 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
575#include <linux/netfilter/nfnetlink.h> 563#include <linux/netfilter/nfnetlink.h>
576#include <linux/netfilter/nfnetlink_conntrack.h> 564#include <linux/netfilter/nfnetlink_conntrack.h>
577 565
578static const struct nf_nat_protocol *
579nf_nat_proto_find_get(u_int8_t protonum)
580{
581 const struct nf_nat_protocol *p;
582
583 rcu_read_lock();
584 p = __nf_nat_proto_find(protonum);
585 if (!try_module_get(p->me))
586 p = &nf_nat_unknown_protocol;
587 rcu_read_unlock();
588
589 return p;
590}
591
592static void
593nf_nat_proto_put(const struct nf_nat_protocol *p)
594{
595 module_put(p->me);
596}
597
598static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { 566static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
599 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, 567 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
600 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, 568 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
@@ -602,7 +570,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
602 570
603static int nfnetlink_parse_nat_proto(struct nlattr *attr, 571static int nfnetlink_parse_nat_proto(struct nlattr *attr,
604 const struct nf_conn *ct, 572 const struct nf_conn *ct,
605 struct nf_nat_range *range) 573 struct nf_nat_ipv4_range *range)
606{ 574{
607 struct nlattr *tb[CTA_PROTONAT_MAX+1]; 575 struct nlattr *tb[CTA_PROTONAT_MAX+1];
608 const struct nf_nat_protocol *npt; 576 const struct nf_nat_protocol *npt;
@@ -612,21 +580,23 @@ static int nfnetlink_parse_nat_proto(struct nlattr *attr,
612 if (err < 0) 580 if (err < 0)
613 return err; 581 return err;
614 582
615 npt = nf_nat_proto_find_get(nf_ct_protonum(ct)); 583 rcu_read_lock();
584 npt = __nf_nat_proto_find(nf_ct_protonum(ct));
616 if (npt->nlattr_to_range) 585 if (npt->nlattr_to_range)
617 err = npt->nlattr_to_range(tb, range); 586 err = npt->nlattr_to_range(tb, range);
618 nf_nat_proto_put(npt); 587 rcu_read_unlock();
619 return err; 588 return err;
620} 589}
621 590
622static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { 591static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
623 [CTA_NAT_MINIP] = { .type = NLA_U32 }, 592 [CTA_NAT_MINIP] = { .type = NLA_U32 },
624 [CTA_NAT_MAXIP] = { .type = NLA_U32 }, 593 [CTA_NAT_MAXIP] = { .type = NLA_U32 },
594 [CTA_NAT_PROTO] = { .type = NLA_NESTED },
625}; 595};
626 596
627static int 597static int
628nfnetlink_parse_nat(const struct nlattr *nat, 598nfnetlink_parse_nat(const struct nlattr *nat,
629 const struct nf_conn *ct, struct nf_nat_range *range) 599 const struct nf_conn *ct, struct nf_nat_ipv4_range *range)
630{ 600{
631 struct nlattr *tb[CTA_NAT_MAX+1]; 601 struct nlattr *tb[CTA_NAT_MAX+1];
632 int err; 602 int err;
@@ -646,7 +616,7 @@ nfnetlink_parse_nat(const struct nlattr *nat,
646 range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]); 616 range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
647 617
648 if (range->min_ip) 618 if (range->min_ip)
649 range->flags |= IP_NAT_RANGE_MAP_IPS; 619 range->flags |= NF_NAT_RANGE_MAP_IPS;
650 620
651 if (!tb[CTA_NAT_PROTO]) 621 if (!tb[CTA_NAT_PROTO])
652 return 0; 622 return 0;
@@ -663,7 +633,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
663 enum nf_nat_manip_type manip, 633 enum nf_nat_manip_type manip,
664 const struct nlattr *attr) 634 const struct nlattr *attr)
665{ 635{
666 struct nf_nat_range range; 636 struct nf_nat_ipv4_range range;
667 637
668 if (nfnetlink_parse_nat(attr, ct, &range) < 0) 638 if (nfnetlink_parse_nat(attr, ct, &range) < 0)
669 return -EINVAL; 639 return -EINVAL;
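
The nf_nat_core.c hunks above do three related things: hash_by_src() now folds the boot-time random seed nf_conntrack_hash_rnd into the jhash instead of a constant 0; the IPv4-only NAT types and flags get explicit names (struct nf_nat_ipv4_range, NF_NAT_RANGE_*, NF_NAT_MANIP_*); and the netlink parse path drops the nf_nat_proto_find_get()/nf_nat_proto_put() module pinning in favour of a plain rcu_read_lock() section, which is enough once the protocol descriptors lose their .me owner. A minimal sketch of the seeded-hash idiom, for illustration only -- not the kernel function itself:

	/* A per-boot random seed makes bucket placement unpredictable to
	 * remote senders; the multiply-shift then maps the 32-bit hash
	 * onto [0, size) without a division. */
	static inline unsigned int bysource_bucket(__be32 saddr, __be16 sport,
						   u16 zone, u8 protonum,
						   u32 seed, unsigned int size)
	{
		u32 h = jhash_3words((__force u32)saddr,
				     (__force u32)sport ^ zone, protonum, seed);
		return ((u64)h * size) >> 32;
	}
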
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index b9a1136addbd..dc1dd912baf4 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -398,7 +398,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
398static void ip_nat_q931_expect(struct nf_conn *new, 398static void ip_nat_q931_expect(struct nf_conn *new,
399 struct nf_conntrack_expect *this) 399 struct nf_conntrack_expect *this)
400{ 400{
401 struct nf_nat_range range; 401 struct nf_nat_ipv4_range range;
402 402
403 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ 403 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
404 nf_nat_follow_master(new, this); 404 nf_nat_follow_master(new, this);
@@ -409,16 +409,16 @@ static void ip_nat_q931_expect(struct nf_conn *new,
409 BUG_ON(new->status & IPS_NAT_DONE_MASK); 409 BUG_ON(new->status & IPS_NAT_DONE_MASK);
410 410
411 /* Change src to where master sends to */ 411 /* Change src to where master sends to */
412 range.flags = IP_NAT_RANGE_MAP_IPS; 412 range.flags = NF_NAT_RANGE_MAP_IPS;
413 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; 413 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
414 nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); 414 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
415 415
416 /* For DST manip, map port here to where it's expected. */ 416 /* For DST manip, map port here to where it's expected. */
417 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); 417 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
418 range.min = range.max = this->saved_proto; 418 range.min = range.max = this->saved_proto;
419 range.min_ip = range.max_ip = 419 range.min_ip = range.max_ip =
420 new->master->tuplehash[!this->dir].tuple.src.u3.ip; 420 new->master->tuplehash[!this->dir].tuple.src.u3.ip;
421 nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); 421 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
422} 422}
423 423
424/****************************************************************************/ 424/****************************************************************************/
@@ -496,21 +496,21 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
496static void ip_nat_callforwarding_expect(struct nf_conn *new, 496static void ip_nat_callforwarding_expect(struct nf_conn *new,
497 struct nf_conntrack_expect *this) 497 struct nf_conntrack_expect *this)
498{ 498{
499 struct nf_nat_range range; 499 struct nf_nat_ipv4_range range;
500 500
501 /* This must be a fresh one. */ 501 /* This must be a fresh one. */
502 BUG_ON(new->status & IPS_NAT_DONE_MASK); 502 BUG_ON(new->status & IPS_NAT_DONE_MASK);
503 503
504 /* Change src to where master sends to */ 504 /* Change src to where master sends to */
505 range.flags = IP_NAT_RANGE_MAP_IPS; 505 range.flags = NF_NAT_RANGE_MAP_IPS;
506 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; 506 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
507 nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); 507 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
508 508
509 /* For DST manip, map port here to where it's expected. */ 509 /* For DST manip, map port here to where it's expected. */
510 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); 510 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
511 range.min = range.max = this->saved_proto; 511 range.min = range.max = this->saved_proto;
512 range.min_ip = range.max_ip = this->saved_ip; 512 range.min_ip = range.max_ip = this->saved_ip;
513 nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); 513 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
514} 514}
515 515
516/****************************************************************************/ 516/****************************************************************************/
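
The two H.323 expectation callbacks above, like nf_nat_follow_master() and the PPTP and SIP callbacks in later hunks, spell out the same two-step pattern with the renamed identifiers: an SNAT binding that only maps the address, then a DNAT binding that also pins the layer-4 port the helper saved. A condensed sketch of that pattern (assuming exp->saved_proto was recorded by the conntrack helper):

	struct nf_nat_ipv4_range range;

	/* SNAT: rewrite the source to the address the master maps to. */
	range.flags  = NF_NAT_RANGE_MAP_IPS;
	range.min_ip = range.max_ip =
		ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);

	/* DNAT: additionally force the port back to the saved value. */
	range.flags  = NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED;
	range.min    = range.max = exp->saved_proto;
	range.min_ip = range.max_ip =
		ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
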
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index ebc5f8894f99..af65958f6308 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -253,12 +253,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
253 struct udphdr *udph; 253 struct udphdr *udph;
254 int datalen, oldlen; 254 int datalen, oldlen;
255 255
256 /* UDP helpers might accidentally mangle the wrong packet */
257 iph = ip_hdr(skb);
258 if (skb->len < iph->ihl*4 + sizeof(*udph) +
259 match_offset + match_len)
260 return 0;
261
262 if (!skb_make_writable(skb, skb->len)) 256 if (!skb_make_writable(skb, skb->len))
263 return 0; 257 return 0;
264 258
@@ -430,22 +424,22 @@ nf_nat_seq_adjust(struct sk_buff *skb,
430void nf_nat_follow_master(struct nf_conn *ct, 424void nf_nat_follow_master(struct nf_conn *ct,
431 struct nf_conntrack_expect *exp) 425 struct nf_conntrack_expect *exp)
432{ 426{
433 struct nf_nat_range range; 427 struct nf_nat_ipv4_range range;
434 428
435 /* This must be a fresh one. */ 429 /* This must be a fresh one. */
436 BUG_ON(ct->status & IPS_NAT_DONE_MASK); 430 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
437 431
438 /* Change src to where master sends to */ 432 /* Change src to where master sends to */
439 range.flags = IP_NAT_RANGE_MAP_IPS; 433 range.flags = NF_NAT_RANGE_MAP_IPS;
440 range.min_ip = range.max_ip 434 range.min_ip = range.max_ip
441 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; 435 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
442 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); 436 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
443 437
444 /* For DST manip, map port here to where it's expected. */ 438 /* For DST manip, map port here to where it's expected. */
445 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); 439 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
446 range.min = range.max = exp->saved_proto; 440 range.min = range.max = exp->saved_proto;
447 range.min_ip = range.max_ip 441 range.min_ip = range.max_ip
448 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; 442 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
449 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); 443 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
450} 444}
451EXPORT_SYMBOL(nf_nat_follow_master); 445EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 3e8284ba46b8..c273d58980ae 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -47,7 +47,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
47 struct nf_conntrack_tuple t; 47 struct nf_conntrack_tuple t;
48 const struct nf_ct_pptp_master *ct_pptp_info; 48 const struct nf_ct_pptp_master *ct_pptp_info;
49 const struct nf_nat_pptp *nat_pptp_info; 49 const struct nf_nat_pptp *nat_pptp_info;
50 struct nf_nat_range range; 50 struct nf_nat_ipv4_range range;
51 51
52 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; 52 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info;
53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
@@ -88,24 +88,24 @@ static void pptp_nat_expected(struct nf_conn *ct,
88 BUG_ON(ct->status & IPS_NAT_DONE_MASK); 88 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
89 89
90 /* Change src to where master sends to */ 90 /* Change src to where master sends to */
91 range.flags = IP_NAT_RANGE_MAP_IPS; 91 range.flags = NF_NAT_RANGE_MAP_IPS;
92 range.min_ip = range.max_ip 92 range.min_ip = range.max_ip
93 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; 93 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
94 if (exp->dir == IP_CT_DIR_ORIGINAL) { 94 if (exp->dir == IP_CT_DIR_ORIGINAL) {
95 range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; 95 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
96 range.min = range.max = exp->saved_proto; 96 range.min = range.max = exp->saved_proto;
97 } 97 }
98 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); 98 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
99 99
100 /* For DST manip, map port here to where it's expected. */ 100 /* For DST manip, map port here to where it's expected. */
101 range.flags = IP_NAT_RANGE_MAP_IPS; 101 range.flags = NF_NAT_RANGE_MAP_IPS;
102 range.min_ip = range.max_ip 102 range.min_ip = range.max_ip
103 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; 103 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
104 if (exp->dir == IP_CT_DIR_REPLY) { 104 if (exp->dir == IP_CT_DIR_REPLY) {
105 range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; 105 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
106 range.min = range.max = exp->saved_proto; 106 range.min = range.max = exp->saved_proto;
107 } 107 }
108 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); 108 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
109} 109}
110 110
111/* outbound packets == from PNS to PAC */ 111/* outbound packets == from PNS to PAC */
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index a3d997618602..9993bc93e102 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -26,7 +26,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
26{ 26{
27 __be16 port; 27 __be16 port;
28 28
29 if (maniptype == IP_NAT_MANIP_SRC) 29 if (maniptype == NF_NAT_MANIP_SRC)
30 port = tuple->src.u.all; 30 port = tuple->src.u.all;
31 else 31 else
32 port = tuple->dst.u.all; 32 port = tuple->dst.u.all;
@@ -37,7 +37,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
37EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); 37EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
38 38
39void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, 39void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
40 const struct nf_nat_range *range, 40 const struct nf_nat_ipv4_range *range,
41 enum nf_nat_manip_type maniptype, 41 enum nf_nat_manip_type maniptype,
42 const struct nf_conn *ct, 42 const struct nf_conn *ct,
43 u_int16_t *rover) 43 u_int16_t *rover)
@@ -46,15 +46,15 @@ void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
46 __be16 *portptr; 46 __be16 *portptr;
47 u_int16_t off; 47 u_int16_t off;
48 48
49 if (maniptype == IP_NAT_MANIP_SRC) 49 if (maniptype == NF_NAT_MANIP_SRC)
50 portptr = &tuple->src.u.all; 50 portptr = &tuple->src.u.all;
51 else 51 else
52 portptr = &tuple->dst.u.all; 52 portptr = &tuple->dst.u.all;
53 53
54 /* If no range specified... */ 54 /* If no range specified... */
55 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { 55 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
56 /* If it's dst rewrite, can't change port */ 56 /* If it's dst rewrite, can't change port */
57 if (maniptype == IP_NAT_MANIP_DST) 57 if (maniptype == NF_NAT_MANIP_DST)
58 return; 58 return;
59 59
60 if (ntohs(*portptr) < 1024) { 60 if (ntohs(*portptr) < 1024) {
@@ -75,9 +75,9 @@ void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
75 range_size = ntohs(range->max.all) - min + 1; 75 range_size = ntohs(range->max.all) - min + 1;
76 } 76 }
77 77
78 if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) 78 if (range->flags & NF_NAT_RANGE_PROTO_RANDOM)
79 off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip, 79 off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip,
80 maniptype == IP_NAT_MANIP_SRC 80 maniptype == NF_NAT_MANIP_SRC
81 ? tuple->dst.u.all 81 ? tuple->dst.u.all
82 : tuple->src.u.all); 82 : tuple->src.u.all);
83 else 83 else
@@ -87,7 +87,7 @@ void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
87 *portptr = htons(min + off % range_size); 87 *portptr = htons(min + off % range_size);
88 if (++i != range_size && nf_nat_used_tuple(tuple, ct)) 88 if (++i != range_size && nf_nat_used_tuple(tuple, ct))
89 continue; 89 continue;
90 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) 90 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM))
91 *rover = off; 91 *rover = off;
92 return; 92 return;
93 } 93 }
@@ -96,31 +96,19 @@ void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
96EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); 96EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
97 97
98#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 98#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
99int nf_nat_proto_range_to_nlattr(struct sk_buff *skb,
100 const struct nf_nat_range *range)
101{
102 NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.all);
103 NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.all);
104 return 0;
105
106nla_put_failure:
107 return -1;
108}
109EXPORT_SYMBOL_GPL(nf_nat_proto_range_to_nlattr);
110
111int nf_nat_proto_nlattr_to_range(struct nlattr *tb[], 99int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
112 struct nf_nat_range *range) 100 struct nf_nat_ipv4_range *range)
113{ 101{
114 if (tb[CTA_PROTONAT_PORT_MIN]) { 102 if (tb[CTA_PROTONAT_PORT_MIN]) {
115 range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); 103 range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
116 range->max.all = range->min.tcp.port; 104 range->max.all = range->min.tcp.port;
117 range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; 105 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
118 } 106 }
119 if (tb[CTA_PROTONAT_PORT_MAX]) { 107 if (tb[CTA_PROTONAT_PORT_MAX]) {
120 range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); 108 range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
121 range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; 109 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
122 } 110 }
123 return 0; 111 return 0;
124} 112}
125EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range); 113EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
126#endif 114#endif
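
With the dump callback gone, nf_nat_proto_common.c keeps only the attribute parsing and the shared port search. That search, which the hunks above retouch, tries up to range_size candidate ports starting from either the per-protocol rover or a flow-keyed random offset, and accepts the final candidate even if it is already taken. Condensed, with names as in the file:

	off = (range->flags & NF_NAT_RANGE_PROTO_RANDOM)
		? secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip,
					     maniptype == NF_NAT_MANIP_SRC
					     ? tuple->dst.u.all
					     : tuple->src.u.all)
		: *rover;
	for (i = 0; ; ++off) {
		*portptr = htons(min + off % range_size);
		/* Keep probing while candidates remain and this one is busy;
		 * the last candidate is used unconditionally. */
		if (++i != range_size && nf_nat_used_tuple(tuple, ct))
			continue;
		if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM))
			*rover = off;	/* resume here on the next call */
		return;
	}
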
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
index 570faf2667b2..3f67138d187c 100644
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -24,7 +24,7 @@ static u_int16_t dccp_port_rover;
24 24
25static void 25static void
26dccp_unique_tuple(struct nf_conntrack_tuple *tuple, 26dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
27 const struct nf_nat_range *range, 27 const struct nf_nat_ipv4_range *range,
28 enum nf_nat_manip_type maniptype, 28 enum nf_nat_manip_type maniptype,
29 const struct nf_conn *ct) 29 const struct nf_conn *ct)
30{ 30{
@@ -54,7 +54,7 @@ dccp_manip_pkt(struct sk_buff *skb,
54 iph = (struct iphdr *)(skb->data + iphdroff); 54 iph = (struct iphdr *)(skb->data + iphdroff);
55 hdr = (struct dccp_hdr *)(skb->data + hdroff); 55 hdr = (struct dccp_hdr *)(skb->data + hdroff);
56 56
57 if (maniptype == IP_NAT_MANIP_SRC) { 57 if (maniptype == NF_NAT_MANIP_SRC) {
58 oldip = iph->saddr; 58 oldip = iph->saddr;
59 newip = tuple->src.u3.ip; 59 newip = tuple->src.u3.ip;
60 newport = tuple->src.u.dccp.port; 60 newport = tuple->src.u.dccp.port;
@@ -80,12 +80,10 @@ dccp_manip_pkt(struct sk_buff *skb,
80 80
81static const struct nf_nat_protocol nf_nat_protocol_dccp = { 81static const struct nf_nat_protocol nf_nat_protocol_dccp = {
82 .protonum = IPPROTO_DCCP, 82 .protonum = IPPROTO_DCCP,
83 .me = THIS_MODULE,
84 .manip_pkt = dccp_manip_pkt, 83 .manip_pkt = dccp_manip_pkt,
85 .in_range = nf_nat_proto_in_range, 84 .in_range = nf_nat_proto_in_range,
86 .unique_tuple = dccp_unique_tuple, 85 .unique_tuple = dccp_unique_tuple,
87#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 86#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
88 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
89 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 87 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
90#endif 88#endif
91}; 89};
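
The same mechanical trim repeats for every layer-4 protocol below (GRE, ICMP, SCTP, TCP, UDP, UDP-Lite): .me disappears because the descriptors are no longer module-refcounted, and .range_to_nlattr disappears with the removed dump helper. The resulting descriptor shape, sketched with a hypothetical IPPROTO_FOO:

	static const struct nf_nat_protocol nf_nat_protocol_foo = {
		.protonum	= IPPROTO_FOO,	/* hypothetical protocol */
		.manip_pkt	= foo_manip_pkt,
		.in_range	= nf_nat_proto_in_range,
		.unique_tuple	= foo_unique_tuple,
	#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
		.nlattr_to_range = nf_nat_proto_nlattr_to_range,
	#endif
	};
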
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index bc8d83a31c73..46ba0b9ab985 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -39,7 +39,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
39/* generate unique tuple ... */ 39/* generate unique tuple ... */
40static void 40static void
41gre_unique_tuple(struct nf_conntrack_tuple *tuple, 41gre_unique_tuple(struct nf_conntrack_tuple *tuple,
42 const struct nf_nat_range *range, 42 const struct nf_nat_ipv4_range *range,
43 enum nf_nat_manip_type maniptype, 43 enum nf_nat_manip_type maniptype,
44 const struct nf_conn *ct) 44 const struct nf_conn *ct)
45{ 45{
@@ -52,12 +52,12 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
52 if (!ct->master) 52 if (!ct->master)
53 return; 53 return;
54 54
55 if (maniptype == IP_NAT_MANIP_SRC) 55 if (maniptype == NF_NAT_MANIP_SRC)
56 keyptr = &tuple->src.u.gre.key; 56 keyptr = &tuple->src.u.gre.key;
57 else 57 else
58 keyptr = &tuple->dst.u.gre.key; 58 keyptr = &tuple->dst.u.gre.key;
59 59
60 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { 60 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
61 pr_debug("%p: NATing GRE PPTP\n", ct); 61 pr_debug("%p: NATing GRE PPTP\n", ct);
62 min = 1; 62 min = 1;
63 range_size = 0xffff; 63 range_size = 0xffff;
@@ -99,7 +99,7 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
99 99
100 /* we only have destination manip of a packet, since 'source key' 100 /* we only have destination manip of a packet, since 'source key'
101 * is not present in the packet itself */ 101 * is not present in the packet itself */
102 if (maniptype != IP_NAT_MANIP_DST) 102 if (maniptype != NF_NAT_MANIP_DST)
103 return true; 103 return true;
104 switch (greh->version) { 104 switch (greh->version) {
105 case GRE_VERSION_1701: 105 case GRE_VERSION_1701:
@@ -119,12 +119,10 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
119 119
120static const struct nf_nat_protocol gre = { 120static const struct nf_nat_protocol gre = {
121 .protonum = IPPROTO_GRE, 121 .protonum = IPPROTO_GRE,
122 .me = THIS_MODULE,
123 .manip_pkt = gre_manip_pkt, 122 .manip_pkt = gre_manip_pkt,
124 .in_range = nf_nat_proto_in_range, 123 .in_range = nf_nat_proto_in_range,
125 .unique_tuple = gre_unique_tuple, 124 .unique_tuple = gre_unique_tuple,
126#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 125#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
127 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
128 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 126 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
129#endif 127#endif
130}; 128};
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 9f4dc1235dc7..b35172851bae 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
30 30
31static void 31static void
32icmp_unique_tuple(struct nf_conntrack_tuple *tuple, 32icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
33 const struct nf_nat_range *range, 33 const struct nf_nat_ipv4_range *range,
34 enum nf_nat_manip_type maniptype, 34 enum nf_nat_manip_type maniptype,
35 const struct nf_conn *ct) 35 const struct nf_conn *ct)
36{ 36{
@@ -40,7 +40,7 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
40 40
41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; 41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
42 /* If no range specified... */ 42 /* If no range specified... */
43 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) 43 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF; 44 range_size = 0xFFFF;
45 45
46 for (i = 0; ; ++id) { 46 for (i = 0; ; ++id) {
@@ -74,12 +74,10 @@ icmp_manip_pkt(struct sk_buff *skb,
74 74
75const struct nf_nat_protocol nf_nat_protocol_icmp = { 75const struct nf_nat_protocol nf_nat_protocol_icmp = {
76 .protonum = IPPROTO_ICMP, 76 .protonum = IPPROTO_ICMP,
77 .me = THIS_MODULE,
78 .manip_pkt = icmp_manip_pkt, 77 .manip_pkt = icmp_manip_pkt,
79 .in_range = icmp_in_range, 78 .in_range = icmp_in_range,
80 .unique_tuple = icmp_unique_tuple, 79 .unique_tuple = icmp_unique_tuple,
81#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
82 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
83 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 81 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
84#endif 82#endif
85}; 83};
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index bd5a80a62a5b..3cce9b6c1c29 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -19,7 +19,7 @@ static u_int16_t nf_sctp_port_rover;
19 19
20static void 20static void
21sctp_unique_tuple(struct nf_conntrack_tuple *tuple, 21sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
22 const struct nf_nat_range *range, 22 const struct nf_nat_ipv4_range *range,
23 enum nf_nat_manip_type maniptype, 23 enum nf_nat_manip_type maniptype,
24 const struct nf_conn *ct) 24 const struct nf_conn *ct)
25{ 25{
@@ -46,7 +46,7 @@ sctp_manip_pkt(struct sk_buff *skb,
46 iph = (struct iphdr *)(skb->data + iphdroff); 46 iph = (struct iphdr *)(skb->data + iphdroff);
47 hdr = (struct sctphdr *)(skb->data + hdroff); 47 hdr = (struct sctphdr *)(skb->data + hdroff);
48 48
49 if (maniptype == IP_NAT_MANIP_SRC) { 49 if (maniptype == NF_NAT_MANIP_SRC) {
50 /* Get rid of src ip and src pt */ 50 /* Get rid of src ip and src pt */
51 oldip = iph->saddr; 51 oldip = iph->saddr;
52 newip = tuple->src.u3.ip; 52 newip = tuple->src.u3.ip;
@@ -70,12 +70,10 @@ sctp_manip_pkt(struct sk_buff *skb,
70 70
71static const struct nf_nat_protocol nf_nat_protocol_sctp = { 71static const struct nf_nat_protocol nf_nat_protocol_sctp = {
72 .protonum = IPPROTO_SCTP, 72 .protonum = IPPROTO_SCTP,
73 .me = THIS_MODULE,
74 .manip_pkt = sctp_manip_pkt, 73 .manip_pkt = sctp_manip_pkt,
75 .in_range = nf_nat_proto_in_range, 74 .in_range = nf_nat_proto_in_range,
76 .unique_tuple = sctp_unique_tuple, 75 .unique_tuple = sctp_unique_tuple,
77#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 76#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
78 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
79 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 77 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
80#endif 78#endif
81}; 79};
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 0d67bb80130f..9fb4b4e72bbf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -23,7 +23,7 @@ static u_int16_t tcp_port_rover;
23 23
24static void 24static void
25tcp_unique_tuple(struct nf_conntrack_tuple *tuple, 25tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
26 const struct nf_nat_range *range, 26 const struct nf_nat_ipv4_range *range,
27 enum nf_nat_manip_type maniptype, 27 enum nf_nat_manip_type maniptype,
28 const struct nf_conn *ct) 28 const struct nf_conn *ct)
29{ 29{
@@ -55,7 +55,7 @@ tcp_manip_pkt(struct sk_buff *skb,
55 iph = (struct iphdr *)(skb->data + iphdroff); 55 iph = (struct iphdr *)(skb->data + iphdroff);
56 hdr = (struct tcphdr *)(skb->data + hdroff); 56 hdr = (struct tcphdr *)(skb->data + hdroff);
57 57
58 if (maniptype == IP_NAT_MANIP_SRC) { 58 if (maniptype == NF_NAT_MANIP_SRC) {
59 /* Get rid of src ip and src pt */ 59 /* Get rid of src ip and src pt */
60 oldip = iph->saddr; 60 oldip = iph->saddr;
61 newip = tuple->src.u3.ip; 61 newip = tuple->src.u3.ip;
@@ -82,12 +82,10 @@ tcp_manip_pkt(struct sk_buff *skb,
82 82
83const struct nf_nat_protocol nf_nat_protocol_tcp = { 83const struct nf_nat_protocol nf_nat_protocol_tcp = {
84 .protonum = IPPROTO_TCP, 84 .protonum = IPPROTO_TCP,
85 .me = THIS_MODULE,
86 .manip_pkt = tcp_manip_pkt, 85 .manip_pkt = tcp_manip_pkt,
87 .in_range = nf_nat_proto_in_range, 86 .in_range = nf_nat_proto_in_range,
88 .unique_tuple = tcp_unique_tuple, 87 .unique_tuple = tcp_unique_tuple,
89#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 88#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
90 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
91 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 89 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
92#endif 90#endif
93}; 91};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index 0b1b8601cba7..9883336e628f 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -22,7 +22,7 @@ static u_int16_t udp_port_rover;
22 22
23static void 23static void
24udp_unique_tuple(struct nf_conntrack_tuple *tuple, 24udp_unique_tuple(struct nf_conntrack_tuple *tuple,
25 const struct nf_nat_range *range, 25 const struct nf_nat_ipv4_range *range,
26 enum nf_nat_manip_type maniptype, 26 enum nf_nat_manip_type maniptype,
27 const struct nf_conn *ct) 27 const struct nf_conn *ct)
28{ 28{
@@ -47,7 +47,7 @@ udp_manip_pkt(struct sk_buff *skb,
47 iph = (struct iphdr *)(skb->data + iphdroff); 47 iph = (struct iphdr *)(skb->data + iphdroff);
48 hdr = (struct udphdr *)(skb->data + hdroff); 48 hdr = (struct udphdr *)(skb->data + hdroff);
49 49
50 if (maniptype == IP_NAT_MANIP_SRC) { 50 if (maniptype == NF_NAT_MANIP_SRC) {
51 /* Get rid of src ip and src pt */ 51 /* Get rid of src ip and src pt */
52 oldip = iph->saddr; 52 oldip = iph->saddr;
53 newip = tuple->src.u3.ip; 53 newip = tuple->src.u3.ip;
@@ -73,12 +73,10 @@ udp_manip_pkt(struct sk_buff *skb,
73 73
74const struct nf_nat_protocol nf_nat_protocol_udp = { 74const struct nf_nat_protocol nf_nat_protocol_udp = {
75 .protonum = IPPROTO_UDP, 75 .protonum = IPPROTO_UDP,
76 .me = THIS_MODULE,
77 .manip_pkt = udp_manip_pkt, 76 .manip_pkt = udp_manip_pkt,
78 .in_range = nf_nat_proto_in_range, 77 .in_range = nf_nat_proto_in_range,
79 .unique_tuple = udp_unique_tuple, 78 .unique_tuple = udp_unique_tuple,
80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 79#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
81 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
82 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 80 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
83#endif 81#endif
84}; 82};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
index f83ef23e2ab7..d24d10a7beb2 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -21,7 +21,7 @@ static u_int16_t udplite_port_rover;
21 21
22static void 22static void
23udplite_unique_tuple(struct nf_conntrack_tuple *tuple, 23udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
24 const struct nf_nat_range *range, 24 const struct nf_nat_ipv4_range *range,
25 enum nf_nat_manip_type maniptype, 25 enum nf_nat_manip_type maniptype,
26 const struct nf_conn *ct) 26 const struct nf_conn *ct)
27{ 27{
@@ -47,7 +47,7 @@ udplite_manip_pkt(struct sk_buff *skb,
47 iph = (struct iphdr *)(skb->data + iphdroff); 47 iph = (struct iphdr *)(skb->data + iphdroff);
48 hdr = (struct udphdr *)(skb->data + hdroff); 48 hdr = (struct udphdr *)(skb->data + hdroff);
49 49
50 if (maniptype == IP_NAT_MANIP_SRC) { 50 if (maniptype == NF_NAT_MANIP_SRC) {
51 /* Get rid of src ip and src pt */ 51 /* Get rid of src ip and src pt */
52 oldip = iph->saddr; 52 oldip = iph->saddr;
53 newip = tuple->src.u3.ip; 53 newip = tuple->src.u3.ip;
@@ -72,12 +72,10 @@ udplite_manip_pkt(struct sk_buff *skb,
72 72
73static const struct nf_nat_protocol nf_nat_protocol_udplite = { 73static const struct nf_nat_protocol nf_nat_protocol_udplite = {
74 .protonum = IPPROTO_UDPLITE, 74 .protonum = IPPROTO_UDPLITE,
75 .me = THIS_MODULE,
76 .manip_pkt = udplite_manip_pkt, 75 .manip_pkt = udplite_manip_pkt,
77 .in_range = nf_nat_proto_in_range, 76 .in_range = nf_nat_proto_in_range,
78 .unique_tuple = udplite_unique_tuple, 77 .unique_tuple = udplite_unique_tuple,
79#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 78#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
80 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
81 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 79 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
82#endif 80#endif
83}; 81};
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
index a50f2bc1c732..e0afe8112b1c 100644
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
27} 27}
28 28
29static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple, 29static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
30 const struct nf_nat_range *range, 30 const struct nf_nat_ipv4_range *range,
31 enum nf_nat_manip_type maniptype, 31 enum nf_nat_manip_type maniptype,
32 const struct nf_conn *ct) 32 const struct nf_conn *ct)
33{ 33{
@@ -46,7 +46,6 @@ unknown_manip_pkt(struct sk_buff *skb,
46} 46}
47 47
48const struct nf_nat_protocol nf_nat_unknown_protocol = { 48const struct nf_nat_protocol nf_nat_unknown_protocol = {
49 /* .me isn't set: getting a ref to this cannot fail. */
50 .manip_pkt = unknown_manip_pkt, 49 .manip_pkt = unknown_manip_pkt,
51 .in_range = unknown_in_range, 50 .in_range = unknown_in_range,
52 .unique_tuple = unknown_unique_tuple, 51 .unique_tuple = unknown_unique_tuple,
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 733c9abc1cbd..d2a9dc314e0e 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -44,7 +44,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
44{ 44{
45 struct nf_conn *ct; 45 struct nf_conn *ct;
46 enum ip_conntrack_info ctinfo; 46 enum ip_conntrack_info ctinfo;
47 const struct nf_nat_multi_range_compat *mr = par->targinfo; 47 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
48 48
49 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING || 49 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
50 par->hooknum == NF_INET_LOCAL_IN); 50 par->hooknum == NF_INET_LOCAL_IN);
@@ -56,7 +56,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
56 ctinfo == IP_CT_RELATED_REPLY)); 56 ctinfo == IP_CT_RELATED_REPLY));
57 NF_CT_ASSERT(par->out != NULL); 57 NF_CT_ASSERT(par->out != NULL);
58 58
59 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); 59 return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_SRC);
60} 60}
61 61
62static unsigned int 62static unsigned int
@@ -64,7 +64,7 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
64{ 64{
65 struct nf_conn *ct; 65 struct nf_conn *ct;
66 enum ip_conntrack_info ctinfo; 66 enum ip_conntrack_info ctinfo;
67 const struct nf_nat_multi_range_compat *mr = par->targinfo; 67 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
68 68
69 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || 69 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
70 par->hooknum == NF_INET_LOCAL_OUT); 70 par->hooknum == NF_INET_LOCAL_OUT);
@@ -74,12 +74,12 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
74 /* Connection must be valid and new. */ 74 /* Connection must be valid and new. */
75 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); 75 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
76 76
77 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); 77 return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_DST);
78} 78}
79 79
80static int ipt_snat_checkentry(const struct xt_tgchk_param *par) 80static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
81{ 81{
82 const struct nf_nat_multi_range_compat *mr = par->targinfo; 82 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
83 83
84 /* Must be a valid range */ 84 /* Must be a valid range */
85 if (mr->rangesize != 1) { 85 if (mr->rangesize != 1) {
@@ -91,7 +91,7 @@ static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
91 91
92static int ipt_dnat_checkentry(const struct xt_tgchk_param *par) 92static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
93{ 93{
94 const struct nf_nat_multi_range_compat *mr = par->targinfo; 94 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
95 95
96 /* Must be a valid range */ 96 /* Must be a valid range */
97 if (mr->rangesize != 1) { 97 if (mr->rangesize != 1) {
@@ -105,13 +105,13 @@ static unsigned int
105alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 105alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
106{ 106{
107 /* Force range to this IP; let proto decide mapping for 107 /* Force range to this IP; let proto decide mapping for
108 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 108 per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
109 */ 109 */
110 struct nf_nat_range range; 110 struct nf_nat_ipv4_range range;
111 111
112 range.flags = 0; 112 range.flags = 0;
113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, 113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
114 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? 114 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : 115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); 116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
117 117
@@ -140,7 +140,7 @@ int nf_nat_rule_find(struct sk_buff *skb,
140static struct xt_target ipt_snat_reg __read_mostly = { 140static struct xt_target ipt_snat_reg __read_mostly = {
141 .name = "SNAT", 141 .name = "SNAT",
142 .target = ipt_snat_target, 142 .target = ipt_snat_target,
143 .targetsize = sizeof(struct nf_nat_multi_range_compat), 143 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
144 .table = "nat", 144 .table = "nat",
145 .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), 145 .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
146 .checkentry = ipt_snat_checkentry, 146 .checkentry = ipt_snat_checkentry,
@@ -150,7 +150,7 @@ static struct xt_target ipt_snat_reg __read_mostly = {
150static struct xt_target ipt_dnat_reg __read_mostly = { 150static struct xt_target ipt_dnat_reg __read_mostly = {
151 .name = "DNAT", 151 .name = "DNAT",
152 .target = ipt_dnat_target, 152 .target = ipt_dnat_target,
153 .targetsize = sizeof(struct nf_nat_multi_range_compat), 153 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
154 .table = "nat", 154 .table = "nat",
155 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), 155 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
156 .checkentry = ipt_dnat_checkentry, 156 .checkentry = ipt_dnat_checkentry,
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 78844d9208f1..d0319f96269f 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -249,25 +249,25 @@ static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
249static void ip_nat_sip_expected(struct nf_conn *ct, 249static void ip_nat_sip_expected(struct nf_conn *ct,
250 struct nf_conntrack_expect *exp) 250 struct nf_conntrack_expect *exp)
251{ 251{
252 struct nf_nat_range range; 252 struct nf_nat_ipv4_range range;
253 253
254 /* This must be a fresh one. */ 254 /* This must be a fresh one. */
255 BUG_ON(ct->status & IPS_NAT_DONE_MASK); 255 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
256 256
257 /* For DST manip, map port here to where it's expected. */ 257 /* For DST manip, map port here to where it's expected. */
258 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); 258 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
259 range.min = range.max = exp->saved_proto; 259 range.min = range.max = exp->saved_proto;
260 range.min_ip = range.max_ip = exp->saved_ip; 260 range.min_ip = range.max_ip = exp->saved_ip;
261 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); 261 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
262 262
263 /* Change src to where master sends to, but only if the connection 263 /* Change src to where master sends to, but only if the connection
264 * actually came from the same source. */ 264 * actually came from the same source. */
265 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 265 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip ==
266 ct->master->tuplehash[exp->dir].tuple.src.u3.ip) { 266 ct->master->tuplehash[exp->dir].tuple.src.u3.ip) {
267 range.flags = IP_NAT_RANGE_MAP_IPS; 267 range.flags = NF_NAT_RANGE_MAP_IPS;
268 range.min_ip = range.max_ip 268 range.min_ip = range.max_ip
269 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; 269 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
270 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); 270 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
271 } 271 }
272} 272}
273 273
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 92900482edea..3828a4229822 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -137,7 +137,7 @@ nf_nat_fn(unsigned int hooknum,
137 return ret; 137 return ret;
138 } else 138 } else
139 pr_debug("Already setup manip %s for ct %p\n", 139 pr_debug("Already setup manip %s for ct %p\n",
140 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", 140 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
141 ct); 141 ct);
142 break; 142 break;
143 143
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 43d4c3b22369..b072386cee21 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -140,13 +140,14 @@ static void ping_v4_unhash(struct sock *sk)
140 write_lock_bh(&ping_table.lock); 140 write_lock_bh(&ping_table.lock);
141 hlist_nulls_del(&sk->sk_nulls_node); 141 hlist_nulls_del(&sk->sk_nulls_node);
142 sock_put(sk); 142 sock_put(sk);
143 isk->inet_num = isk->inet_sport = 0; 143 isk->inet_num = 0;
144 isk->inet_sport = 0;
144 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 145 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
145 write_unlock_bh(&ping_table.lock); 146 write_unlock_bh(&ping_table.lock);
146 } 147 }
147} 148}
148 149
149static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr, 150static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr,
150 u16 ident, int dif) 151 u16 ident, int dif)
151{ 152{
152 struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); 153 struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
@@ -154,15 +155,15 @@ static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr,
154 struct inet_sock *isk; 155 struct inet_sock *isk;
155 struct hlist_nulls_node *hnode; 156 struct hlist_nulls_node *hnode;
156 157
157 pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n", 158 pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
158 (int)ident, (unsigned long)daddr, dif); 159 (int)ident, &daddr, dif);
159 read_lock_bh(&ping_table.lock); 160 read_lock_bh(&ping_table.lock);
160 161
161 ping_portaddr_for_each_entry(sk, hnode, hslot) { 162 ping_portaddr_for_each_entry(sk, hnode, hslot) {
162 isk = inet_sk(sk); 163 isk = inet_sk(sk);
163 164
164 pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk, 165 pr_debug("found: %p: num = %d, daddr = %pI4, dif = %d\n", sk,
165 (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr, 166 (int)isk->inet_num, &isk->inet_rcv_saddr,
166 sk->sk_bound_dev_if); 167 sk->sk_bound_dev_if);
167 168
168 pr_debug("iterate\n"); 169 pr_debug("iterate\n");
@@ -254,7 +255,7 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
254 sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); 255 sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
255 256
256 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); 257 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
257 if (addr->sin_addr.s_addr == INADDR_ANY) 258 if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
258 chk_addr_ret = RTN_LOCAL; 259 chk_addr_ret = RTN_LOCAL;
259 260
260 if ((sysctl_ip_nonlocal_bind == 0 && 261 if ((sysctl_ip_nonlocal_bind == 0 &&
@@ -278,9 +279,9 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
278 goto out; 279 goto out;
279 } 280 }
280 281
281 pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n", 282 pr_debug("after bind(): num = %d, daddr = %pI4, dif = %d\n",
282 (int)isk->inet_num, 283 (int)isk->inet_num,
283 (unsigned long) isk->inet_rcv_saddr, 284 &isk->inet_rcv_saddr,
284 (int)sk->sk_bound_dev_if); 285 (int)sk->sk_bound_dev_if);
285 286
286 err = 0; 287 err = 0;
@@ -407,7 +408,7 @@ out:
407struct pingfakehdr { 408struct pingfakehdr {
408 struct icmphdr icmph; 409 struct icmphdr icmph;
409 struct iovec *iov; 410 struct iovec *iov;
410 u32 wcheck; 411 __wsum wcheck;
411}; 412};
412 413
413static int ping_getfrag(void *from, char * to, 414static int ping_getfrag(void *from, char * to,
@@ -459,7 +460,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
459 struct rtable *rt = NULL; 460 struct rtable *rt = NULL;
460 struct ip_options_data opt_copy; 461 struct ip_options_data opt_copy;
461 int free = 0; 462 int free = 0;
462 u32 saddr, daddr, faddr; 463 __be32 saddr, daddr, faddr;
463 u8 tos; 464 u8 tos;
464 int err; 465 int err;
465 466
@@ -629,6 +630,7 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
629 630
630 pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); 631 pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
631 632
633 err = -EOPNOTSUPP;
632 if (flags & MSG_OOB) 634 if (flags & MSG_OOB)
633 goto out; 635 goto out;
634 636
@@ -696,8 +698,8 @@ void ping_rcv(struct sk_buff *skb)
696 struct net *net = dev_net(skb->dev); 698 struct net *net = dev_net(skb->dev);
697 struct iphdr *iph = ip_hdr(skb); 699 struct iphdr *iph = ip_hdr(skb);
698 struct icmphdr *icmph = icmp_hdr(skb); 700 struct icmphdr *icmph = icmp_hdr(skb);
699 u32 saddr = iph->saddr; 701 __be32 saddr = iph->saddr;
700 u32 daddr = iph->daddr; 702 __be32 daddr = iph->daddr;
701 703
702 /* We assume the packet has already been checked by icmp_rcv */ 704 /* We assume the packet has already been checked by icmp_rcv */
703 705
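
The ping.c hunks are mostly endianness hygiene: addresses move from u32/unsigned long to __be32, the checksum accumulator becomes __wsum, the INADDR_ANY comparison converts the constant rather than the field, and debug output switches to %pI4, which takes a pointer to a network-order address and prints a dotted quad. The resulting sparse-clean pattern, as a short sketch:

	__be32 daddr = ip_hdr(skb)->daddr;	/* stays in network order */

	if (daddr == htonl(INADDR_ANY))		/* convert the constant once */
		pr_debug("wildcard address\n");
	pr_debug("daddr = %pI4\n", &daddr);	/* %pI4 wants a __be32 * */
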
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 466ea8bb7a4d..6afc807ee2ad 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -56,17 +56,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
56 56
57 local_bh_disable(); 57 local_bh_disable();
58 orphans = percpu_counter_sum_positive(&tcp_orphan_count); 58 orphans = percpu_counter_sum_positive(&tcp_orphan_count);
59 sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); 59 sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
60 local_bh_enable(); 60 local_bh_enable();
61 61
62 socket_seq_show(seq); 62 socket_seq_show(seq);
63 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", 63 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
64 sock_prot_inuse_get(net, &tcp_prot), orphans, 64 sock_prot_inuse_get(net, &tcp_prot), orphans,
65 tcp_death_row.tw_count, sockets, 65 tcp_death_row.tw_count, sockets,
66 atomic_long_read(&tcp_memory_allocated)); 66 proto_memory_allocated(&tcp_prot));
67 seq_printf(seq, "UDP: inuse %d mem %ld\n", 67 seq_printf(seq, "UDP: inuse %d mem %ld\n",
68 sock_prot_inuse_get(net, &udp_prot), 68 sock_prot_inuse_get(net, &udp_prot),
69 atomic_long_read(&udp_memory_allocated)); 69 proto_memory_allocated(&udp_prot));
70 seq_printf(seq, "UDPLITE: inuse %d\n", 70 seq_printf(seq, "UDPLITE: inuse %d\n",
71 sock_prot_inuse_get(net, &udplite_prot)); 71 sock_prot_inuse_get(net, &udplite_prot));
72 seq_printf(seq, "RAW: inuse %d\n", 72 seq_printf(seq, "RAW: inuse %d\n",
@@ -216,7 +216,6 @@ static const struct snmp_mib snmp4_net_list[] = {
216 SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), 216 SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
217 SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), 217 SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
218 SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), 218 SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
219 SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
220 SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), 219 SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
221 SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), 220 SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
222 SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), 221 SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
@@ -288,7 +287,7 @@ static void icmpmsg_put(struct seq_file *seq)
288 287
289 count = 0; 288 count = 0;
290 for (i = 0; i < ICMPMSG_MIB_MAX; i++) { 289 for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
291 val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i); 290 val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
292 if (val) { 291 if (val) {
293 type[count] = i; 292 type[count] = i;
294 vals[count++] = val; 293 vals[count++] = val;
@@ -307,6 +306,7 @@ static void icmp_put(struct seq_file *seq)
307{ 306{
308 int i; 307 int i;
309 struct net *net = seq->private; 308 struct net *net = seq->private;
309 atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
310 310
311 seq_puts(seq, "\nIcmp: InMsgs InErrors"); 311 seq_puts(seq, "\nIcmp: InMsgs InErrors");
312 for (i=0; icmpmibmap[i].name != NULL; i++) 312 for (i=0; icmpmibmap[i].name != NULL; i++)
@@ -319,15 +319,13 @@ static void icmp_put(struct seq_file *seq)
319 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); 319 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
320 for (i=0; icmpmibmap[i].name != NULL; i++) 320 for (i=0; icmpmibmap[i].name != NULL; i++)
321 seq_printf(seq, " %lu", 321 seq_printf(seq, " %lu",
322 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, 322 atomic_long_read(ptr + icmpmibmap[i].index));
323 icmpmibmap[i].index));
324 seq_printf(seq, " %lu %lu", 323 seq_printf(seq, " %lu %lu",
325 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), 324 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
326 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); 325 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
327 for (i=0; icmpmibmap[i].name != NULL; i++) 326 for (i=0; icmpmibmap[i].name != NULL; i++)
328 seq_printf(seq, " %lu", 327 seq_printf(seq, " %lu",
329 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, 328 atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
330 icmpmibmap[i].index | 0x100));
331} 329}
332 330
333/* 331/*
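
Editor's note: two independent cleanups meet in proc.c. The TCP/UDP counters in /proc/net/sockstat are now read through the generic proto_memory_allocated() and proto_sockets_allocated_sum_positive() wrappers, groundwork for the per-cgroup accounting added later in this series. Separately, the ICMP message counters drop their per-cpu tables in favour of a flat array of atomic_long_t, so a read is a single atomic load instead of a fold across all CPUs; the output counters sit at index | 0x100, as the icmp_put() hunk shows. A sketch of the new read path, with invented helper names:

    /* Sketch, assuming the flat icmpmsg layout used above. */
    static unsigned long sketch_icmpmsg_in(struct net *net, unsigned int type)
    {
            return atomic_long_read(&net->mib.icmpmsg_statistics->mibs[type]);
    }

    static unsigned long sketch_icmpmsg_out(struct net *net, unsigned int type)
    {
            /* OUT counters are offset by 0x100, as in icmp_put() above */
            return atomic_long_read(&net->mib.icmpmsg_statistics->mibs[type | 0x100]);
    }
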
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 007e2eb769d3..3ccda5ae8a27 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -292,7 +292,8 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
292{ 292{
293 /* Charge it to the socket. */ 293 /* Charge it to the socket. */
294 294
295 if (ip_queue_rcv_skb(sk, skb) < 0) { 295 ipv4_pktinfo_prepare(skb);
296 if (sock_queue_rcv_skb(sk, skb) < 0) {
296 kfree_skb(skb); 297 kfree_skb(skb);
297 return NET_RX_DROP; 298 return NET_RX_DROP;
298 } 299 }
@@ -327,6 +328,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
327 unsigned int iphlen; 328 unsigned int iphlen;
328 int err; 329 int err;
329 struct rtable *rt = *rtp; 330 struct rtable *rt = *rtp;
331 int hlen, tlen;
330 332
331 if (length > rt->dst.dev->mtu) { 333 if (length > rt->dst.dev->mtu) {
332 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, 334 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -336,12 +338,14 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
336 if (flags&MSG_PROBE) 338 if (flags&MSG_PROBE)
337 goto out; 339 goto out;
338 340
341 hlen = LL_RESERVED_SPACE(rt->dst.dev);
342 tlen = rt->dst.dev->needed_tailroom;
339 skb = sock_alloc_send_skb(sk, 343 skb = sock_alloc_send_skb(sk,
340 length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, 344 length + hlen + tlen + 15,
341 flags & MSG_DONTWAIT, &err); 345 flags & MSG_DONTWAIT, &err);
342 if (skb == NULL) 346 if (skb == NULL)
343 goto error; 347 goto error;
344 skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); 348 skb_reserve(skb, hlen);
345 349
346 skb->priority = sk->sk_priority; 350 skb->priority = sk->sk_priority;
347 skb->mark = sk->sk_mark; 351 skb->mark = sk->sk_mark;
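
Editor's note: raw.c picks up two mechanical conversions. Queued packets now go through ipv4_pktinfo_prepare() plus plain sock_queue_rcv_skb(), and the removed LL_ALLOCATED_SPACE() macro is replaced by separate head room (LL_RESERVED_SPACE) and tail room (needed_tailroom) terms; the original also keeps its 15-byte alignment slack, omitted here. A sketch of the resulting allocation pattern, with an invented helper name:

    /* Sketch: size the skb for link layer head and tail room, then
     * reserve the head room so the payload starts after the device
     * header.
     */
    static struct sk_buff *sketch_alloc_for_dev(struct sock *sk,
                                                struct net_device *dev,
                                                int length, int noblock,
                                                int *err)
    {
            int hlen = LL_RESERVED_SPACE(dev);
            int tlen = dev->needed_tailroom;
            struct sk_buff *skb;

            skb = sock_alloc_send_skb(sk, length + hlen + tlen, noblock, err);
            if (skb)
                    skb_reserve(skb, hlen);
            return skb;
    }
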
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94cdbc55ca7e..019774796174 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,7 +109,6 @@
109#ifdef CONFIG_SYSCTL 109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h> 110#include <linux/sysctl.h>
111#endif 111#endif
112#include <net/atmclip.h>
113#include <net/secure_seq.h> 112#include <net/secure_seq.h>
114 113
115#define RT_FL_TOS(oldflp4) \ 114#define RT_FL_TOS(oldflp4) \
@@ -133,7 +132,6 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
133static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 132static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
134static int ip_rt_min_advmss __read_mostly = 256; 133static int ip_rt_min_advmss __read_mostly = 256;
135static int rt_chain_length_max __read_mostly = 20; 134static int rt_chain_length_max __read_mostly = 20;
136static int redirect_genid;
137 135
138static struct delayed_work expires_work; 136static struct delayed_work expires_work;
139static unsigned long expires_ljiffies; 137static unsigned long expires_ljiffies;
@@ -425,7 +423,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
425 int len, HHUptod; 423 int len, HHUptod;
426 424
427 rcu_read_lock(); 425 rcu_read_lock();
428 n = dst_get_neighbour(&r->dst); 426 n = dst_get_neighbour_noref(&r->dst);
429 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; 427 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430 rcu_read_unlock(); 428 rcu_read_unlock();
431 429
@@ -938,7 +936,7 @@ static void rt_cache_invalidate(struct net *net)
938 936
939 get_random_bytes(&shuffle, sizeof(shuffle)); 937 get_random_bytes(&shuffle, sizeof(shuffle));
940 atomic_add(shuffle + 1U, &net->ipv4.rt_genid); 938 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941 redirect_genid++; 939 inetpeer_invalidate_tree(AF_INET);
942} 940}
943 941
944/* 942/*
@@ -1115,23 +1113,18 @@ static int slow_chain_length(const struct rtable *head)
1115 1113
1116static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) 1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1117{ 1115{
1118 struct neigh_table *tbl = &arp_tbl;
1119 static const __be32 inaddr_any = 0; 1116 static const __be32 inaddr_any = 0;
1120 struct net_device *dev = dst->dev; 1117 struct net_device *dev = dst->dev;
1121 const __be32 *pkey = daddr; 1118 const __be32 *pkey = daddr;
1122 struct neighbour *n; 1119 struct neighbour *n;
1123 1120
1124#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1125 if (dev->type == ARPHRD_ATM)
1126 tbl = clip_tbl_hook;
1127#endif
1128 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) 1121 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1129 pkey = &inaddr_any; 1122 pkey = &inaddr_any;
1130 1123
1131 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey); 1124 n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1132 if (n) 1125 if (n)
1133 return n; 1126 return n;
1134 return neigh_create(tbl, pkey, dev); 1127 return neigh_create(&arp_tbl, pkey, dev);
1135} 1128}
1136 1129
1137static int rt_bind_neighbour(struct rtable *rt) 1130static int rt_bind_neighbour(struct rtable *rt)
@@ -1491,10 +1484,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1491 1484
1492 peer = rt->peer; 1485 peer = rt->peer;
1493 if (peer) { 1486 if (peer) {
1494 if (peer->redirect_learned.a4 != new_gw || 1487 if (peer->redirect_learned.a4 != new_gw) {
1495 peer->redirect_genid != redirect_genid) {
1496 peer->redirect_learned.a4 = new_gw; 1488 peer->redirect_learned.a4 = new_gw;
1497 peer->redirect_genid = redirect_genid;
1498 atomic_inc(&__rt_peer_genid); 1489 atomic_inc(&__rt_peer_genid);
1499 } 1490 }
1500 check_peer_redir(&rt->dst, peer); 1491 check_peer_redir(&rt->dst, peer);
@@ -1799,8 +1790,6 @@ static void ipv4_validate_peer(struct rtable *rt)
1799 if (peer) { 1790 if (peer) {
1800 check_peer_pmtu(&rt->dst, peer); 1791 check_peer_pmtu(&rt->dst, peer);
1801 1792
1802 if (peer->redirect_genid != redirect_genid)
1803 peer->redirect_learned.a4 = 0;
1804 if (peer->redirect_learned.a4 && 1793 if (peer->redirect_learned.a4 &&
1805 peer->redirect_learned.a4 != rt->rt_gateway) 1794 peer->redirect_learned.a4 != rt->rt_gateway)
1806 check_peer_redir(&rt->dst, peer); 1795 check_peer_redir(&rt->dst, peer);
@@ -1964,8 +1953,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1964 dst_init_metrics(&rt->dst, peer->metrics, false); 1953 dst_init_metrics(&rt->dst, peer->metrics, false);
1965 1954
1966 check_peer_pmtu(&rt->dst, peer); 1955 check_peer_pmtu(&rt->dst, peer);
1967 if (peer->redirect_genid != redirect_genid) 1956
1968 peer->redirect_learned.a4 = 0;
1969 if (peer->redirect_learned.a4 && 1957 if (peer->redirect_learned.a4 &&
1970 peer->redirect_learned.a4 != rt->rt_gateway) { 1958 peer->redirect_learned.a4 != rt->rt_gateway) {
1971 rt->rt_gateway = peer->redirect_learned.a4; 1959 rt->rt_gateway = peer->redirect_learned.a4;
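
Editor's note: the route.c hunks are removals in two directions. The ATM CLIP special case disappears from ipv4_neigh_lookup(), so arp_tbl is used unconditionally and the net/atmclip.h include goes away. And the global redirect_genid counter, which every peer had to be lazily checked against at several call sites, is replaced by invalidating the whole AF_INET inetpeer tree when the routing cache is flushed. A sketch of the new flush-time behaviour; the wrapper name is invented:

    /* Sketch: one tree invalidation replaces per-peer generation checks. */
    static void sketch_flush_learned_redirects(void)
    {
            inetpeer_invalidate_tree(AF_INET);  /* drops learned redirects */
    }
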
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 90f6544c13e2..eab2a7fb15d1 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -245,7 +245,7 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
245 if (!sysctl_tcp_timestamps) 245 if (!sysctl_tcp_timestamps)
246 return false; 246 return false;
247 247
248 tcp_opt->sack_ok = (options >> 4) & 0x1; 248 tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0;
249 *ecn_ok = (options >> 5) & 1; 249 *ecn_ok = (options >> 5) & 1;
250 if (*ecn_ok && !sysctl_tcp_ecn) 250 if (*ecn_ok && !sysctl_tcp_ecn)
251 return false; 251 return false;
@@ -278,6 +278,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
278 struct rtable *rt; 278 struct rtable *rt;
279 __u8 rcv_wscale; 279 __u8 rcv_wscale;
280 bool ecn_ok = false; 280 bool ecn_ok = false;
281 struct flowi4 fl4;
281 282
282 if (!sysctl_tcp_syncookies || !th->ack || th->rst) 283 if (!sysctl_tcp_syncookies || !th->ack || th->rst)
283 goto out; 284 goto out;
@@ -346,20 +347,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
346 * hasn't changed since we received the original syn, but I see 347 * hasn't changed since we received the original syn, but I see
347 * no easy way to do this. 348 * no easy way to do this.
348 */ 349 */
349 { 350 flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
350 struct flowi4 fl4; 351 RT_SCOPE_UNIVERSE, IPPROTO_TCP,
351 352 inet_sk_flowi_flags(sk),
352 flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), 353 (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
353 RT_SCOPE_UNIVERSE, IPPROTO_TCP, 354 ireq->loc_addr, th->source, th->dest);
354 inet_sk_flowi_flags(sk), 355 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
355 (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, 356 rt = ip_route_output_key(sock_net(sk), &fl4);
356 ireq->loc_addr, th->source, th->dest); 357 if (IS_ERR(rt)) {
357 security_req_classify_flow(req, flowi4_to_flowi(&fl4)); 358 reqsk_free(req);
358 rt = ip_route_output_key(sock_net(sk), &fl4); 359 goto out;
359 if (IS_ERR(rt)) {
360 reqsk_free(req);
361 goto out;
362 }
363 } 360 }
364 361
365 /* Try to redo what tcp_v4_send_synack did. */ 362 /* Try to redo what tcp_v4_send_synack did. */
@@ -373,5 +370,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
373 ireq->rcv_wscale = rcv_wscale; 370 ireq->rcv_wscale = rcv_wscale;
374 371
375 ret = get_cookie_sock(sk, skb, req, &rt->dst); 372 ret = get_cookie_sock(sk, skb, req, &rt->dst);
373 /* ip_queue_xmit() depends on our flow being setup
374 * Normal sockets get it right from inet_csk_route_child_sock()
375 */
376 if (ret)
377 inet_sk(ret)->cork.fl.u.ip4 = fl4;
376out: return ret; 378out: return ret;
377} 379}
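
Editor's note: besides replacing the magic sack_ok bit with TCP_SACK_SEEN and hoisting the flowi4 out of its block scope, the syncookies change fixes a real hole. A child socket built from a cookie never passes through inet_csk_route_child_sock(), so nothing had saved the flow that ip_queue_xmit() later relies on; the final hunk copies it into the socket's cork. A sketch of the consumer side, with an invented helper name:

    /* Sketch: where the saved flow is picked up after the copy above. */
    static struct flowi4 *sketch_cookie_child_flow(struct sock *sk)
    {
            return &inet_sk(sk)->cork.fl.u.ip4; /* set by cookie_v4_check() */
    }
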
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 69fd7201129a..7a7724da9bff 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/nsproxy.h> 16#include <linux/nsproxy.h>
17#include <linux/swap.h>
17#include <net/snmp.h> 18#include <net/snmp.h>
18#include <net/icmp.h> 19#include <net/icmp.h>
19#include <net/ip.h> 20#include <net/ip.h>
@@ -23,6 +24,7 @@
23#include <net/cipso_ipv4.h> 24#include <net/cipso_ipv4.h>
24#include <net/inet_frag.h> 25#include <net/inet_frag.h>
25#include <net/ping.h> 26#include <net/ping.h>
27#include <net/tcp_memcontrol.h>
26 28
27static int zero; 29static int zero;
28static int tcp_retr1_max = 255; 30static int tcp_retr1_max = 255;
@@ -73,7 +75,7 @@ static int ipv4_local_port_range(ctl_table *table, int write,
73} 75}
74 76
75 77
76void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) 78static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
77{ 79{
78 gid_t *data = table->data; 80 gid_t *data = table->data;
79 unsigned seq; 81 unsigned seq;
@@ -86,7 +88,7 @@ void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t
86} 88}
87 89
88/* Update system visible IP port range */ 90/* Update system visible IP port range */
89static void set_ping_group_range(struct ctl_table *table, int range[2]) 91static void set_ping_group_range(struct ctl_table *table, gid_t range[2])
90{ 92{
91 gid_t *data = table->data; 93 gid_t *data = table->data;
92 write_seqlock(&sysctl_local_ports.lock); 94 write_seqlock(&sysctl_local_ports.lock);
@@ -174,6 +176,49 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
174 return ret; 176 return ret;
175} 177}
176 178
179static int ipv4_tcp_mem(ctl_table *ctl, int write,
180 void __user *buffer, size_t *lenp,
181 loff_t *ppos)
182{
183 int ret;
184 unsigned long vec[3];
185 struct net *net = current->nsproxy->net_ns;
186#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
187 struct mem_cgroup *memcg;
188#endif
189
190 ctl_table tmp = {
191 .data = &vec,
192 .maxlen = sizeof(vec),
193 .mode = ctl->mode,
194 };
195
196 if (!write) {
197 ctl->data = &net->ipv4.sysctl_tcp_mem;
198 return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
199 }
200
201 ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
202 if (ret)
203 return ret;
204
205#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
206 rcu_read_lock();
207 memcg = mem_cgroup_from_task(current);
208
209 tcp_prot_mem(memcg, vec[0], 0);
210 tcp_prot_mem(memcg, vec[1], 1);
211 tcp_prot_mem(memcg, vec[2], 2);
212 rcu_read_unlock();
213#endif
214
215 net->ipv4.sysctl_tcp_mem[0] = vec[0];
216 net->ipv4.sysctl_tcp_mem[1] = vec[1];
217 net->ipv4.sysctl_tcp_mem[2] = vec[2];
218
219 return 0;
220}
221
177static struct ctl_table ipv4_table[] = { 222static struct ctl_table ipv4_table[] = {
178 { 223 {
179 .procname = "tcp_timestamps", 224 .procname = "tcp_timestamps",
@@ -433,13 +478,6 @@ static struct ctl_table ipv4_table[] = {
433 .proc_handler = proc_dointvec 478 .proc_handler = proc_dointvec
434 }, 479 },
435 { 480 {
436 .procname = "tcp_mem",
437 .data = &sysctl_tcp_mem,
438 .maxlen = sizeof(sysctl_tcp_mem),
439 .mode = 0644,
440 .proc_handler = proc_doulongvec_minmax
441 },
442 {
443 .procname = "tcp_wmem", 481 .procname = "tcp_wmem",
444 .data = &sysctl_tcp_wmem, 482 .data = &sysctl_tcp_wmem,
445 .maxlen = sizeof(sysctl_tcp_wmem), 483 .maxlen = sizeof(sysctl_tcp_wmem),
@@ -721,6 +759,12 @@ static struct ctl_table ipv4_net_table[] = {
721 .mode = 0644, 759 .mode = 0644,
722 .proc_handler = ipv4_ping_group_range, 760 .proc_handler = ipv4_ping_group_range,
723 }, 761 },
762 {
763 .procname = "tcp_mem",
764 .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem),
765 .mode = 0644,
766 .proc_handler = ipv4_tcp_mem,
767 },
724 { } 768 { }
725}; 769};
726 770
@@ -769,6 +813,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
769 813
770 net->ipv4.sysctl_rt_cache_rebuild_count = 4; 814 net->ipv4.sysctl_rt_cache_rebuild_count = 4;
771 815
816 tcp_init_mem(net);
817
772 net->ipv4.ipv4_hdr = register_net_sysctl_table(net, 818 net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
773 net_ipv4_ctl_path, table); 819 net_ipv4_ctl_path, table);
774 if (net->ipv4.ipv4_hdr == NULL) 820 if (net->ipv4.ipv4_hdr == NULL)
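
Editor's note: tcp_mem moves from a global sysctl to a per-namespace one. The entry leaves ipv4_table, reappears in ipv4_net_table, and its handler retargets ctl->data at net->ipv4.sysctl_tcp_mem for reads while staging writes through a stack vector, which also lets the writer's memcg limits be updated via tcp_prot_mem() before the values are committed. The retarget-then-stage shape is a reusable per-net sysctl pattern; a distilled sketch with an invented name, memcg fan-out omitted:

    /* Sketch of the pattern used by ipv4_tcp_mem() above. */
    static int sketch_pernet_tcp_mem(ctl_table *ctl, int write,
                                     void __user *buffer, size_t *lenp,
                                     loff_t *ppos)
    {
            struct net *net = current->nsproxy->net_ns;
            unsigned long vec[3];
            ctl_table tmp = {
                    .data   = &vec,
                    .maxlen = sizeof(vec),
                    .mode   = ctl->mode,
            };
            int ret;

            if (!write) {
                    /* reads: point the generic handler at this netns */
                    ctl->data = &net->ipv4.sysctl_tcp_mem;
                    return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
            }

            /* writes: parse into a stack copy, then commit */
            ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
            if (ret)
                    return ret;
            memcpy(net->ipv4.sysctl_tcp_mem, vec, sizeof(vec));
            return 0;
    }
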
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34f5db1e1c8b..22ef5f9fd2ff 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,11 +282,9 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
282struct percpu_counter tcp_orphan_count; 282struct percpu_counter tcp_orphan_count;
283EXPORT_SYMBOL_GPL(tcp_orphan_count); 283EXPORT_SYMBOL_GPL(tcp_orphan_count);
284 284
285long sysctl_tcp_mem[3] __read_mostly;
286int sysctl_tcp_wmem[3] __read_mostly; 285int sysctl_tcp_wmem[3] __read_mostly;
287int sysctl_tcp_rmem[3] __read_mostly; 286int sysctl_tcp_rmem[3] __read_mostly;
288 287
289EXPORT_SYMBOL(sysctl_tcp_mem);
290EXPORT_SYMBOL(sysctl_tcp_rmem); 288EXPORT_SYMBOL(sysctl_tcp_rmem);
291EXPORT_SYMBOL(sysctl_tcp_wmem); 289EXPORT_SYMBOL(sysctl_tcp_wmem);
292 290
@@ -888,18 +886,18 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
888} 886}
889EXPORT_SYMBOL(tcp_sendpage); 887EXPORT_SYMBOL(tcp_sendpage);
890 888
891#define TCP_PAGE(sk) (sk->sk_sndmsg_page) 889static inline int select_size(const struct sock *sk, bool sg)
892#define TCP_OFF(sk) (sk->sk_sndmsg_off)
893
894static inline int select_size(const struct sock *sk, int sg)
895{ 890{
896 const struct tcp_sock *tp = tcp_sk(sk); 891 const struct tcp_sock *tp = tcp_sk(sk);
897 int tmp = tp->mss_cache; 892 int tmp = tp->mss_cache;
898 893
899 if (sg) { 894 if (sg) {
900 if (sk_can_gso(sk)) 895 if (sk_can_gso(sk)) {
 901 tmp = 0; 896 /* Small frames won't use a full page:
 901 tmp = 0; 896 /* Small frames won't use a full page:
902 else { 897 * Payload will immediately follow tcp header.
898 */
899 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
900 } else {
903 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 901 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
904 902
905 if (tmp >= pgbreak && 903 if (tmp >= pgbreak &&
@@ -917,9 +915,9 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
917 struct iovec *iov; 915 struct iovec *iov;
918 struct tcp_sock *tp = tcp_sk(sk); 916 struct tcp_sock *tp = tcp_sk(sk);
919 struct sk_buff *skb; 917 struct sk_buff *skb;
920 int iovlen, flags; 918 int iovlen, flags, err, copied;
921 int mss_now, size_goal; 919 int mss_now, size_goal;
922 int sg, err, copied; 920 bool sg;
923 long timeo; 921 long timeo;
924 922
925 lock_sock(sk); 923 lock_sock(sk);
@@ -946,7 +944,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
946 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 944 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
947 goto out_err; 945 goto out_err;
948 946
949 sg = sk->sk_route_caps & NETIF_F_SG; 947 sg = !!(sk->sk_route_caps & NETIF_F_SG);
950 948
951 while (--iovlen >= 0) { 949 while (--iovlen >= 0) {
952 size_t seglen = iov->iov_len; 950 size_t seglen = iov->iov_len;
@@ -1005,8 +1003,13 @@ new_segment:
1005 } else { 1003 } else {
1006 int merge = 0; 1004 int merge = 0;
1007 int i = skb_shinfo(skb)->nr_frags; 1005 int i = skb_shinfo(skb)->nr_frags;
1008 struct page *page = TCP_PAGE(sk); 1006 struct page *page = sk->sk_sndmsg_page;
1009 int off = TCP_OFF(sk); 1007 int off;
1008
1009 if (page && page_count(page) == 1)
1010 sk->sk_sndmsg_off = 0;
1011
1012 off = sk->sk_sndmsg_off;
1010 1013
1011 if (skb_can_coalesce(skb, i, page, off) && 1014 if (skb_can_coalesce(skb, i, page, off) &&
1012 off != PAGE_SIZE) { 1015 off != PAGE_SIZE) {
@@ -1023,7 +1026,7 @@ new_segment:
1023 } else if (page) { 1026 } else if (page) {
1024 if (off == PAGE_SIZE) { 1027 if (off == PAGE_SIZE) {
1025 put_page(page); 1028 put_page(page);
1026 TCP_PAGE(sk) = page = NULL; 1029 sk->sk_sndmsg_page = page = NULL;
1027 off = 0; 1030 off = 0;
1028 } 1031 }
1029 } else 1032 } else
@@ -1049,9 +1052,9 @@ new_segment:
1049 /* If this page was new, give it to the 1052 /* If this page was new, give it to the
1050 * socket so it does not get leaked. 1053 * socket so it does not get leaked.
1051 */ 1054 */
1052 if (!TCP_PAGE(sk)) { 1055 if (!sk->sk_sndmsg_page) {
1053 TCP_PAGE(sk) = page; 1056 sk->sk_sndmsg_page = page;
1054 TCP_OFF(sk) = 0; 1057 sk->sk_sndmsg_off = 0;
1055 } 1058 }
1056 goto do_error; 1059 goto do_error;
1057 } 1060 }
@@ -1061,15 +1064,15 @@ new_segment:
1061 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1064 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1062 } else { 1065 } else {
1063 skb_fill_page_desc(skb, i, page, off, copy); 1066 skb_fill_page_desc(skb, i, page, off, copy);
1064 if (TCP_PAGE(sk)) { 1067 if (sk->sk_sndmsg_page) {
1065 get_page(page); 1068 get_page(page);
1066 } else if (off + copy < PAGE_SIZE) { 1069 } else if (off + copy < PAGE_SIZE) {
1067 get_page(page); 1070 get_page(page);
1068 TCP_PAGE(sk) = page; 1071 sk->sk_sndmsg_page = page;
1069 } 1072 }
1070 } 1073 }
1071 1074
1072 TCP_OFF(sk) = off + copy; 1075 sk->sk_sndmsg_off = off + copy;
1073 } 1076 }
1074 1077
1075 if (!copied) 1078 if (!copied)
@@ -1873,6 +1876,20 @@ void tcp_shutdown(struct sock *sk, int how)
1873} 1876}
1874EXPORT_SYMBOL(tcp_shutdown); 1877EXPORT_SYMBOL(tcp_shutdown);
1875 1878
1879bool tcp_check_oom(struct sock *sk, int shift)
1880{
1881 bool too_many_orphans, out_of_socket_memory;
1882
1883 too_many_orphans = tcp_too_many_orphans(sk, shift);
1884 out_of_socket_memory = tcp_out_of_memory(sk);
1885
1886 if (too_many_orphans && net_ratelimit())
1887 pr_info("TCP: too many orphaned sockets\n");
1888 if (out_of_socket_memory && net_ratelimit())
1889 pr_info("TCP: out of memory -- consider tuning tcp_mem\n");
1890 return too_many_orphans || out_of_socket_memory;
1891}
1892
1876void tcp_close(struct sock *sk, long timeout) 1893void tcp_close(struct sock *sk, long timeout)
1877{ 1894{
1878 struct sk_buff *skb; 1895 struct sk_buff *skb;
@@ -2012,10 +2029,7 @@ adjudge_to_death:
2012 } 2029 }
2013 if (sk->sk_state != TCP_CLOSE) { 2030 if (sk->sk_state != TCP_CLOSE) {
2014 sk_mem_reclaim(sk); 2031 sk_mem_reclaim(sk);
2015 if (tcp_too_many_orphans(sk, 0)) { 2032 if (tcp_check_oom(sk, 0)) {
2016 if (net_ratelimit())
2017 printk(KERN_INFO "TCP: too many of orphaned "
2018 "sockets\n");
2019 tcp_set_state(sk, TCP_CLOSE); 2033 tcp_set_state(sk, TCP_CLOSE);
2020 tcp_send_active_reset(sk, GFP_ATOMIC); 2034 tcp_send_active_reset(sk, GFP_ATOMIC);
2021 NET_INC_STATS_BH(sock_net(sk), 2035 NET_INC_STATS_BH(sock_net(sk),
@@ -2653,7 +2667,8 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2653EXPORT_SYMBOL(compat_tcp_getsockopt); 2667EXPORT_SYMBOL(compat_tcp_getsockopt);
2654#endif 2668#endif
2655 2669
2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) 2670struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
2671 netdev_features_t features)
2657{ 2672{
2658 struct sk_buff *segs = ERR_PTR(-EINVAL); 2673 struct sk_buff *segs = ERR_PTR(-EINVAL);
2659 struct tcphdr *th; 2674 struct tcphdr *th;
@@ -3212,11 +3227,21 @@ static int __init set_thash_entries(char *str)
3212} 3227}
3213__setup("thash_entries=", set_thash_entries); 3228__setup("thash_entries=", set_thash_entries);
3214 3229
3230void tcp_init_mem(struct net *net)
3231{
3232 unsigned long limit = nr_free_buffer_pages() / 8;
3233 limit = max(limit, 128UL);
3234 net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
3235 net->ipv4.sysctl_tcp_mem[1] = limit;
3236 net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
3237}
3238
3215void __init tcp_init(void) 3239void __init tcp_init(void)
3216{ 3240{
3217 struct sk_buff *skb = NULL; 3241 struct sk_buff *skb = NULL;
3218 unsigned long limit; 3242 unsigned long limit;
3219 int i, max_share, cnt; 3243 int max_share, cnt;
3244 unsigned int i;
3220 unsigned long jiffy = jiffies; 3245 unsigned long jiffy = jiffies;
3221 3246
3222 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3247 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3259,7 +3284,7 @@ void __init tcp_init(void)
3259 &tcp_hashinfo.bhash_size, 3284 &tcp_hashinfo.bhash_size,
3260 NULL, 3285 NULL,
3261 64 * 1024); 3286 64 * 1024);
3262 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; 3287 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3263 for (i = 0; i < tcp_hashinfo.bhash_size; i++) { 3288 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3264 spin_lock_init(&tcp_hashinfo.bhash[i].lock); 3289 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3265 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 3290 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
@@ -3272,14 +3297,10 @@ void __init tcp_init(void)
3272 sysctl_tcp_max_orphans = cnt / 2; 3297 sysctl_tcp_max_orphans = cnt / 2;
3273 sysctl_max_syn_backlog = max(128, cnt / 256); 3298 sysctl_max_syn_backlog = max(128, cnt / 256);
3274 3299
3275 limit = nr_free_buffer_pages() / 8; 3300 tcp_init_mem(&init_net);
3276 limit = max(limit, 128UL);
3277 sysctl_tcp_mem[0] = limit / 4 * 3;
3278 sysctl_tcp_mem[1] = limit;
3279 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3280
3281 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 3301 /* Set per-socket limits to no more than 1/128 the pressure threshold */
3282 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); 3302 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10);
3303 limit = max(limit, 128UL);
3283 max_share = min(4UL*1024*1024, limit); 3304 max_share = min(4UL*1024*1024, limit);
3284 3305
3285 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 3306 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
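
Editor's note: the tcp.c side of the same series. The boot-time sizing is factored into tcp_init_mem() so each namespace can run it; tcp_check_oom() centralises the orphan and memory checks used at close time; select_size() stops requesting a zero-byte linear area under GSO and instead sizes small frames to fit a 2 KiB allocation; and tcp_sendmsg() recycles sk_sndmsg_page when it holds the last reference. Worked numbers for tcp_init_mem(), assuming nr_free_buffer_pages() reports 262144 (1 GiB of 4 KiB pages); the function name is invented:

    /* Sketch with concrete numbers plugged into the formula above. */
    static void sketch_tcp_init_mem(unsigned long tcp_mem[3])
    {
            unsigned long limit = max(262144UL / 8, 128UL); /* 32768 pages */

            tcp_mem[0] = limit / 4 * 3; /* 24576 pages =  96 MiB: low      */
            tcp_mem[1] = limit;         /* 32768 pages = 128 MiB: pressure */
            tcp_mem[2] = tcp_mem[0] * 2;/* 49152 pages = 192 MiB: hard cap */
    }
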
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 6187eb4d1dcf..f45e1c242440 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -63,7 +63,6 @@ static inline void bictcp_reset(struct bictcp *ca)
63{ 63{
64 ca->cnt = 0; 64 ca->cnt = 0;
65 ca->last_max_cwnd = 0; 65 ca->last_max_cwnd = 0;
66 ca->loss_cwnd = 0;
67 ca->last_cwnd = 0; 66 ca->last_cwnd = 0;
68 ca->last_time = 0; 67 ca->last_time = 0;
69 ca->epoch_start = 0; 68 ca->epoch_start = 0;
@@ -72,7 +71,11 @@ static inline void bictcp_reset(struct bictcp *ca)
72 71
73static void bictcp_init(struct sock *sk) 72static void bictcp_init(struct sock *sk)
74{ 73{
75 bictcp_reset(inet_csk_ca(sk)); 74 struct bictcp *ca = inet_csk_ca(sk);
75
76 bictcp_reset(ca);
77 ca->loss_cwnd = 0;
78
76 if (initial_ssthresh) 79 if (initial_ssthresh)
77 tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 80 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
78} 81}
@@ -127,7 +130,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
127 } 130 }
128 131
129 /* if in slow start or link utilization is very low */ 132 /* if in slow start or link utilization is very low */
130 if (ca->loss_cwnd == 0) { 133 if (ca->last_max_cwnd == 0) {
131 if (ca->cnt > 20) /* increase cwnd 5% per RTT */ 134 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
132 ca->cnt = 20; 135 ca->cnt = 20;
133 } 136 }
@@ -185,7 +188,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
185{ 188{
186 const struct tcp_sock *tp = tcp_sk(sk); 189 const struct tcp_sock *tp = tcp_sk(sk);
187 const struct bictcp *ca = inet_csk_ca(sk); 190 const struct bictcp *ca = inet_csk_ca(sk);
188 return max(tp->snd_cwnd, ca->last_max_cwnd); 191 return max(tp->snd_cwnd, ca->loss_cwnd);
189} 192}
190 193
191static void bictcp_state(struct sock *sk, u8 new_state) 194static void bictcp_state(struct sock *sk, u8 new_state)
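
Editor's note on the BIC fix (mirrored in tcp_cubic.c just below): bictcp_reset() also runs on loss events, so zeroing loss_cwnd there made bictcp_undo_cwnd() forget the window saved at the last loss and fall back to last_max_cwnd. loss_cwnd is now cleared only at init, the "still in slow start" test switches to last_max_cwnd == 0, and undo returns the value it was meant to. A sketch matching the hunk above:

    /* Sketch: loss_cwnd survives bictcp_reset(), so undo can use it. */
    static u32 sketch_bic_undo_cwnd(const struct tcp_sock *tp,
                                    const struct bictcp *ca)
    {
            return max(tp->snd_cwnd, ca->loss_cwnd); /* cwnd at last loss */
    }
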
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 850c737e08e2..fc6d475f488f 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -292,7 +292,7 @@ int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
292 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && 292 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
293 left * tp->mss_cache < sk->sk_gso_max_size) 293 left * tp->mss_cache < sk->sk_gso_max_size)
294 return 1; 294 return 1;
295 return left <= tcp_max_burst(tp); 295 return left <= tcp_max_tso_deferred_mss(tp);
296} 296}
297EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); 297EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
298 298
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index f376b05cca81..a9077f441cb2 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -107,7 +107,6 @@ static inline void bictcp_reset(struct bictcp *ca)
107{ 107{
108 ca->cnt = 0; 108 ca->cnt = 0;
109 ca->last_max_cwnd = 0; 109 ca->last_max_cwnd = 0;
110 ca->loss_cwnd = 0;
111 ca->last_cwnd = 0; 110 ca->last_cwnd = 0;
112 ca->last_time = 0; 111 ca->last_time = 0;
113 ca->bic_origin_point = 0; 112 ca->bic_origin_point = 0;
@@ -142,7 +141,10 @@ static inline void bictcp_hystart_reset(struct sock *sk)
142 141
143static void bictcp_init(struct sock *sk) 142static void bictcp_init(struct sock *sk)
144{ 143{
145 bictcp_reset(inet_csk_ca(sk)); 144 struct bictcp *ca = inet_csk_ca(sk);
145
146 bictcp_reset(ca);
147 ca->loss_cwnd = 0;
146 148
147 if (hystart) 149 if (hystart)
148 bictcp_hystart_reset(sk); 150 bictcp_hystart_reset(sk);
@@ -275,7 +277,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
275 * The initial growth of cubic function may be too conservative 277 * The initial growth of cubic function may be too conservative
276 * when the available bandwidth is still unknown. 278 * when the available bandwidth is still unknown.
277 */ 279 */
278 if (ca->loss_cwnd == 0 && ca->cnt > 20) 280 if (ca->last_max_cwnd == 0 && ca->cnt > 20)
279 ca->cnt = 20; /* increase cwnd 5% per RTT */ 281 ca->cnt = 20; /* increase cwnd 5% per RTT */
280 282
281 /* TCP Friendly */ 283 /* TCP Friendly */
@@ -342,7 +344,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
342{ 344{
343 struct bictcp *ca = inet_csk_ca(sk); 345 struct bictcp *ca = inet_csk_ca(sk);
344 346
345 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); 347 return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
346} 348}
347 349
348static void bictcp_state(struct sock *sk, u8 new_state) 350static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 939edb3b8e4d..ed3f2ad42e0f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -34,11 +34,23 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
34 tcp_get_info(sk, info); 34 tcp_get_info(sk, info);
35} 35}
36 36
37static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
38 struct inet_diag_req_v2 *r, struct nlattr *bc)
39{
40 inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
41}
42
43static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
44 struct inet_diag_req_v2 *req)
45{
46 return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
47}
48
37static const struct inet_diag_handler tcp_diag_handler = { 49static const struct inet_diag_handler tcp_diag_handler = {
38 .idiag_hashinfo = &tcp_hashinfo, 50 .dump = tcp_diag_dump,
51 .dump_one = tcp_diag_dump_one,
39 .idiag_get_info = tcp_diag_get_info, 52 .idiag_get_info = tcp_diag_get_info,
40 .idiag_type = TCPDIAG_GETSOCK, 53 .idiag_type = IPPROTO_TCP,
41 .idiag_info_size = sizeof(struct tcp_info),
42}; 54};
43 55
44static int __init tcp_diag_init(void) 56static int __init tcp_diag_init(void)
@@ -54,4 +66,4 @@ static void __exit tcp_diag_exit(void)
54module_init(tcp_diag_init); 66module_init(tcp_diag_init);
55module_exit(tcp_diag_exit); 67module_exit(tcp_diag_exit);
56MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
57MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, TCPDIAG_GETSOCK); 69MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
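
Editor's note: tcp_diag is converted to the reworked inet_diag core. The handler exports dump/dump_one callbacks instead of a bare tcp_hashinfo pointer, is keyed by IPPROTO_TCP rather than the old TCPDIAG_GETSOCK, and the module alias moves to NETLINK_SOCK_DIAG so the module demand-loads on a (family, protocol) pair; this is the plumbing the new udp_diag.c reuses. A sketch of the request header that selects this handler, assuming the inet_diag_req_v2 layout from this series:

    /* Sketch: the "2-6" in the alias above is this pair. */
    struct inet_diag_req_v2 sketch_req = {
            .sdiag_family   = AF_INET,      /* 2 */
            .sdiag_protocol = IPPROTO_TCP,  /* 6 */
    };
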
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 52b5c2d0ecd0..b5e315f13641 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -105,7 +105,6 @@ int sysctl_tcp_abc __read_mostly;
105#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ 105#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
106#define FLAG_DATA_SACKED 0x20 /* New SACK. */ 106#define FLAG_DATA_SACKED 0x20 /* New SACK. */
107#define FLAG_ECE 0x40 /* ECE in this ACK */ 107#define FLAG_ECE 0x40 /* ECE in this ACK */
108#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
109#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ 108#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
110#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ 109#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
111#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ 110#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -322,7 +321,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
322 /* Check #1 */ 321 /* Check #1 */
323 if (tp->rcv_ssthresh < tp->window_clamp && 322 if (tp->rcv_ssthresh < tp->window_clamp &&
324 (int)tp->rcv_ssthresh < tcp_space(sk) && 323 (int)tp->rcv_ssthresh < tcp_space(sk) &&
325 !tcp_memory_pressure) { 324 !sk_under_memory_pressure(sk)) {
326 int incr; 325 int incr;
327 326
328 /* Check #2. Increase window, if skb with such overhead 327 /* Check #2. Increase window, if skb with such overhead
@@ -411,8 +410,8 @@ static void tcp_clamp_window(struct sock *sk)
411 410
412 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 411 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
413 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 412 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
414 !tcp_memory_pressure && 413 !sk_under_memory_pressure(sk) &&
415 atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { 414 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
416 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 415 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
417 sysctl_tcp_rmem[2]); 416 sysctl_tcp_rmem[2]);
418 } 417 }
@@ -865,13 +864,13 @@ static void tcp_disable_fack(struct tcp_sock *tp)
865 /* RFC3517 uses different metric in lost marker => reset on change */ 864 /* RFC3517 uses different metric in lost marker => reset on change */
866 if (tcp_is_fack(tp)) 865 if (tcp_is_fack(tp))
867 tp->lost_skb_hint = NULL; 866 tp->lost_skb_hint = NULL;
868 tp->rx_opt.sack_ok &= ~2; 867 tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
869} 868}
870 869
871/* Take a notice that peer is sending D-SACKs */ 870/* Take a notice that peer is sending D-SACKs */
872static void tcp_dsack_seen(struct tcp_sock *tp) 871static void tcp_dsack_seen(struct tcp_sock *tp)
873{ 872{
874 tp->rx_opt.sack_ok |= 4; 873 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
875} 874}
876 875
877/* Initialize metrics on socket. */ 876/* Initialize metrics on socket. */
@@ -1040,13 +1039,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
1040 * These 6 states form finite state machine, controlled by the following events: 1039 * These 6 states form finite state machine, controlled by the following events:
1041 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) 1040 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
1042 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) 1041 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
1043 * 3. Loss detection event of one of three flavors: 1042 * 3. Loss detection event of two flavors:
1044 * A. Scoreboard estimator decided the packet is lost. 1043 * A. Scoreboard estimator decided the packet is lost.
1045 * A'. Reno "three dupacks" marks head of queue lost. 1044 * A'. Reno "three dupacks" marks head of queue lost.
1046 * A''. Its FACK modfication, head until snd.fack is lost. 1045 * A''. Its FACK modification, head until snd.fack is lost.
1047 * B. SACK arrives sacking data transmitted after never retransmitted 1046 * B. SACK arrives sacking SND.NXT at the moment, when the
1048 * hole was sent out.
1049 * C. SACK arrives sacking SND.NXT at the moment, when the
1050 * segment was retransmitted. 1047 * segment was retransmitted.
1051 * 4. D-SACK added new rule: D-SACK changes any tag to S. 1048 * 4. D-SACK added new rule: D-SACK changes any tag to S.
1052 * 1049 *
@@ -1153,7 +1150,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
1153} 1150}
1154 1151
1155/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". 1152/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
1156 * Event "C". Later note: FACK people cheated me again 8), we have to account 1153 * Event "B". Later note: FACK people cheated me again 8), we have to account
1157 * for reordering! Ugly, but should help. 1154 * for reordering! Ugly, but should help.
1158 * 1155 *
1159 * Search retransmitted skbs from write_queue that were sent when snd_nxt was 1156 * Search retransmitted skbs from write_queue that were sent when snd_nxt was
@@ -1310,25 +1307,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1310 return in_sack; 1307 return in_sack;
1311} 1308}
1312 1309
1313static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, 1310/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
1314 struct tcp_sacktag_state *state, 1311static u8 tcp_sacktag_one(struct sock *sk,
1312 struct tcp_sacktag_state *state, u8 sacked,
1313 u32 start_seq, u32 end_seq,
1315 int dup_sack, int pcount) 1314 int dup_sack, int pcount)
1316{ 1315{
1317 struct tcp_sock *tp = tcp_sk(sk); 1316 struct tcp_sock *tp = tcp_sk(sk);
1318 u8 sacked = TCP_SKB_CB(skb)->sacked;
1319 int fack_count = state->fack_count; 1317 int fack_count = state->fack_count;
1320 1318
1321 /* Account D-SACK for retransmitted packet. */ 1319 /* Account D-SACK for retransmitted packet. */
1322 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1320 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1323 if (tp->undo_marker && tp->undo_retrans && 1321 if (tp->undo_marker && tp->undo_retrans &&
1324 after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) 1322 after(end_seq, tp->undo_marker))
1325 tp->undo_retrans--; 1323 tp->undo_retrans--;
1326 if (sacked & TCPCB_SACKED_ACKED) 1324 if (sacked & TCPCB_SACKED_ACKED)
1327 state->reord = min(fack_count, state->reord); 1325 state->reord = min(fack_count, state->reord);
1328 } 1326 }
1329 1327
1330 /* Nothing to do; acked frame is about to be dropped (was ACKed). */ 1328 /* Nothing to do; acked frame is about to be dropped (was ACKed). */
1331 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) 1329 if (!after(end_seq, tp->snd_una))
1332 return sacked; 1330 return sacked;
1333 1331
1334 if (!(sacked & TCPCB_SACKED_ACKED)) { 1332 if (!(sacked & TCPCB_SACKED_ACKED)) {
@@ -1347,13 +1345,13 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
1347 /* New sack for not retransmitted frame, 1345 /* New sack for not retransmitted frame,
1348 * which was in hole. It is reordering. 1346 * which was in hole. It is reordering.
1349 */ 1347 */
1350 if (before(TCP_SKB_CB(skb)->seq, 1348 if (before(start_seq,
1351 tcp_highest_sack_seq(tp))) 1349 tcp_highest_sack_seq(tp)))
1352 state->reord = min(fack_count, 1350 state->reord = min(fack_count,
1353 state->reord); 1351 state->reord);
1354 1352
1355 /* SACK enhanced F-RTO (RFC4138; Appendix B) */ 1353 /* SACK enhanced F-RTO (RFC4138; Appendix B) */
1356 if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) 1354 if (!after(end_seq, tp->frto_highmark))
1357 state->flag |= FLAG_ONLY_ORIG_SACKED; 1355 state->flag |= FLAG_ONLY_ORIG_SACKED;
1358 } 1356 }
1359 1357
@@ -1371,8 +1369,7 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
1371 1369
1372 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1370 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1373 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && 1371 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
1374 before(TCP_SKB_CB(skb)->seq, 1372 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1375 TCP_SKB_CB(tp->lost_skb_hint)->seq))
1376 tp->lost_cnt_hint += pcount; 1373 tp->lost_cnt_hint += pcount;
1377 1374
1378 if (fack_count > tp->fackets_out) 1375 if (fack_count > tp->fackets_out)
@@ -1391,6 +1388,9 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
1391 return sacked; 1388 return sacked;
1392} 1389}
1393 1390
1391/* Shift newly-SACKed bytes from this skb to the immediately previous
1392 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
1393 */
1394static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, 1394static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1395 struct tcp_sacktag_state *state, 1395 struct tcp_sacktag_state *state,
1396 unsigned int pcount, int shifted, int mss, 1396 unsigned int pcount, int shifted, int mss,
@@ -1398,9 +1398,20 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1398{ 1398{
1399 struct tcp_sock *tp = tcp_sk(sk); 1399 struct tcp_sock *tp = tcp_sk(sk);
1400 struct sk_buff *prev = tcp_write_queue_prev(sk, skb); 1400 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
1401 u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
1402 u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
1401 1403
1402 BUG_ON(!pcount); 1404 BUG_ON(!pcount);
1403 1405
1406 /* Adjust counters and hints for the newly sacked sequence
1407 * range but discard the return value since prev is already
1408 * marked. We must tag the range first because the seq
1409 * advancement below implicitly advances
1410 * tcp_highest_sack_seq() when skb is highest_sack.
1411 */
1412 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1413 start_seq, end_seq, dup_sack, pcount);
1414
1404 if (skb == tp->lost_skb_hint) 1415 if (skb == tp->lost_skb_hint)
1405 tp->lost_cnt_hint += pcount; 1416 tp->lost_cnt_hint += pcount;
1406 1417
@@ -1427,9 +1438,6 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1427 skb_shinfo(skb)->gso_type = 0; 1438 skb_shinfo(skb)->gso_type = 0;
1428 } 1439 }
1429 1440
1430 /* We discard results */
1431 tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
1432
1433 /* Difference in this won't matter, both ACKed by the same cumul. ACK */ 1441 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1434 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); 1442 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1435 1443
@@ -1577,6 +1585,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1577 } 1585 }
1578 } 1586 }
1579 1587
1588 /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
1589 if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1590 goto fallback;
1591
1580 if (!skb_shift(prev, skb, len)) 1592 if (!skb_shift(prev, skb, len))
1581 goto fallback; 1593 goto fallback;
1582 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) 1594 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
@@ -1667,10 +1679,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1667 break; 1679 break;
1668 1680
1669 if (in_sack) { 1681 if (in_sack) {
1670 TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, 1682 TCP_SKB_CB(skb)->sacked =
1671 state, 1683 tcp_sacktag_one(sk,
1672 dup_sack, 1684 state,
1673 tcp_skb_pcount(skb)); 1685 TCP_SKB_CB(skb)->sacked,
1686 TCP_SKB_CB(skb)->seq,
1687 TCP_SKB_CB(skb)->end_seq,
1688 dup_sack,
1689 tcp_skb_pcount(skb));
1674 1690
1675 if (!before(TCP_SKB_CB(skb)->seq, 1691 if (!before(TCP_SKB_CB(skb)->seq,
1676 tcp_highest_sack_seq(tp))) 1692 tcp_highest_sack_seq(tp)))
@@ -1844,10 +1860,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1844 if (found_dup_sack && ((i + 1) == first_sack_index)) 1860 if (found_dup_sack && ((i + 1) == first_sack_index))
1845 next_dup = &sp[i + 1]; 1861 next_dup = &sp[i + 1];
1846 1862
1847 /* Event "B" in the comment above. */
1848 if (after(end_seq, tp->high_seq))
1849 state.flag |= FLAG_DATA_LOST;
1850
1851 /* Skip too early cached blocks */ 1863 /* Skip too early cached blocks */
1852 while (tcp_sack_cache_ok(tp, cache) && 1864 while (tcp_sack_cache_ok(tp, cache) &&
1853 !before(start_seq, cache->end_seq)) 1865 !before(start_seq, cache->end_seq))
@@ -2515,8 +2527,11 @@ static void tcp_timeout_skbs(struct sock *sk)
2515 tcp_verify_left_out(tp); 2527 tcp_verify_left_out(tp);
2516} 2528}
2517 2529
2518/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2530/* Detect loss in event "A" above by marking head of queue up as lost.
2519 * is against sacked "cnt", otherwise it's against facked "cnt" 2531 * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
2532 * are considered lost. For RFC3517 SACK, a segment is considered lost if it
 2533 * has at least tp->reordering SACKed segments above it; "packets" refers to
2534 * the maximum SACKed segments to pass before reaching this limit.
2520 */ 2535 */
2521static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) 2536static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2522{ 2537{
@@ -2525,6 +2540,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2525 int cnt, oldcnt; 2540 int cnt, oldcnt;
2526 int err; 2541 int err;
2527 unsigned int mss; 2542 unsigned int mss;
2543 /* Use SACK to deduce losses of new sequences sent during recovery */
2544 const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2528 2545
2529 WARN_ON(packets > tp->packets_out); 2546 WARN_ON(packets > tp->packets_out);
2530 if (tp->lost_skb_hint) { 2547 if (tp->lost_skb_hint) {
@@ -2546,7 +2563,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2546 tp->lost_skb_hint = skb; 2563 tp->lost_skb_hint = skb;
2547 tp->lost_cnt_hint = cnt; 2564 tp->lost_cnt_hint = cnt;
2548 2565
2549 if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) 2566 if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
2550 break; 2567 break;
2551 2568
2552 oldcnt = cnt; 2569 oldcnt = cnt;
@@ -2556,6 +2573,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2556 2573
2557 if (cnt > packets) { 2574 if (cnt > packets) {
2558 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || 2575 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2576 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2559 (oldcnt >= packets)) 2577 (oldcnt >= packets))
2560 break; 2578 break;
2561 2579
@@ -2663,7 +2681,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2663 tp->snd_ssthresh, tp->prior_ssthresh, 2681 tp->snd_ssthresh, tp->prior_ssthresh,
2664 tp->packets_out); 2682 tp->packets_out);
2665 } 2683 }
2666#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 2684#if IS_ENABLED(CONFIG_IPV6)
2667 else if (sk->sk_family == AF_INET6) { 2685 else if (sk->sk_family == AF_INET6) {
2668 struct ipv6_pinfo *np = inet6_sk(sk); 2686 struct ipv6_pinfo *np = inet6_sk(sk);
2669 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", 2687 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
@@ -2858,7 +2876,7 @@ static void tcp_try_keep_open(struct sock *sk)
2858 struct tcp_sock *tp = tcp_sk(sk); 2876 struct tcp_sock *tp = tcp_sk(sk);
2859 int state = TCP_CA_Open; 2877 int state = TCP_CA_Open;
2860 2878
2861 if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) 2879 if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
2862 state = TCP_CA_Disorder; 2880 state = TCP_CA_Disorder;
2863 2881
2864 if (inet_csk(sk)->icsk_ca_state != state) { 2882 if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2881,7 +2899,8 @@ static void tcp_try_to_open(struct sock *sk, int flag)
2881 2899
2882 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2900 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2883 tcp_try_keep_open(sk); 2901 tcp_try_keep_open(sk);
2884 tcp_moderate_cwnd(tp); 2902 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2903 tcp_moderate_cwnd(tp);
2885 } else { 2904 } else {
2886 tcp_cwnd_down(sk, flag); 2905 tcp_cwnd_down(sk, flag);
2887 } 2906 }
@@ -3009,11 +3028,11 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
3009 * tcp_xmit_retransmit_queue(). 3028 * tcp_xmit_retransmit_queue().
3010 */ 3029 */
3011static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, 3030static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3012 int newly_acked_sacked, int flag) 3031 int newly_acked_sacked, bool is_dupack,
3032 int flag)
3013{ 3033{
3014 struct inet_connection_sock *icsk = inet_csk(sk); 3034 struct inet_connection_sock *icsk = inet_csk(sk);
3015 struct tcp_sock *tp = tcp_sk(sk); 3035 struct tcp_sock *tp = tcp_sk(sk);
3016 int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3017 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && 3036 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
3018 (tcp_fackets_out(tp) > tp->reordering)); 3037 (tcp_fackets_out(tp) > tp->reordering));
3019 int fast_rexmit = 0, mib_idx; 3038 int fast_rexmit = 0, mib_idx;
@@ -3032,19 +3051,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3032 if (tcp_check_sack_reneging(sk, flag)) 3051 if (tcp_check_sack_reneging(sk, flag))
3033 return; 3052 return;
3034 3053
3035 /* C. Process data loss notification, provided it is valid. */ 3054 /* C. Check consistency of the current state. */
3036 if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
3037 before(tp->snd_una, tp->high_seq) &&
3038 icsk->icsk_ca_state != TCP_CA_Open &&
3039 tp->fackets_out > tp->reordering) {
3040 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
3041 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
3042 }
3043
3044 /* D. Check consistency of the current state. */
3045 tcp_verify_left_out(tp); 3055 tcp_verify_left_out(tp);
3046 3056
3047 /* E. Check state exit conditions. State can be terminated 3057 /* D. Check state exit conditions. State can be terminated
3048 * when high_seq is ACKed. */ 3058 * when high_seq is ACKed. */
3049 if (icsk->icsk_ca_state == TCP_CA_Open) { 3059 if (icsk->icsk_ca_state == TCP_CA_Open) {
3050 WARN_ON(tp->retrans_out != 0); 3060 WARN_ON(tp->retrans_out != 0);
@@ -3066,17 +3076,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3066 } 3076 }
3067 break; 3077 break;
3068 3078
3069 case TCP_CA_Disorder:
3070 tcp_try_undo_dsack(sk);
3071 if (!tp->undo_marker ||
3072 /* For SACK case do not Open to allow to undo
3073 * catching for all duplicate ACKs. */
3074 tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
3075 tp->undo_marker = 0;
3076 tcp_set_ca_state(sk, TCP_CA_Open);
3077 }
3078 break;
3079
3080 case TCP_CA_Recovery: 3079 case TCP_CA_Recovery:
3081 if (tcp_is_reno(tp)) 3080 if (tcp_is_reno(tp))
3082 tcp_reset_reno_sack(tp); 3081 tcp_reset_reno_sack(tp);
@@ -3087,7 +3086,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3087 } 3086 }
3088 } 3087 }
3089 3088
3090 /* F. Process state. */ 3089 /* E. Process state. */
3091 switch (icsk->icsk_ca_state) { 3090 switch (icsk->icsk_ca_state) {
3092 case TCP_CA_Recovery: 3091 case TCP_CA_Recovery:
3093 if (!(flag & FLAG_SND_UNA_ADVANCED)) { 3092 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
@@ -3117,7 +3116,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3117 tcp_add_reno_sack(sk); 3116 tcp_add_reno_sack(sk);
3118 } 3117 }
3119 3118
3120 if (icsk->icsk_ca_state == TCP_CA_Disorder) 3119 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
3121 tcp_try_undo_dsack(sk); 3120 tcp_try_undo_dsack(sk);
3122 3121
3123 if (!tcp_time_to_recover(sk)) { 3122 if (!tcp_time_to_recover(sk)) {
@@ -3681,10 +3680,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3681 u32 prior_snd_una = tp->snd_una; 3680 u32 prior_snd_una = tp->snd_una;
3682 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3681 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3683 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3682 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3683 bool is_dupack = false;
3684 u32 prior_in_flight; 3684 u32 prior_in_flight;
3685 u32 prior_fackets; 3685 u32 prior_fackets;
3686 int prior_packets; 3686 int prior_packets;
3687 int prior_sacked = tp->sacked_out; 3687 int prior_sacked = tp->sacked_out;
3688 int pkts_acked = 0;
3688 int newly_acked_sacked = 0; 3689 int newly_acked_sacked = 0;
3689 int frto_cwnd = 0; 3690 int frto_cwnd = 0;
3690 3691
@@ -3757,6 +3758,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3757 /* See if we can take anything off of the retransmit queue. */ 3758 /* See if we can take anything off of the retransmit queue. */
3758 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3759 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3759 3760
3761 pkts_acked = prior_packets - tp->packets_out;
3760 newly_acked_sacked = (prior_packets - prior_sacked) - 3762 newly_acked_sacked = (prior_packets - prior_sacked) -
3761 (tp->packets_out - tp->sacked_out); 3763 (tp->packets_out - tp->sacked_out);
3762 3764
@@ -3771,8 +3773,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3771 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && 3773 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
3772 tcp_may_raise_cwnd(sk, flag)) 3774 tcp_may_raise_cwnd(sk, flag))
3773 tcp_cong_avoid(sk, ack, prior_in_flight); 3775 tcp_cong_avoid(sk, ack, prior_in_flight);
3774 tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, 3776 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3775 newly_acked_sacked, flag); 3777 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
3778 is_dupack, flag);
3776 } else { 3779 } else {
3777 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) 3780 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
3778 tcp_cong_avoid(sk, ack, prior_in_flight); 3781 tcp_cong_avoid(sk, ack, prior_in_flight);
@@ -3784,6 +3787,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3784 return 1; 3787 return 1;
3785 3788
3786no_queue: 3789no_queue:
3790 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3791 if (flag & FLAG_DSACKING_ACK)
3792 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
3793 is_dupack, flag);
3787 /* If this ack opens up a zero window, clear backoff. It was 3794 /* If this ack opens up a zero window, clear backoff. It was
3788 * being used to time the probes, and is probably far higher than 3795 * being used to time the probes, and is probably far higher than
3789 * it needs to be for normal retransmission. 3796 * it needs to be for normal retransmission.
@@ -3797,10 +3804,14 @@ invalid_ack:
3797 return -1; 3804 return -1;
3798 3805
3799old_ack: 3806old_ack:
3807 /* If data was SACKed, tag it and see if we should send more data.
3808 * If data was DSACKed, see if we can undo a cwnd reduction.
3809 */
3800 if (TCP_SKB_CB(skb)->sacked) { 3810 if (TCP_SKB_CB(skb)->sacked) {
3801 tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3811 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
3802 if (icsk->icsk_ca_state == TCP_CA_Open) 3812 newly_acked_sacked = tp->sacked_out - prior_sacked;
3803 tcp_try_keep_open(sk); 3813 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
3814 is_dupack, flag);
3804 } 3815 }
3805 3816
3806 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); 3817 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3876,7 +3887,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3876 case TCPOPT_SACK_PERM: 3887 case TCPOPT_SACK_PERM:
3877 if (opsize == TCPOLEN_SACK_PERM && th->syn && 3888 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3878 !estab && sysctl_tcp_sack) { 3889 !estab && sysctl_tcp_sack) {
3879 opt_rx->sack_ok = 1; 3890 opt_rx->sack_ok = TCP_SACK_SEEN;
3880 tcp_sack_reset(opt_rx); 3891 tcp_sack_reset(opt_rx);
3881 } 3892 }
3882 break; 3893 break;
@@ -4864,7 +4875,7 @@ static int tcp_prune_queue(struct sock *sk)
4864 4875
4865 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 4876 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4866 tcp_clamp_window(sk); 4877 tcp_clamp_window(sk);
4867 else if (tcp_memory_pressure) 4878 else if (sk_under_memory_pressure(sk))
4868 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); 4879 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
4869 4880
4870 tcp_collapse_ofo_queue(sk); 4881 tcp_collapse_ofo_queue(sk);
@@ -4930,11 +4941,11 @@ static int tcp_should_expand_sndbuf(const struct sock *sk)
4930 return 0; 4941 return 0;
4931 4942
4932 /* If we are under global TCP memory pressure, do not expand. */ 4943 /* If we are under global TCP memory pressure, do not expand. */
4933 if (tcp_memory_pressure) 4944 if (sk_under_memory_pressure(sk))
4934 return 0; 4945 return 0;
4935 4946
4936 /* If we are under soft global TCP memory pressure, do not expand. */ 4947 /* If we are under soft global TCP memory pressure, do not expand. */
4937 if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) 4948 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
4938 return 0; 4949 return 0;
4939 4950
4940 /* If we filled the congestion window, do not expand. */ 4951 /* If we filled the congestion window, do not expand. */
@@ -5809,6 +5820,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5809 goto discard; 5820 goto discard;
5810 5821
5811 if (th->syn) { 5822 if (th->syn) {
5823 if (th->fin)
5824 goto discard;
5812 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 5825 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
5813 return 1; 5826 return 1;
5814 5827
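
Editor's note: the tcp_input.c changes group into three themes. First, the sack_ok magic numbers become named bits. Second, the unreliable "event B" heuristic (FLAG_DATA_LOST, counted by the now-removed TCPLoss MIB) is dropped; loss above high_seq is instead detected in tcp_mark_head_lost() by extending the scan to snd_nxt for SACK connections and skipping already-SACKed segments. Third, tcp_sacktag_one() takes the newly SACKed range explicitly, so tcp_shifted_skb() can adjust counters and hints from the moving skb's own sequence numbers before the shift advances tcp_highest_sack_seq(). The bit values, as this series defines them (sketch for reference; hedged, copied from memory of the interface the hunks use):

    /* sack_ok bits replacing the 1/2/4 literals used previously. */
    #define TCP_SACK_SEEN     (1 << 0)  /* peer is SACK capable       */
    #define TCP_FACK_ENABLED  (1 << 1)  /* FACK enabled locally       */
    #define TCP_DSACK_SEEN    (1 << 2)  /* D-SACK received from peer  */
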
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a9db4b1a2215..fd54c5f8a255 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -73,6 +73,7 @@
73#include <net/xfrm.h> 73#include <net/xfrm.h>
74#include <net/netdma.h> 74#include <net/netdma.h>
75#include <net/secure_seq.h> 75#include <net/secure_seq.h>
76#include <net/tcp_memcontrol.h>
76 77
77#include <linux/inet.h> 78#include <linux/inet.h>
78#include <linux/ipv6.h> 79#include <linux/ipv6.h>
@@ -630,7 +631,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
630 arg.iov[0].iov_len = sizeof(rep.th); 631 arg.iov[0].iov_len = sizeof(rep.th);
631 632
632#ifdef CONFIG_TCP_MD5SIG 633#ifdef CONFIG_TCP_MD5SIG
633 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL; 634 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
634 if (key) { 635 if (key) {
635 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 636 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
636 (TCPOPT_NOP << 16) | 637 (TCPOPT_NOP << 16) |
@@ -650,6 +651,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
650 arg.iov[0].iov_len, IPPROTO_TCP, 0); 651 arg.iov[0].iov_len, IPPROTO_TCP, 0);
651 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 652 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
652 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; 653 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
654 /* When socket is gone, all binding information is lost.
655 * routing might fail in this case. using iif for oif to
656 * make sure we can deliver it
657 */
658 arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
653 659
654 net = dev_net(skb_dst(skb)->dev); 660 net = dev_net(skb_dst(skb)->dev);
655 arg.tos = ip_hdr(skb)->tos; 661 arg.tos = ip_hdr(skb)->tos;
@@ -1460,9 +1466,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1460 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1466 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1461 newinet->inet_id = newtp->write_seq ^ jiffies; 1467 newinet->inet_id = newtp->write_seq ^ jiffies;
1462 1468
1463 if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) 1469 if (!dst) {
1464 goto put_and_exit; 1470 dst = inet_csk_route_child_sock(sk, newsk, req);
1465 1471 if (!dst)
1472 goto put_and_exit;
1473 } else {
1474 /* syncookie case : see end of cookie_v4_check() */
1475 }
1466 sk_setup_caps(newsk, dst); 1476 sk_setup_caps(newsk, dst);
1467 1477
1468 tcp_mtup_init(newsk); 1478 tcp_mtup_init(newsk);
@@ -1511,6 +1521,7 @@ exit:
1511 return NULL; 1521 return NULL;
1512put_and_exit: 1522put_and_exit:
1513 tcp_clear_xmit_timers(newsk); 1523 tcp_clear_xmit_timers(newsk);
1524 tcp_cleanup_congestion_control(newsk);
1514 bh_unlock_sock(newsk); 1525 bh_unlock_sock(newsk);
1515 sock_put(newsk); 1526 sock_put(newsk);
1516 goto exit; 1527 goto exit;
@@ -1916,7 +1927,8 @@ static int tcp_v4_init_sock(struct sock *sk)
1916 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1927 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1917 1928
1918 local_bh_disable(); 1929 local_bh_disable();
1919 percpu_counter_inc(&tcp_sockets_allocated); 1930 sock_update_memcg(sk);
1931 sk_sockets_allocated_inc(sk);
1920 local_bh_enable(); 1932 local_bh_enable();
1921 1933
1922 return 0; 1934 return 0;
@@ -1972,7 +1984,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
1972 tp->cookie_values = NULL; 1984 tp->cookie_values = NULL;
1973 } 1985 }
1974 1986
1975 percpu_counter_dec(&tcp_sockets_allocated); 1987 sk_sockets_allocated_dec(sk);
1988 sock_release_memcg(sk);
1976} 1989}
1977EXPORT_SYMBOL(tcp_v4_destroy_sock); 1990EXPORT_SYMBOL(tcp_v4_destroy_sock);
1978 1991
@@ -2619,7 +2632,6 @@ struct proto tcp_prot = {
2619 .orphan_count = &tcp_orphan_count, 2632 .orphan_count = &tcp_orphan_count,
2620 .memory_allocated = &tcp_memory_allocated, 2633 .memory_allocated = &tcp_memory_allocated,
2621 .memory_pressure = &tcp_memory_pressure, 2634 .memory_pressure = &tcp_memory_pressure,
2622 .sysctl_mem = sysctl_tcp_mem,
2623 .sysctl_wmem = sysctl_tcp_wmem, 2635 .sysctl_wmem = sysctl_tcp_wmem,
2624 .sysctl_rmem = sysctl_tcp_rmem, 2636 .sysctl_rmem = sysctl_tcp_rmem,
2625 .max_header = MAX_TCP_HEADER, 2637 .max_header = MAX_TCP_HEADER,
@@ -2633,10 +2645,14 @@ struct proto tcp_prot = {
2633 .compat_setsockopt = compat_tcp_setsockopt, 2645 .compat_setsockopt = compat_tcp_setsockopt,
2634 .compat_getsockopt = compat_tcp_getsockopt, 2646 .compat_getsockopt = compat_tcp_getsockopt,
2635#endif 2647#endif
2648#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2649 .init_cgroup = tcp_init_cgroup,
2650 .destroy_cgroup = tcp_destroy_cgroup,
2651 .proto_cgroup = tcp_proto_cgroup,
2652#endif
2636}; 2653};
2637EXPORT_SYMBOL(tcp_prot); 2654EXPORT_SYMBOL(tcp_prot);
2638 2655
2639
2640static int __net_init tcp_sk_init(struct net *net) 2656static int __net_init tcp_sk_init(struct net *net)
2641{ 2657{
2642 return inet_ctl_sock_create(&net->ipv4.tcp_sock, 2658 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
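[Annotation] The tcp_memory_pressure and tcp_memory_allocated conversions in this file and in tcp_input.c route every check through per-socket wrappers, so a memcg-limited socket consults its cgroup's counters instead of the protocol-global ones. A minimal sketch of that dispatch; the field names are assumptions read off the hunks, not a verbatim copy of include/net/sock.h, and the real wrapper also gates on whether memcg socket accounting is compiled in and active:

#include <stdio.h>

struct cg_proto_sketch { int *memory_pressure; };
struct proto_sketch    { int *memory_pressure; };
struct sock_sketch {
	struct proto_sketch    *sk_prot;
	struct cg_proto_sketch *sk_cgrp;  /* NULL unless memcg-limited */
};

static int sk_under_memory_pressure_sketch(const struct sock_sketch *sk)
{
	if (!sk->sk_prot->memory_pressure)
		return 0;                        /* protocol never tracks it */
	if (sk->sk_cgrp)
		return *sk->sk_cgrp->memory_pressure;
	return *sk->sk_prot->memory_pressure;
}

int main(void)
{
	int proto_flag = 0, cg_flag = 1;
	struct proto_sketch tcp = { &proto_flag };
	struct cg_proto_sketch cg = { &cg_flag };
	struct sock_sketch sk = { &tcp, &cg };

	printf("%d\n", sk_under_memory_pressure_sketch(&sk)); /* 1: cgroup wins */
	sk.sk_cgrp = 0;
	printf("%d\n", sk_under_memory_pressure_sketch(&sk)); /* 0: global flag */
	return 0;
}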
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
new file mode 100644
index 000000000000..49978788a9dc
--- /dev/null
+++ b/net/ipv4/tcp_memcontrol.c
@@ -0,0 +1,272 @@
1#include <net/tcp.h>
2#include <net/tcp_memcontrol.h>
3#include <net/sock.h>
4#include <net/ip.h>
5#include <linux/nsproxy.h>
6#include <linux/memcontrol.h>
7#include <linux/module.h>
8
9static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft);
10static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
11 const char *buffer);
12static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event);
13
14static struct cftype tcp_files[] = {
15 {
16 .name = "kmem.tcp.limit_in_bytes",
17 .write_string = tcp_cgroup_write,
18 .read_u64 = tcp_cgroup_read,
19 .private = RES_LIMIT,
20 },
21 {
22 .name = "kmem.tcp.usage_in_bytes",
23 .read_u64 = tcp_cgroup_read,
24 .private = RES_USAGE,
25 },
26 {
27 .name = "kmem.tcp.failcnt",
28 .private = RES_FAILCNT,
29 .trigger = tcp_cgroup_reset,
30 .read_u64 = tcp_cgroup_read,
31 },
32 {
33 .name = "kmem.tcp.max_usage_in_bytes",
34 .private = RES_MAX_USAGE,
35 .trigger = tcp_cgroup_reset,
36 .read_u64 = tcp_cgroup_read,
37 },
38};
39
40static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
41{
42 return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
43}
44
45static void memcg_tcp_enter_memory_pressure(struct sock *sk)
46{
47 if (sk->sk_cgrp->memory_pressure)
48 *sk->sk_cgrp->memory_pressure = 1;
49}
50EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
51
52int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
53{
54 /*
55 * The root cgroup does not use res_counters, but rather,
56 * rely on the data already collected by the network
57 * subsystem
58 */
59 struct res_counter *res_parent = NULL;
60 struct cg_proto *cg_proto, *parent_cg;
61 struct tcp_memcontrol *tcp;
62 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
63 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
64 struct net *net = current->nsproxy->net_ns;
65
66 cg_proto = tcp_prot.proto_cgroup(memcg);
67 if (!cg_proto)
68 goto create_files;
69
70 tcp = tcp_from_cgproto(cg_proto);
71
72 tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
73 tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
74 tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
75 tcp->tcp_memory_pressure = 0;
76
77 parent_cg = tcp_prot.proto_cgroup(parent);
78 if (parent_cg)
79 res_parent = parent_cg->memory_allocated;
80
81 res_counter_init(&tcp->tcp_memory_allocated, res_parent);
82 percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
83
84 cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
85 cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
86 cg_proto->sysctl_mem = tcp->tcp_prot_mem;
87 cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
88 cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
89 cg_proto->memcg = memcg;
90
91create_files:
92 return cgroup_add_files(cgrp, ss, tcp_files,
93 ARRAY_SIZE(tcp_files));
94}
95EXPORT_SYMBOL(tcp_init_cgroup);
96
97void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
98{
99 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
100 struct cg_proto *cg_proto;
101 struct tcp_memcontrol *tcp;
102 u64 val;
103
104 cg_proto = tcp_prot.proto_cgroup(memcg);
105 if (!cg_proto)
106 return;
107
108 tcp = tcp_from_cgproto(cg_proto);
109 percpu_counter_destroy(&tcp->tcp_sockets_allocated);
110
111 val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
112
113 if (val != RESOURCE_MAX)
114 jump_label_dec(&memcg_socket_limit_enabled);
115}
116EXPORT_SYMBOL(tcp_destroy_cgroup);
117
118static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
119{
120 struct net *net = current->nsproxy->net_ns;
121 struct tcp_memcontrol *tcp;
122 struct cg_proto *cg_proto;
123 u64 old_lim;
124 int i;
125 int ret;
126
127 cg_proto = tcp_prot.proto_cgroup(memcg);
128 if (!cg_proto)
129 return -EINVAL;
130
131 if (val > RESOURCE_MAX)
132 val = RESOURCE_MAX;
133
134 tcp = tcp_from_cgproto(cg_proto);
135
136 old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
137 ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val);
138 if (ret)
139 return ret;
140
141 for (i = 0; i < 3; i++)
142 tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
143 net->ipv4.sysctl_tcp_mem[i]);
144
145 if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
146 jump_label_dec(&memcg_socket_limit_enabled);
147 else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
148 jump_label_inc(&memcg_socket_limit_enabled);
149
150 return 0;
151}
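[Annotation] tcp_update_limit() toggles memcg_socket_limit_enabled only on transitions between an unlimited and a finite limit, so the extra fast-path branch stays patched in exactly while at least one cgroup holds a finite limit. A userspace toy model of that accounting, with a plain counter standing in for the jump label and ULLONG_MAX standing in for the kernel's RESOURCE_MAX sentinel:

#include <stdio.h>
#include <limits.h>

#define NO_LIMIT ULLONG_MAX     /* stand-in for RESOURCE_MAX */

static int socket_limit_key;    /* stand-in for memcg_socket_limit_enabled */

static void update_limit(unsigned long long *lim, unsigned long long val)
{
	if (val == NO_LIMIT && *lim != NO_LIMIT)
		socket_limit_key--;     /* this group's finite limit removed */
	else if (*lim == NO_LIMIT && val != NO_LIMIT)
		socket_limit_key++;     /* this group's first finite limit set */
	*lim = val;
}

int main(void)
{
	unsigned long long a = NO_LIMIT, b = NO_LIMIT;

	update_limit(&a, 1 << 20);      /* key: 1 */
	update_limit(&b, 1 << 20);      /* key: 2 */
	update_limit(&a, NO_LIMIT);     /* key: 1, fast path still enabled */
	printf("key=%d\n", socket_limit_key);
	return 0;
}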
152
153static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
154 const char *buffer)
155{
156 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
157 unsigned long long val;
158 int ret = 0;
159
160 switch (cft->private) {
161 case RES_LIMIT:
162 /* see memcontrol.c */
163 ret = res_counter_memparse_write_strategy(buffer, &val);
164 if (ret)
165 break;
166 ret = tcp_update_limit(memcg, val);
167 break;
168 default:
169 ret = -EINVAL;
170 break;
171 }
172 return ret;
173}
174
175static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
176{
177 struct tcp_memcontrol *tcp;
178 struct cg_proto *cg_proto;
179
180 cg_proto = tcp_prot.proto_cgroup(memcg);
181 if (!cg_proto)
182 return default_val;
183
184 tcp = tcp_from_cgproto(cg_proto);
185 return res_counter_read_u64(&tcp->tcp_memory_allocated, type);
186}
187
188static u64 tcp_read_usage(struct mem_cgroup *memcg)
189{
190 struct tcp_memcontrol *tcp;
191 struct cg_proto *cg_proto;
192
193 cg_proto = tcp_prot.proto_cgroup(memcg);
194 if (!cg_proto)
195 return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
196
197 tcp = tcp_from_cgproto(cg_proto);
198 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
199}
200
201static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
202{
203 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
204 u64 val;
205
206 switch (cft->private) {
207 case RES_LIMIT:
208 val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX);
209 break;
210 case RES_USAGE:
211 val = tcp_read_usage(memcg);
212 break;
213 case RES_FAILCNT:
214 case RES_MAX_USAGE:
215 val = tcp_read_stat(memcg, cft->private, 0);
216 break;
217 default:
218 BUG();
219 }
220 return val;
221}
222
223static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event)
224{
225 struct mem_cgroup *memcg;
226 struct tcp_memcontrol *tcp;
227 struct cg_proto *cg_proto;
228
229 memcg = mem_cgroup_from_cont(cont);
230 cg_proto = tcp_prot.proto_cgroup(memcg);
231 if (!cg_proto)
232 return 0;
233 tcp = tcp_from_cgproto(cg_proto);
234
235 switch (event) {
236 case RES_MAX_USAGE:
237 res_counter_reset_max(&tcp->tcp_memory_allocated);
238 break;
239 case RES_FAILCNT:
240 res_counter_reset_failcnt(&tcp->tcp_memory_allocated);
241 break;
242 }
243
244 return 0;
245}
246
247unsigned long long tcp_max_memory(const struct mem_cgroup *memcg)
248{
249 struct tcp_memcontrol *tcp;
250 struct cg_proto *cg_proto;
251
252 cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg);
253 if (!cg_proto)
254 return 0;
255
256 tcp = tcp_from_cgproto(cg_proto);
257 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
258}
259
260void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
261{
262 struct tcp_memcontrol *tcp;
263 struct cg_proto *cg_proto;
264
265 cg_proto = tcp_prot.proto_cgroup(memcg);
266 if (!cg_proto)
267 return;
268
269 tcp = tcp_from_cgproto(cg_proto);
270
271 tcp->tcp_prot_mem[idx] = val;
272}
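[Annotation] The cftype table at the top of this file materialises as four memory.kmem.tcp.* files in each memory cgroup's directory, and the limit file accepts memparse-style K/M/G suffixes (see the res_counter_memparse_write_strategy() call above). A hedged userspace example; the /sys/fs/cgroup/memory mount point and the group name "g1" are assumptions for illustration:

#include <stdio.h>

#define GRP "/sys/fs/cgroup/memory/g1/memory."

int main(void)
{
	char buf[64];
	FILE *f;

	f = fopen(GRP "kmem.tcp.limit_in_bytes", "w");
	if (!f)
		return perror("open limit"), 1;
	fputs("16M\n", f);              /* memparse accepts K/M/G suffixes */
	fclose(f);

	f = fopen(GRP "kmem.tcp.usage_in_bytes", "r");
	if (!f)
		return perror("open usage"), 1;
	if (fgets(buf, sizeof(buf), f))
		printf("tcp usage: %s", buf);
	fclose(f);
	return 0;
}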
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 66363b689ad6..550e755747e0 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -336,15 +336,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
336 tcptw->tw_ts_recent = tp->rx_opt.ts_recent; 336 tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
337 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; 337 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
338 338
339#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 339#if IS_ENABLED(CONFIG_IPV6)
340 if (tw->tw_family == PF_INET6) { 340 if (tw->tw_family == PF_INET6) {
341 struct ipv6_pinfo *np = inet6_sk(sk); 341 struct ipv6_pinfo *np = inet6_sk(sk);
342 struct inet6_timewait_sock *tw6; 342 struct inet6_timewait_sock *tw6;
343 343
344 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); 344 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
345 tw6 = inet6_twsk((struct sock *)tw); 345 tw6 = inet6_twsk((struct sock *)tw);
346 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); 346 tw6->tw_v6_daddr = np->daddr;
347 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); 347 tw6->tw_v6_rcv_saddr = np->rcv_saddr;
348 tw->tw_tclass = np->tclass; 348 tw->tw_tclass = np->tclass;
349 tw->tw_ipv6only = np->ipv6only; 349 tw->tw_ipv6only = np->ipv6only;
350 } 350 }
@@ -425,7 +425,7 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
425 */ 425 */
426struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) 426struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
427{ 427{
428 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); 428 struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
429 429
430 if (newsk != NULL) { 430 if (newsk != NULL) {
431 const struct inet_request_sock *ireq = inet_rsk(req); 431 const struct inet_request_sock *ireq = inet_rsk(req);
@@ -495,7 +495,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
495 newtp->frto_counter = 0; 495 newtp->frto_counter = 0;
496 newtp->frto_highmark = 0; 496 newtp->frto_highmark = 0;
497 497
498 newicsk->icsk_ca_ops = &tcp_init_congestion_ops; 498 if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
499 !try_module_get(newicsk->icsk_ca_ops->owner))
500 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
499 501
500 tcp_set_ca_state(newsk, TCP_CA_Open); 502 tcp_set_ca_state(newsk, TCP_CA_Open);
501 tcp_init_xmit_timers(newsk); 503 tcp_init_xmit_timers(newsk);
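[Annotation] The tcp_time_wait() hunk replaces ipv6_addr_copy() with plain struct assignment; assigning one struct to another copies every member, so the helper was redundant. A standalone demo, with a 16-byte layout mirroring struct in6_addr:

#include <stdio.h>
#include <string.h>

struct in6_addr_demo { unsigned char s6_addr[16]; };

int main(void)
{
	struct in6_addr_demo daddr, copy;

	memset(&daddr, 0x2a, sizeof(daddr));
	copy = daddr;   /* what "tw6->tw_v6_daddr = np->daddr" does */
	printf("equal: %d\n", !memcmp(&copy, &daddr, sizeof(daddr)));
	return 0;
}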
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 63170e297540..4ff3b6dc74fc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1093,6 +1093,13 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
1093{ 1093{
1094 int i, k, eat; 1094 int i, k, eat;
1095 1095
1096 eat = min_t(int, len, skb_headlen(skb));
1097 if (eat) {
1098 __skb_pull(skb, eat);
1099 len -= eat;
1100 if (!len)
1101 return;
1102 }
1096 eat = len; 1103 eat = len;
1097 k = 0; 1104 k = 0;
1098 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1105 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
@@ -1124,11 +1131,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1124 if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) 1131 if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
1125 return -ENOMEM; 1132 return -ENOMEM;
1126 1133
1127 /* If len == headlen, we avoid __skb_pull to preserve alignment. */ 1134 __pskb_trim_head(skb, len);
1128 if (unlikely(len < skb_headlen(skb)))
1129 __skb_pull(skb, len);
1130 else
1131 __pskb_trim_head(skb, len - skb_headlen(skb));
1132 1135
1133 TCP_SKB_CB(skb)->seq += len; 1136 TCP_SKB_CB(skb)->seq += len;
1134 skb->ip_summed = CHECKSUM_PARTIAL; 1137 skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1138,11 +1141,9 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1138 sk_mem_uncharge(sk, len); 1141 sk_mem_uncharge(sk, len);
1139 sock_set_flag(sk, SOCK_QUEUE_SHRUNK); 1142 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1140 1143
1141 /* Any change of skb->len requires recalculation of tso 1144 /* Any change of skb->len requires recalculation of tso factor. */
1142 * factor and mss.
1143 */
1144 if (tcp_skb_pcount(skb) > 1) 1145 if (tcp_skb_pcount(skb) > 1)
1145 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); 1146 tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
1146 1147
1147 return 0; 1148 return 0;
1148} 1149}
@@ -1581,7 +1582,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1581 * frame, so if we have space for more than 3 frames 1582 * frame, so if we have space for more than 3 frames
1582 * then send now. 1583 * then send now.
1583 */ 1584 */
1584 if (limit > tcp_max_burst(tp) * tp->mss_cache) 1585 if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
1585 goto send_now; 1586 goto send_now;
1586 } 1587 }
1587 1588
@@ -1919,7 +1920,7 @@ u32 __tcp_select_window(struct sock *sk)
1919 if (free_space < (full_space >> 1)) { 1920 if (free_space < (full_space >> 1)) {
1920 icsk->icsk_ack.quick = 0; 1921 icsk->icsk_ack.quick = 0;
1921 1922
1922 if (tcp_memory_pressure) 1923 if (sk_under_memory_pressure(sk))
1923 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 1924 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
1924 4U * tp->advmss); 1925 4U * tp->advmss);
1925 1926
@@ -2147,7 +2148,15 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2147 */ 2148 */
2148 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2149 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2149 2150
2150 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2151 /* make sure skb->data is aligned on arches that require it */
2152 if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) {
2153 struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
2154 GFP_ATOMIC);
2155 err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2156 -ENOBUFS;
2157 } else {
2158 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2159 }
2151 2160
2152 if (err == 0) { 2161 if (err == 0) {
2153 /* Update global TCP statistics. */ 2162 /* Update global TCP statistics. */
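[Annotation] The tcp_retransmit_skb() hunk copies the skb before transmission when skb->data is not 4-byte aligned on architectures that define a nonzero NET_IP_ALIGN; the test is the usual low-bits pointer mask. A standalone demo (GCC/Clang, for the aligned attribute):

#include <stdio.h>
#include <stdint.h>

static int misaligned4(const void *p)
{
	return (uintptr_t)p & 3;        /* nonzero => not 4-byte aligned */
}

int main(void)
{
	char buf[8] __attribute__((aligned(4)));

	printf("%d %d\n", misaligned4(buf), misaligned4(buf + 1));
	/* prints: 0 1 */
	return 0;
}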
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 2e0f0af76c19..cd2e0723266d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -77,10 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
77 if (sk->sk_err_soft) 77 if (sk->sk_err_soft)
78 shift++; 78 shift++;
79 79
80 if (tcp_too_many_orphans(sk, shift)) { 80 if (tcp_check_oom(sk, shift)) {
81 if (net_ratelimit())
82 printk(KERN_INFO "Out of socket memory\n");
83
84 /* Catch exceptional cases, when connection requires reset. 81 /* Catch exceptional cases, when connection requires reset.
85 * 1. Last segment was sent recently. */ 82 * 1. Last segment was sent recently. */
86 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || 83 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
@@ -171,13 +168,13 @@ static int tcp_write_timeout(struct sock *sk)
171{ 168{
172 struct inet_connection_sock *icsk = inet_csk(sk); 169 struct inet_connection_sock *icsk = inet_csk(sk);
173 int retry_until; 170 int retry_until;
174 bool do_reset, syn_set = 0; 171 bool do_reset, syn_set = false;
175 172
176 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 173 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
177 if (icsk->icsk_retransmits) 174 if (icsk->icsk_retransmits)
178 dst_negative_advice(sk); 175 dst_negative_advice(sk);
179 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 176 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
180 syn_set = 1; 177 syn_set = true;
181 } else { 178 } else {
182 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { 179 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
183 /* Black hole detection */ 180 /* Black hole detection */
@@ -261,7 +258,7 @@ static void tcp_delack_timer(unsigned long data)
261 } 258 }
262 259
263out: 260out:
264 if (tcp_memory_pressure) 261 if (sk_under_memory_pressure(sk))
265 sk_mem_reclaim(sk); 262 sk_mem_reclaim(sk);
266out_unlock: 263out_unlock:
267 bh_unlock_sock(sk); 264 bh_unlock_sock(sk);
@@ -340,7 +337,7 @@ void tcp_retransmit_timer(struct sock *sk)
340 &inet->inet_daddr, ntohs(inet->inet_dport), 337 &inet->inet_daddr, ntohs(inet->inet_dport),
341 inet->inet_num, tp->snd_una, tp->snd_nxt); 338 inet->inet_num, tp->snd_una, tp->snd_nxt);
342 } 339 }
343#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 340#if IS_ENABLED(CONFIG_IPV6)
344 else if (sk->sk_family == AF_INET6) { 341 else if (sk->sk_family == AF_INET6) {
345 struct ipv6_pinfo *np = inet6_sk(sk); 342 struct ipv6_pinfo *np = inet6_sk(sk);
346 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 343 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
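[Annotation] The retry_until line in tcp_write_timeout() above relies on GCC's binary "?:" extension: "a ?: b" evaluates a once and yields it when nonzero, else b, which is how a per-socket setting of 0 falls back to the sysctl default. A short demo (requires GCC or Clang):

#include <stdio.h>

int main(void)
{
	int per_socket = 0;             /* 0 means "not set on this socket" */
	int global_default = 5;         /* sysctl fallback */
	int retries = per_socket ?: global_default;

	printf("%d\n", retries);        /* prints 5 */
	return 0;
}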
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index ac3b3ee4b07c..01775983b997 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -105,7 +105,7 @@ drop:
105 return 0; 105 return 0;
106} 106}
107 107
108#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 108#if IS_ENABLED(CONFIG_IPV6)
109static int tunnel64_rcv(struct sk_buff *skb) 109static int tunnel64_rcv(struct sk_buff *skb)
110{ 110{
111 struct xfrm_tunnel *handler; 111 struct xfrm_tunnel *handler;
@@ -134,7 +134,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
134 break; 134 break;
135} 135}
136 136
137#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 137#if IS_ENABLED(CONFIG_IPV6)
138static void tunnel64_err(struct sk_buff *skb, u32 info) 138static void tunnel64_err(struct sk_buff *skb, u32 info)
139{ 139{
140 struct xfrm_tunnel *handler; 140 struct xfrm_tunnel *handler;
@@ -152,7 +152,7 @@ static const struct net_protocol tunnel4_protocol = {
152 .netns_ok = 1, 152 .netns_ok = 1,
153}; 153};
154 154
155#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 155#if IS_ENABLED(CONFIG_IPV6)
156static const struct net_protocol tunnel64_protocol = { 156static const struct net_protocol tunnel64_protocol = {
157 .handler = tunnel64_rcv, 157 .handler = tunnel64_rcv,
158 .err_handler = tunnel64_err, 158 .err_handler = tunnel64_err,
@@ -167,7 +167,7 @@ static int __init tunnel4_init(void)
167 printk(KERN_ERR "tunnel4 init: can't add protocol\n"); 167 printk(KERN_ERR "tunnel4 init: can't add protocol\n");
168 return -EAGAIN; 168 return -EAGAIN;
169 } 169 }
170#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 170#if IS_ENABLED(CONFIG_IPV6)
171 if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { 171 if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
172 printk(KERN_ERR "tunnel64 init: can't add protocol\n"); 172 printk(KERN_ERR "tunnel64 init: can't add protocol\n");
173 inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); 173 inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
@@ -179,7 +179,7 @@ static int __init tunnel4_init(void)
179 179
180static void __exit tunnel4_fini(void) 180static void __exit tunnel4_fini(void)
181{ 181{
182#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 182#if IS_ENABLED(CONFIG_IPV6)
183 if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) 183 if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
184 printk(KERN_ERR "tunnel64 close: can't remove protocol\n"); 184 printk(KERN_ERR "tunnel64 close: can't remove protocol\n");
185#endif 185#endif
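[Annotation] Every "#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)" in this file (and in the files below) collapses to IS_ENABLED(CONFIG_IPV6), which is true whether IPv6 is built in (=y) or modular (=m). The kernel's actual definition has varied across releases; the argument-placeholder variant below is one standalone way to get the same semantics, shown here purely as an illustration:

#include <stdio.h>

#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED(option) (__is_defined(option) || __is_defined(option##_MODULE))

#define CONFIG_IPV6_MODULE 1    /* pretend IPv6 was built as a module */

int main(void)
{
	/* 1, because CONFIG_IPV6_MODULE is defined as 1 */
	printf("IS_ENABLED(CONFIG_IPV6) = %d\n", IS_ENABLED(CONFIG_IPV6));
	return 0;
}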
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5a65eeac1d29..5d075b5f70fc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -445,7 +445,7 @@ exact_match:
445/* UDP is nearly always wildcards out the wazoo, it makes no sense to try 445/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
446 * harder than this. -DaveM 446 * harder than this. -DaveM
447 */ 447 */
448static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, 448struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
449 __be16 sport, __be32 daddr, __be16 dport, 449 __be16 sport, __be32 daddr, __be16 dport,
450 int dif, struct udp_table *udptable) 450 int dif, struct udp_table *udptable)
451{ 451{
@@ -512,6 +512,7 @@ begin:
512 rcu_read_unlock(); 512 rcu_read_unlock();
513 return result; 513 return result;
514} 514}
515EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
515 516
516static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, 517static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
517 __be16 sport, __be16 dport, 518 __be16 sport, __be16 dport,
@@ -1358,7 +1359,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1358 if (inet_sk(sk)->inet_daddr) 1359 if (inet_sk(sk)->inet_daddr)
1359 sock_rps_save_rxhash(sk, skb); 1360 sock_rps_save_rxhash(sk, skb);
1360 1361
1361 rc = ip_queue_rcv_skb(sk, skb); 1362 rc = sock_queue_rcv_skb(sk, skb);
1362 if (rc < 0) { 1363 if (rc < 0) {
1363 int is_udplite = IS_UDPLITE(sk); 1364 int is_udplite = IS_UDPLITE(sk);
1364 1365
@@ -1474,6 +1475,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1474 1475
1475 rc = 0; 1476 rc = 0;
1476 1477
1478 ipv4_pktinfo_prepare(skb);
1477 bh_lock_sock(sk); 1479 bh_lock_sock(sk);
1478 if (!sock_owned_by_user(sk)) 1480 if (!sock_owned_by_user(sk))
1479 rc = __udp_queue_rcv_skb(sk, skb); 1481 rc = __udp_queue_rcv_skb(sk, skb);
@@ -2247,7 +2249,8 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2247 return 0; 2249 return 0;
2248} 2250}
2249 2251
2250struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) 2252struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
2253 netdev_features_t features)
2251{ 2254{
2252 struct sk_buff *segs = ERR_PTR(-EINVAL); 2255 struct sk_buff *segs = ERR_PTR(-EINVAL);
2253 unsigned int mss; 2256 unsigned int mss;
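[Annotation] The new ipv4_pktinfo_prepare() call in udp_queue_rcv_skb() stashes the packet-info data that userspace later reads through the long-standing IP_PKTINFO control message. A runnable receiver showing that consumer side, with error handling trimmed for brevity:

#define _GNU_SOURCE
#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int one = 1, fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in sa = { .sin_family = AF_INET,
				  .sin_port = htons(9999),
				  .sin_addr.s_addr = htonl(INADDR_ANY) };
	char data[2048], cbuf[512];
	struct iovec iov = { data, sizeof(data) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = cbuf,
			      .msg_controllen = sizeof(cbuf) };
	struct cmsghdr *c;

	setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&sa, sizeof(sa));

	if (recvmsg(fd, &msg, 0) < 0)
		return perror("recvmsg"), 1;

	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
		if (c->cmsg_level == IPPROTO_IP && c->cmsg_type == IP_PKTINFO) {
			struct in_pktinfo *pi = (struct in_pktinfo *)CMSG_DATA(c);

			printf("ifindex %d dst %s\n", pi->ipi_ifindex,
			       inet_ntoa(pi->ipi_addr));
		}
	return 0;
}

Feed it a datagram, e.g. "echo hi > /dev/udp/127.0.0.1/9999" from bash, and it prints the receiving interface index and destination address.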
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
new file mode 100644
index 000000000000..8a949f19deb6
--- /dev/null
+++ b/net/ipv4/udp_diag.c
@@ -0,0 +1,200 @@
1/*
2 * udp_diag.c Module for monitoring UDP transport protocols sockets.
3 *
4 * Authors: Pavel Emelyanov, <xemul@parallels.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12
13#include <linux/module.h>
14#include <linux/inet_diag.h>
15#include <linux/udp.h>
16#include <net/udp.h>
17#include <net/udplite.h>
18#include <linux/sock_diag.h>
19
20static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
21 struct netlink_callback *cb, struct inet_diag_req_v2 *req,
22 struct nlattr *bc)
23{
24 if (!inet_diag_bc_sk(bc, sk))
25 return 0;
26
27 return inet_sk_diag_fill(sk, NULL, skb, req, NETLINK_CB(cb->skb).pid,
28 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
29}
30
31static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
32 const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
33{
34 int err = -EINVAL;
35 struct sock *sk;
36 struct sk_buff *rep;
37
38 if (req->sdiag_family == AF_INET)
39 sk = __udp4_lib_lookup(&init_net,
40 req->id.idiag_src[0], req->id.idiag_sport,
41 req->id.idiag_dst[0], req->id.idiag_dport,
42 req->id.idiag_if, tbl);
43#if IS_ENABLED(CONFIG_IPV6)
44 else if (req->sdiag_family == AF_INET6)
45 sk = __udp6_lib_lookup(&init_net,
46 (struct in6_addr *)req->id.idiag_src,
47 req->id.idiag_sport,
48 (struct in6_addr *)req->id.idiag_dst,
49 req->id.idiag_dport,
50 req->id.idiag_if, tbl);
51#endif
52 else
53 goto out_nosk;
54
55 err = -ENOENT;
56 if (sk == NULL)
57 goto out_nosk;
58
59 err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
60 if (err)
61 goto out;
62
63 err = -ENOMEM;
64 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
65 sizeof(struct inet_diag_meminfo) +
66 64)), GFP_KERNEL);
67 if (!rep)
68 goto out;
69
70 err = inet_sk_diag_fill(sk, NULL, rep, req,
71 NETLINK_CB(in_skb).pid,
72 nlh->nlmsg_seq, 0, nlh);
73 if (err < 0) {
74 WARN_ON(err == -EMSGSIZE);
75 kfree_skb(rep);
76 goto out;
77 }
78 err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
79 MSG_DONTWAIT);
80 if (err > 0)
81 err = 0;
82out:
83 if (sk)
84 sock_put(sk);
85out_nosk:
86 return err;
87}
88
89static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb,
90 struct inet_diag_req_v2 *r, struct nlattr *bc)
91{
92 int num, s_num, slot, s_slot;
93
94 s_slot = cb->args[0];
95 num = s_num = cb->args[1];
96
97 for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) {
98 struct sock *sk;
99 struct hlist_nulls_node *node;
100 struct udp_hslot *hslot = &table->hash[slot];
101
102 if (hlist_nulls_empty(&hslot->head))
103 continue;
104
105 spin_lock_bh(&hslot->lock);
106 sk_nulls_for_each(sk, node, &hslot->head) {
107 struct inet_sock *inet = inet_sk(sk);
108
109 if (num < s_num)
110 goto next;
111 if (!(r->idiag_states & (1 << sk->sk_state)))
112 goto next;
113 if (r->sdiag_family != AF_UNSPEC &&
114 sk->sk_family != r->sdiag_family)
115 goto next;
116 if (r->id.idiag_sport != inet->inet_sport &&
117 r->id.idiag_sport)
118 goto next;
119 if (r->id.idiag_dport != inet->inet_dport &&
120 r->id.idiag_dport)
121 goto next;
122
123 if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
124 spin_unlock_bh(&hslot->lock);
125 goto done;
126 }
127next:
128 num++;
129 }
130 spin_unlock_bh(&hslot->lock);
131 }
132done:
133 cb->args[0] = slot;
134 cb->args[1] = num;
135}
136
137static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
138 struct inet_diag_req_v2 *r, struct nlattr *bc)
139{
140 udp_dump(&udp_table, skb, cb, r, bc);
141}
142
143static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
144 struct inet_diag_req_v2 *req)
145{
146 return udp_dump_one(&udp_table, in_skb, nlh, req);
147}
148
149static const struct inet_diag_handler udp_diag_handler = {
150 .dump = udp_diag_dump,
151 .dump_one = udp_diag_dump_one,
152 .idiag_type = IPPROTO_UDP,
153};
154
155static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
156 struct inet_diag_req_v2 *r, struct nlattr *bc)
157{
158 udp_dump(&udplite_table, skb, cb, r, bc);
159}
160
161static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
162 struct inet_diag_req_v2 *req)
163{
164 return udp_dump_one(&udplite_table, in_skb, nlh, req);
165}
166
167static const struct inet_diag_handler udplite_diag_handler = {
168 .dump = udplite_diag_dump,
169 .dump_one = udplite_diag_dump_one,
170 .idiag_type = IPPROTO_UDPLITE,
171};
172
173static int __init udp_diag_init(void)
174{
175 int err;
176
177 err = inet_diag_register(&udp_diag_handler);
178 if (err)
179 goto out;
180 err = inet_diag_register(&udplite_diag_handler);
181 if (err)
182 goto out_lite;
183out:
184 return err;
185out_lite:
186 inet_diag_unregister(&udp_diag_handler);
187 goto out;
188}
189
190static void __exit udp_diag_exit(void)
191{
192 inet_diag_unregister(&udplite_diag_handler);
193 inet_diag_unregister(&udp_diag_handler);
194}
195
196module_init(udp_diag_init);
197module_exit(udp_diag_exit);
198MODULE_LICENSE("GPL");
199MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
200MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
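[Annotation] The module above answers SOCK_DIAG_BY_FAMILY requests over NETLINK_SOCK_DIAG. A minimal userspace dumper for IPv4/UDP sockets, the same kind of request ss issues; all names are standard uapi, error handling is trimmed, and on kernels without this module the request simply returns an error:

#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg = {
		.nlh = {
			.nlmsg_len   = sizeof(msg),
			.nlmsg_type  = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family   = AF_INET,
			.sdiag_protocol = IPPROTO_UDP,
			.idiag_states   = -1,   /* all states */
		},
	};
	char buf[8192];
	int len;

	send(fd, &msg, sizeof(msg), 0);
	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			struct inet_diag_msg *r;
			char src[INET_ADDRSTRLEN];

			if (h->nlmsg_type == NLMSG_DONE)
				return 0;
			if (h->nlmsg_type == NLMSG_ERROR)
				return 1;
			r = NLMSG_DATA(h);
			inet_ntop(AF_INET, r->id.idiag_src, src, sizeof(src));
			printf("%s:%d\n", src, ntohs(r->id.idiag_sport));
		}
	}
	return 0;
}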
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index 63418185f524..e3db3f915114 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -110,10 +110,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
110 110
111 skb_push(skb, sizeof(*iph)); 111 skb_push(skb, sizeof(*iph));
112 skb_reset_network_header(skb); 112 skb_reset_network_header(skb);
113 113 skb_mac_header_rebuild(skb);
114 memmove(skb->data - skb->mac_len, skb_mac_header(skb),
115 skb->mac_len);
116 skb_set_mac_header(skb, -skb->mac_len);
117 114
118 xfrm4_beet_make_header(skb); 115 xfrm4_beet_make_header(skb);
119 116
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 534972e114ac..ed4bf11ef9f4 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -66,7 +66,6 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
66 66
67static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) 67static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
68{ 68{
69 const unsigned char *old_mac;
70 int err = -EINVAL; 69 int err = -EINVAL;
71 70
72 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) 71 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -84,10 +83,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
84 if (!(x->props.flags & XFRM_STATE_NOECN)) 83 if (!(x->props.flags & XFRM_STATE_NOECN))
85 ipip_ecn_decapsulate(skb); 84 ipip_ecn_decapsulate(skb);
86 85
87 old_mac = skb_mac_header(skb);
88 skb_set_mac_header(skb, -skb->mac_len);
89 memmove(skb_mac_header(skb), old_mac, skb->mac_len);
90 skb_reset_network_header(skb); 86 skb_reset_network_header(skb);
87 skb_mac_header_rebuild(skb);
88
91 err = 0; 89 err = 0;
92 90
93out: 91out:
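[Annotation] Both xfrm4 input paths now call skb_mac_header_rebuild(), which, judging from the two open-coded sequences it replaces here, moves the saved MAC header so that it again sits directly in front of the freshly reset network header. A plausible shape of the helper, reconstructed from the removed lines; the guard on skb_mac_header_was_set() is an assumption, and the real definition lives in include/linux/skbuff.h:

/* requires kernel context: <linux/skbuff.h> and <linux/string.h> */
static inline void skb_mac_header_rebuild_sketch(struct sk_buff *skb)
{
	if (skb_mac_header_was_set(skb)) {
		const unsigned char *old_mac = skb_mac_header(skb);

		skb_set_mac_header(skb, -skb->mac_len);
		memmove(skb_mac_header(skb), old_mac, skb->mac_len);
	}
}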
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 82806455e859..9247d9d70e9d 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -64,7 +64,7 @@ static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
64 .priority = 2, 64 .priority = 2,
65}; 65};
66 66
67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 67#if IS_ENABLED(CONFIG_IPV6)
68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
69 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
70 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
@@ -84,7 +84,7 @@ static int __init ipip_init(void)
84 xfrm_unregister_type(&ipip_type, AF_INET); 84 xfrm_unregister_type(&ipip_type, AF_INET);
85 return -EAGAIN; 85 return -EAGAIN;
86 } 86 }
87#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 87#if IS_ENABLED(CONFIG_IPV6)
88 if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { 88 if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
89 printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n"); 89 printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n");
90 xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET); 90 xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
@@ -97,7 +97,7 @@ static int __init ipip_init(void)
97 97
98static void __exit ipip_fini(void) 98static void __exit ipip_fini(void)
99{ 99{
100#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 100#if IS_ENABLED(CONFIG_IPV6)
101 if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) 101 if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
102 printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n"); 102 printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n");
103#endif 103#endif