author		J. Bruce Fields <bfields@redhat.com>	2012-10-09 18:35:22 -0400
committer	J. Bruce Fields <bfields@redhat.com>	2012-10-09 18:35:22 -0400
commit		f474af7051212b4efc8267583fad9c4ebf33ccff (patch)
tree		1aa46ebc8065a341f247c2a2d9af2f624ad1d4f8 /net/ipv4
parent		0d22f68f02c10d5d10ec5712917e5828b001a822 (diff)
parent		e3dd9a52cb5552c46c2a4ca7ccdfb4dab5c72457 (diff)
nfs: disintegrate UAPI for nfs
This is to complete part of the Userspace API (UAPI) disintegration for which
the preparatory patches were pulled recently.  After these patches, userspace
headers will be segregated into:

	include/uapi/linux/.../foo.h

for the userspace interface stuff, and:

	include/linux/.../foo.h

for the strictly kernel internal stuff.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
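For readers unfamiliar with the layout, a disintegrated header pair typically looks like the minimal sketch below. The file name foo.h, its contents, and the names foo_args/foo_ctx/foo_do_it are hypothetical, invented purely for illustration; they are not part of this merge.

/* include/uapi/linux/foo.h -- hypothetical userspace-visible ABI half */
#ifndef _UAPI_LINUX_FOO_H
#define _UAPI_LINUX_FOO_H

#include <linux/types.h>
#include <linux/ioctl.h>

/* only fixed-width __u32/__u64 types here, so the ABI is arch-stable */
struct foo_args {
	__u32 flags;
	__u64 cookie;
};

#define FOO_IOC_DOIT	_IOW('f', 1, struct foo_args)

#endif /* _UAPI_LINUX_FOO_H */

/* include/linux/foo.h -- kernel-internal half; includes the UAPI part */
#ifndef _LINUX_FOO_H
#define _LINUX_FOO_H

#include <uapi/linux/foo.h>

struct foo_ctx;		/* kernel-only state, never exported to userspace */
int foo_do_it(struct foo_ctx *ctx, struct foo_args *args);

#endif /* _LINUX_FOO_H */

The userspace half carries only the ABI (fixed-width types and ioctl numbers); the kernel half includes it and layers internal declarations on top, so exporting headers becomes a plain copy of include/uapi/.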
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c | 27
-rw-r--r--  net/ipv4/arp.c | 2
-rw-r--r--  net/ipv4/devinet.c | 77
-rw-r--r--  net/ipv4/fib_frontend.c | 41
-rw-r--r--  net/ipv4/fib_rules.c | 4
-rw-r--r--  net/ipv4/fib_semantics.c | 10
-rw-r--r--  net/ipv4/fib_trie.c | 21
-rw-r--r--  net/ipv4/igmp.c | 38
-rw-r--r--  net/ipv4/inet_connection_sock.c | 64
-rw-r--r--  net/ipv4/inet_diag.c | 45
-rw-r--r--  net/ipv4/inet_fragment.c | 9
-rw-r--r--  net/ipv4/inetpeer.c | 7
-rw-r--r--  net/ipv4/ip_fragment.c | 13
-rw-r--r--  net/ipv4/ip_gre.c | 128
-rw-r--r--  net/ipv4/ip_output.c | 76
-rw-r--r--  net/ipv4/ip_vti.c | 5
-rw-r--r--  net/ipv4/ipconfig.c | 43
-rw-r--r--  net/ipv4/ipip.c | 51
-rw-r--r--  net/ipv4/ipmr.c | 28
-rw-r--r--  net/ipv4/netfilter.c | 41
-rw-r--r--  net/ipv4/netfilter/Kconfig | 90
-rw-r--r--  net/ipv4/netfilter/Makefile | 18
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 18
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c | 98
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c | 110
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 3
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c | 2
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 10
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 10
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c (renamed from net/ipv4/netfilter/nf_nat_standalone.c) | 264
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c | 10
-rw-r--r--  net/ipv4/netfilter/iptable_security.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c | 85
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 763
-rw-r--r--  net/ipv4/netfilter/nf_nat_ftp.c | 137
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 71
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c | 458
-rw-r--r--  net/ipv4/netfilter/nf_nat_irc.c | 99
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 281
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c | 21
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_common.c | 114
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_dccp.c | 106
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 30
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c | 24
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_sctp.c | 96
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_tcp.c | 91
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udp.c | 82
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udplite.c | 98
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_unknown.c | 52
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c | 214
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c | 568
-rw-r--r--  net/ipv4/netfilter/nf_nat_tftp.c | 51
-rw-r--r--  net/ipv4/ping.c | 22
-rw-r--r--  net/ipv4/proc.c | 4
-rw-r--r--  net/ipv4/raw.c | 18
-rw-r--r--  net/ipv4/route.c | 80
-rw-r--r--  net/ipv4/syncookies.c | 1
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 87
-rw-r--r--  net/ipv4/tcp.c | 158
-rw-r--r--  net/ipv4/tcp_fastopen.c | 83
-rw-r--r--  net/ipv4/tcp_input.c | 301
-rw-r--r--  net/ipv4/tcp_ipv4.c | 341
-rw-r--r--  net/ipv4/tcp_metrics.c | 354
-rw-r--r--  net/ipv4/tcp_minisocks.c | 77
-rw-r--r--  net/ipv4/tcp_output.c | 41
-rw-r--r--  net/ipv4/tcp_timer.c | 45
-rw-r--r--  net/ipv4/udp.c | 9
-rw-r--r--  net/ipv4/udp_diag.c | 9
69 files changed, 2194 insertions(+), 4253 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index fe4582ca969..766c5965856 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -212,6 +212,26 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
+		/* Check special setups for testing purpose to enable TFO w/o
+		 * requiring TCP_FASTOPEN sockopt.
+		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+		 * Also fastopenq may already been allocated because this
+		 * socket was in TCP_LISTEN state previously but was
+		 * shutdown() (rather than close()).
+		 */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+				err = fastopen_init_queue(sk, backlog);
+			else if ((sysctl_tcp_fastopen &
+				  TFO_SERVER_WO_SOCKOPT2) != 0)
+				err = fastopen_init_queue(sk,
+				    ((uint)sysctl_tcp_fastopen) >> 16);
+			else
+				err = 0;
+			if (err)
+				goto out;
+		}
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -701,7 +721,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
-		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
 	sock_graft(sk2, newsock);
 
@@ -1364,7 +1385,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	if (*(u8 *)iph != 0x45)
 		goto out_unlock;
 
-	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
 		goto out_unlock;
 
 	id = ntohl(*(__be32 *)&iph->id);
@@ -1380,7 +1401,6 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	iph2 = ip_hdr(p);
 
 	if ((iph->protocol ^ iph2->protocol) |
-	    (iph->tos ^ iph2->tos) |
 	    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
 	    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
 		NAPI_GRO_CB(p)->same_flow = 0;
@@ -1390,6 +1410,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	/* All fields must match except length and checksum. */
 	NAPI_GRO_CB(p)->flush |=
 		(iph->ttl ^ iph2->ttl) |
+		(iph->tos ^ iph2->tos) |
 		((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 
 	NAPI_GRO_CB(p)->flush |= flush;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 77e87aff419..47800459e4c 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1225,7 +1225,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_CHANGEADDR:
 		neigh_changeaddr(&arp_tbl, dev);
-		rt_cache_flush(dev_net(dev), 0);
+		rt_cache_flush(dev_net(dev));
 		break;
 	default:
 		break;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 44bf82e3aef..2a6abc163ed 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -94,25 +94,22 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
-/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
- * value. So if you change this define, make appropriate changes to
- * inet_addr_hash as well.
- */
-#define IN4_ADDR_HSIZE 256
+#define IN4_ADDR_HSIZE_SHIFT	8
+#define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)
+
 static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
 static DEFINE_SPINLOCK(inet_addr_hash_lock);
 
-static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+static u32 inet_addr_hash(struct net *net, __be32 addr)
 {
-	u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+	u32 val = (__force u32) addr ^ net_hash_mix(net);
 
-	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
-		(IN4_ADDR_HSIZE - 1));
+	return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
 }
 
 static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
 {
-	unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
+	u32 hash = inet_addr_hash(net, ifa->ifa_local);
 
 	spin_lock(&inet_addr_hash_lock);
 	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
@@ -136,18 +133,18 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
  */
 struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
-	unsigned int hash = inet_addr_hash(net, addr);
+	u32 hash = inet_addr_hash(net, addr);
 	struct net_device *result = NULL;
 	struct in_ifaddr *ifa;
 	struct hlist_node *node;
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
-		struct net_device *dev = ifa->ifa_dev->dev;
-
-		if (!net_eq(dev_net(dev), net))
-			continue;
 		if (ifa->ifa_local == addr) {
+			struct net_device *dev = ifa->ifa_dev->dev;
+
+			if (!net_eq(dev_net(dev), net))
+				continue;
 			result = dev;
 			break;
 		}
@@ -182,10 +179,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 static void devinet_sysctl_register(struct in_device *idev);
 static void devinet_sysctl_unregister(struct in_device *idev);
 #else
-static inline void devinet_sysctl_register(struct in_device *idev)
+static void devinet_sysctl_register(struct in_device *idev)
 {
 }
-static inline void devinet_sysctl_unregister(struct in_device *idev)
+static void devinet_sysctl_unregister(struct in_device *idev)
 {
 }
 #endif
@@ -205,7 +202,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)
 	kfree(ifa);
 }
 
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct in_ifaddr *ifa)
 {
 	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 }
@@ -314,7 +311,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 }
 
 static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
-			   int destroy, struct nlmsghdr *nlh, u32 pid)
+			   int destroy, struct nlmsghdr *nlh, u32 portid)
 {
 	struct in_ifaddr *promote = NULL;
 	struct in_ifaddr *ifa, *ifa1 = *ifap;
@@ -348,7 +345,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			inet_hash_remove(ifa);
 			*ifap1 = ifa->ifa_next;
 
-			rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+			rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
 			blocking_notifier_call_chain(&inetaddr_chain,
 					NETDEV_DOWN, ifa);
 			inet_free_ifa(ifa);
@@ -385,7 +382,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	   is valid, it will try to restore deleted routes... Grr.
 	   So that, this order is correct.
 	 */
-	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
@@ -398,7 +395,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		}
 
 		promote->ifa_flags &= ~IFA_F_SECONDARY;
-		rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+		rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
 		blocking_notifier_call_chain(&inetaddr_chain,
 				NETDEV_UP, promote);
 		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
@@ -420,7 +417,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 }
 
 static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
-			     u32 pid)
+			     u32 portid)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
 	struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -467,7 +464,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
-	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 
 	return 0;
@@ -566,7 +563,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 		    !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
 			continue;
 
-		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
+		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
 		return 0;
 	}
 
@@ -652,14 +649,14 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 	if (IS_ERR(ifa))
 		return PTR_ERR(ifa);
 
-	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
+	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
 }
 
 /*
  *	Determine a default network mask, based on the IP address.
  */
 
-static inline int inet_abc_len(__be32 addr)
+static int inet_abc_len(__be32 addr)
 {
 	int rc = -1;	/* Something else, probably a multicast. */
 
@@ -725,7 +722,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		break;
 
 	case SIOCSIFFLAGS:
-		ret = -EACCES;
+		ret = -EPERM;
 		if (!capable(CAP_NET_ADMIN))
 			goto out;
 		break;
@@ -733,7 +730,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCSIFBRDADDR:	/* Set the broadcast address */
 	case SIOCSIFDSTADDR:	/* Set the destination address */
 	case SIOCSIFNETMASK:	/* Set the netmask for the interface */
-		ret = -EACCES;
+		ret = -EPERM;
 		if (!capable(CAP_NET_ADMIN))
 			goto out;
 		ret = -EINVAL;
@@ -1124,7 +1121,7 @@ skip:
 	}
 }
 
-static inline bool inetdev_valid_mtu(unsigned int mtu)
+static bool inetdev_valid_mtu(unsigned int mtu)
 {
 	return mtu >= 68;
 }
@@ -1239,7 +1236,7 @@ static struct notifier_block ip_netdev_notifier = {
 	.notifier_call = inetdev_event,
 };
 
-static inline size_t inet_nlmsg_size(void)
+static size_t inet_nlmsg_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
 	       + nla_total_size(4) /* IFA_ADDRESS */
@@ -1249,12 +1246,12 @@ static inline size_t inet_nlmsg_size(void)
 }
 
 static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
-			    u32 pid, u32 seq, int event, unsigned int flags)
+			    u32 portid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr *nlh;
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
@@ -1316,7 +1313,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 			if (ip_idx < s_ip_idx)
 				continue;
 			if (inet_fill_ifaddr(skb, ifa,
-					     NETLINK_CB(cb->skb).pid,
+					     NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq,
 					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
 				rcu_read_unlock();
@@ -1338,7 +1335,7 @@ done:
 }
 
 static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
-		      u32 pid)
+		      u32 portid)
 {
 	struct sk_buff *skb;
 	u32 seq = nlh ? nlh->nlmsg_seq : 0;
@@ -1350,14 +1347,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	if (skb == NULL)
 		goto errout;
 
-	err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
+	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+	rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
 	return;
 errout:
 	if (err < 0)
@@ -1503,7 +1500,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
 		    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
 			if ((new_value == 0) && (old_value != 0))
-				rt_cache_flush(net, 0);
+				rt_cache_flush(net);
 	}
 
 	return ret;
@@ -1537,7 +1534,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 				dev_disable_lro(idev->dev);
 			}
 			rtnl_unlock();
-			rt_cache_flush(net, 0);
+			rt_cache_flush(net);
 		}
 	}
 
@@ -1554,7 +1551,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
 	struct net *net = ctl->extra2;
 
 	if (write && *valp != val)
-		rt_cache_flush(net, 0);
+		rt_cache_flush(net);
 
 	return ret;
 }
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c43ae3fba79..68c93d1bb03 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -148,7 +148,7 @@ static void fib_flush(struct net *net)
 	}
 
 	if (flushed)
-		rt_cache_flush(net, -1);
+		rt_cache_flush(net);
 }
 
 /*
@@ -218,7 +218,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 		scope = RT_SCOPE_UNIVERSE;
 		if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 			fl4.flowi4_oif = 0;
-			fl4.flowi4_iif = net->loopback_dev->ifindex;
+			fl4.flowi4_iif = LOOPBACK_IFINDEX;
 			fl4.daddr = ip_hdr(skb)->saddr;
 			fl4.saddr = 0;
 			fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
@@ -557,7 +557,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 	cfg->fc_flags = rtm->rtm_flags;
 	cfg->fc_nlflags = nlh->nlmsg_flags;
 
-	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 	cfg->fc_nlinfo.nlh = nlh;
 	cfg->fc_nlinfo.nl_net = net;
 
@@ -955,7 +955,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	struct fib_result_nl *frn;
 	struct nlmsghdr *nlh;
 	struct fib_table *tb;
-	u32 pid;
+	u32 portid;
 
 	net = sock_net(skb->sk);
 	nlh = nlmsg_hdr(skb);
@@ -973,10 +973,10 @@ static void nl_fib_input(struct sk_buff *skb)
 
 	nl_fib_lookup(frn, tb);
 
-	pid = NETLINK_CB(skb).pid;	/* pid of sending process */
-	NETLINK_CB(skb).pid = 0;	/* from kernel */
+	portid = NETLINK_CB(skb).portid;	/* pid of sending process */
+	NETLINK_CB(skb).portid = 0;		/* from kernel */
 	NETLINK_CB(skb).dst_group = 0;	/* unicast */
-	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
+	netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
 }
 
 static int __net_init nl_fib_lookup_init(struct net *net)
@@ -986,7 +986,7 @@ static int __net_init nl_fib_lookup_init(struct net *net)
 		.input = nl_fib_input,
 	};
 
-	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
+	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
 	if (sk == NULL)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
@@ -999,11 +999,11 @@ static void nl_fib_lookup_exit(struct net *net)
 	net->ipv4.fibnl = NULL;
 }
 
-static void fib_disable_ip(struct net_device *dev, int force, int delay)
+static void fib_disable_ip(struct net_device *dev, int force)
 {
 	if (fib_sync_down_dev(dev, force))
 		fib_flush(dev_net(dev));
-	rt_cache_flush(dev_net(dev), delay);
+	rt_cache_flush(dev_net(dev));
 	arp_ifdown(dev);
 }
 
@@ -1020,7 +1020,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev), -1);
+		rt_cache_flush(dev_net(dev));
 		break;
 	case NETDEV_DOWN:
 		fib_del_ifaddr(ifa, NULL);
@@ -1029,9 +1029,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 			/* Last address was deleted from this interface.
 			 * Disable IP.
 			 */
-			fib_disable_ip(dev, 1, 0);
+			fib_disable_ip(dev, 1);
 		} else {
-			rt_cache_flush(dev_net(dev), -1);
+			rt_cache_flush(dev_net(dev));
 		}
 		break;
 	}
@@ -1041,17 +1041,16 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct in_device *in_dev;
 	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_UNREGISTER) {
-		fib_disable_ip(dev, 2, -1);
+		fib_disable_ip(dev, 2);
 		rt_flush_dev(dev);
 		return NOTIFY_DONE;
 	}
 
-	if (!in_dev)
-		return NOTIFY_DONE;
+	in_dev = __in_dev_get_rtnl(dev);
 
 	switch (event) {
 	case NETDEV_UP:
@@ -1062,16 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev), -1);
+		rt_cache_flush(net);
 		break;
 	case NETDEV_DOWN:
-		fib_disable_ip(dev, 0, 0);
+		fib_disable_ip(dev, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-		rt_cache_flush(dev_net(dev), 0);
-		break;
-	case NETDEV_UNREGISTER_BATCH:
+		rt_cache_flush(net);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index a83d74e498d..26aa65d1fce 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -259,10 +259,10 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
 
 static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
 {
-	rt_cache_flush(ops->fro_net, -1);
+	rt_cache_flush(ops->fro_net);
 }
 
-static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
+static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.family = AF_INET,
 	.rule_size = sizeof(struct fib4_rule),
 	.addr_size = sizeof(u32),
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da80dc14cc7..267753060ff 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -314,6 +314,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 		    nfi->fib_scope == fi->fib_scope &&
 		    nfi->fib_prefsrc == fi->fib_prefsrc &&
 		    nfi->fib_priority == fi->fib_priority &&
+		    nfi->fib_type == fi->fib_type &&
 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
 			   sizeof(u32) * RTAX_MAX) == 0 &&
 		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
@@ -391,7 +392,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 	if (skb == NULL)
 		goto errout;
 
-	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
+	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
 			    fa->fa_type, key, dst_len,
 			    fa->fa_tos, fa->fa_info, nlm_flags);
 	if (err < 0) {
@@ -400,7 +401,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
 		    info->nlh, GFP_KERNEL);
 	return;
 errout:
@@ -833,6 +834,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	fi->fib_flags = cfg->fc_flags;
 	fi->fib_priority = cfg->fc_priority;
 	fi->fib_prefsrc = cfg->fc_prefsrc;
+	fi->fib_type = cfg->fc_type;
 
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
@@ -989,14 +991,14 @@ failure:
 	return ERR_PTR(err);
 }
 
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
 		  struct fib_info *fi, unsigned int flags)
 {
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 57bd978483e..31d771ca9a7 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1286,7 +1286,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 
 			fib_release_info(fi_drop);
 			if (state & FA_S_ACCESSED)
-				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+				rt_cache_flush(cfg->fc_nlinfo.nl_net);
 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
 
@@ -1333,7 +1333,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	list_add_tail_rcu(&new_fa->fa_list,
 			  (fa ? &fa->fa_list : fa_head));
 
-	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+	rt_cache_flush(cfg->fc_nlinfo.nl_net);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 succeeded:
@@ -1550,7 +1550,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		 * state.directly.
 		 */
 		if (pref_mismatch) {
-			int mp = KEYLENGTH - fls(pref_mismatch);
+			/* fls(x) = __fls(x) + 1 */
+			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
 
 			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
 				goto backtrace;
@@ -1655,7 +1656,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	if (!l)
 		return -ESRCH;
 
-	fa_head = get_fa_head(l, plen);
+	li = find_leaf_info(l, plen);
+
+	if (!li)
+		return -ESRCH;
+
+	fa_head = &li->falh;
 	fa = fib_find_alias(fa_head, tos, 0);
 
 	if (!fa)
@@ -1691,9 +1697,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
-	l = fib_find_node(t, key);
-	li = find_leaf_info(l, plen);
-
 	list_del_rcu(&fa->fa_list);
 
 	if (!plen)
@@ -1708,7 +1711,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 		trie_leaf_remove(t, l);
 
 	if (fa->fa_state & FA_S_ACCESSED)
-		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+		rt_cache_flush(cfg->fc_nlinfo.nl_net);
 
 	fib_release_info(fa->fa_info);
 	alias_free_mem_rcu(fa);
@@ -1870,7 +1873,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
 			continue;
 		}
 
-		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq,
 				  RTM_NEWROUTE,
 				  tb->tb_id,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6699f23e6f5..736ab70fd17 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -815,14 +815,15 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
 	return 1;
 }
 
-static void igmp_heard_report(struct in_device *in_dev, __be32 group)
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
 {
 	struct ip_mc_list *im;
 
 	/* Timers are only set for non-local groups */
 
 	if (group == IGMP_ALL_HOSTS)
-		return;
+		return false;
 
 	rcu_read_lock();
 	for_each_pmc_rcu(in_dev, im) {
@@ -832,9 +833,11 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
 		}
 	}
 	rcu_read_unlock();
+	return false;
 }
 
-static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	int len)
 {
 	struct igmphdr *ih = igmp_hdr(skb);
@@ -866,7 +869,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		/* clear deleted report items */
 		igmpv3_clear_delrec(in_dev);
 	} else if (len < 12) {
-		return;	/* ignore bogus packet; freed by caller */
+		return true;	/* ignore bogus packet; freed by caller */
 	} else if (IGMP_V1_SEEN(in_dev)) {
 		/* This is a v3 query with v1 queriers present */
 		max_delay = IGMP_Query_Response_Interval;
@@ -883,13 +886,13 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			max_delay = 1;	/* can't mod w/ 0 */
 	} else { /* v3 */
 		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
-			return;
+			return true;
 
 		ih3 = igmpv3_query_hdr(skb);
 		if (ih3->nsrcs) {
 			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
 					   + ntohs(ih3->nsrcs)*sizeof(__be32)))
-				return;
+				return true;
 			ih3 = igmpv3_query_hdr(skb);
 		}
 
@@ -901,9 +904,9 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		in_dev->mr_qrv = ih3->qrv;
 		if (!group) { /* general query */
 			if (ih3->nsrcs)
-				return;	/* no sources allowed */
+				return false;	/* no sources allowed */
 			igmp_gq_start_timer(in_dev);
-			return;
+			return false;
 		}
 		/* mark sources to include, if group & source-specific */
 		mark = ih3->nsrcs != 0;
@@ -939,6 +942,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			igmp_mod_timer(im, max_delay);
 	}
 	rcu_read_unlock();
+	return false;
 }
 
 /* called in rcu_read_lock() section */
@@ -948,6 +952,7 @@ int igmp_rcv(struct sk_buff *skb)
 	struct igmphdr *ih;
 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 	int len = skb->len;
+	bool dropped = true;
 
 	if (in_dev == NULL)
 		goto drop;
@@ -969,7 +974,7 @@ int igmp_rcv(struct sk_buff *skb)
 	ih = igmp_hdr(skb);
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
-		igmp_heard_query(in_dev, skb, len);
+		dropped = igmp_heard_query(in_dev, skb, len);
 		break;
 	case IGMP_HOST_MEMBERSHIP_REPORT:
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
@@ -979,7 +984,7 @@ int igmp_rcv(struct sk_buff *skb)
 		/* don't rely on MC router hearing unicast reports */
 		if (skb->pkt_type == PACKET_MULTICAST ||
 		    skb->pkt_type == PACKET_BROADCAST)
-			igmp_heard_report(in_dev, ih->group);
+			dropped = igmp_heard_report(in_dev, ih->group);
 		break;
 	case IGMP_PIM:
 #ifdef CONFIG_IP_PIMSM_V1
@@ -997,7 +1002,10 @@ int igmp_rcv(struct sk_buff *skb)
 	}
 
 drop:
-	kfree_skb(skb);
+	if (dropped)
+		kfree_skb(skb);
+	else
+		consume_skb(skb);
 	return 0;
 }
 
@@ -1896,6 +1904,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 	rtnl_unlock();
 	return ret;
 }
+EXPORT_SYMBOL(ip_mc_leave_group);
 
 int ip_mc_source(int add, int omode, struct sock *sk, struct
 	ip_mreq_source *mreqs, int ifindex)
@@ -2435,6 +2444,8 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 	struct ip_mc_list *im = (struct ip_mc_list *)v;
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
 	char *querier;
+	long delta;
+
 #ifdef CONFIG_IP_MULTICAST
 	querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
 		  IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2448,11 +2459,12 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 			   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
 	}
 
+	delta = im->timer.expires - jiffies;
 	seq_printf(seq,
 		   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
 		   im->multiaddr, im->users,
-		   im->tm_running, im->tm_running ?
-		   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+		   im->tm_running,
+		   im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
 		   im->reporter);
 	}
 	return 0;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index db0cf17c00f..f0c5b9c1a95 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct sock *newsk;
+	struct request_sock *req;
 	int error;
 
 	lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		goto out_err;
 
 	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+	if (reqsk_queue_empty(queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+	req = reqsk_queue_remove(queue);
+	newsk = req->sk;
+
+	sk_acceptq_removed(sk);
+	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
+		spin_lock_bh(&queue->fastopenq->lock);
+		if (tcp_rsk(req)->listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq->lock);
+	}
 out:
 	release_sock(sk);
+	if (req)
+		__reqsk_free(req);
 	return newsk;
 out_err:
 	newsk = NULL;
+	req = NULL;
 	*err = error;
 	goto out;
 }
@@ -404,12 +424,15 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct inet_sock *newinet = inet_sk(newsk);
-	struct ip_options_rcu *opt = ireq->opt;
+	struct ip_options_rcu *opt;
 	struct net *net = sock_net(sk);
 	struct flowi4 *fl4;
 	struct rtable *rt;
 
 	fl4 = &newinet->cork.fl.u.ip4;
+
+	rcu_read_lock();
+	opt = rcu_dereference(newinet->inet_opt);
 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -421,11 +444,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 		goto no_route;
 	if (opt && opt->opt.is_strictroute && rt->rt_gateway)
 		goto route_err;
+	rcu_read_unlock();
 	return &rt->dst;
 
 route_err:
 	ip_rt_put(rt);
 no_route:
+	rcu_read_unlock();
 	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
 	return NULL;
 }
@@ -715,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 
 	inet_csk_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(queue);
 
 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -731,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	reqsk_queue_destroy(queue);
 
 	while ((req = acc_req) != NULL) {
 		struct sock *child = req->sk;
@@ -749,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
+		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+			BUG_ON(sk != tcp_rsk(req)->listener);
+
+			/* Paranoid, to prevent race condition if
+			 * an inbound pkt destined for child is
+			 * blocked by sock lock in tcp_v4_rcv().
+			 * Also to satisfy an assertion in
+			 * tcp_v4_destroy_sock().
+			 */
+			tcp_sk(child)->fastopen_rsk = NULL;
+			sock_put(sk);
+		}
 		inet_csk_destroy_sock(child);
 
 		bh_unlock_sock(child);
@@ -758,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		__reqsk_free(req);
 	}
+	if (queue->fastopenq != NULL) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		spin_lock_bh(&queue->fastopenq->lock);
+		acc_req = queue->fastopenq->rskq_rst_head;
+		queue->fastopenq->rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq->lock);
+		while ((req = acc_req) != NULL) {
+			acc_req = req->dl_next;
+			__reqsk_free(req);
+		}
+	}
 	WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 570e61f9611..535584c00f9 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -69,7 +69,8 @@ static inline void inet_diag_unlock_handler(
 
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
-		      u32 pid, u32 seq, u16 nlmsg_flags,
+		      struct user_namespace *user_ns,
+		      u32 portid, u32 seq, u16 nlmsg_flags,
 		      const struct nlmsghdr *unlh)
 {
 	const struct inet_sock *inet = inet_sk(sk);
@@ -83,7 +84,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	handler = inet_diag_table[req->sdiag_protocol];
 	BUG_ON(handler == NULL);
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			nlmsg_flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -124,7 +125,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 #endif
 
-	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = sock_i_ino(sk);
 
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -199,23 +200,24 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
 
 static int inet_csk_diag_fill(struct sock *sk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
-			      u32 pid, u32 seq, u16 nlmsg_flags,
+			      struct user_namespace *user_ns,
+			      u32 portid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
 	return inet_sk_diag_fill(sk, inet_csk(sk),
-			skb, req, pid, seq, nlmsg_flags, unlh);
+			skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 			       struct sk_buff *skb, struct inet_diag_req_v2 *req,
-			       u32 pid, u32 seq, u16 nlmsg_flags,
+			       u32 portid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
 	long tmo;
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			nlmsg_flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -256,14 +258,16 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
-			struct inet_diag_req_v2 *r, u32 pid, u32 seq, u16 nlmsg_flags,
+			struct inet_diag_req_v2 *r,
+			struct user_namespace *user_ns,
+			u32 portid, u32 seq, u16 nlmsg_flags,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
-					   skb, r, pid, seq, nlmsg_flags,
+					   skb, r, portid, seq, nlmsg_flags,
 					   unlh);
-	return inet_csk_diag_fill(sk, skb, r, pid, seq, nlmsg_flags, unlh);
+	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -311,14 +315,15 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 	}
 
 	err = sk_diag_fill(sk, rep, req,
-			   NETLINK_CB(in_skb).pid,
+			   sk_user_ns(NETLINK_CB(in_skb).ssk),
+			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		nlmsg_free(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -551,7 +556,8 @@ static int inet_csk_diag_dump(struct sock *sk,
 		return 0;
 
 	return inet_csk_diag_fill(sk, skb, r,
-				  NETLINK_CB(cb->skb).pid,
+				  sk_user_ns(NETLINK_CB(cb->skb).ssk),
+				  NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
@@ -586,12 +592,14 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 	}
 
 	return inet_twsk_diag_fill(tw, skb, r,
-				   NETLINK_CB(cb->skb).pid,
+				   NETLINK_CB(cb->skb).portid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
 static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
-			      struct request_sock *req, u32 pid, u32 seq,
+			      struct request_sock *req,
+			      struct user_namespace *user_ns,
+			      u32 portid, u32 seq,
 			      const struct nlmsghdr *unlh)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -600,7 +608,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	struct nlmsghdr *nlh;
 	long tmo;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			NLM_F_MULTI);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -625,7 +633,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_expires = jiffies_to_msecs(tmo);
 	r->idiag_rqueue = 0;
 	r->idiag_wqueue = 0;
-	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = 0;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
@@ -702,7 +710,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 			}
 
 			err = inet_diag_fill_req(skb, sk, req,
-						 NETLINK_CB(cb->skb).pid,
+						 sk_user_ns(NETLINK_CB(cb->skb).ssk),
+						 NETLINK_CB(cb->skb).portid,
 						 cb->nlh->nlmsg_seq, cb->nlh);
 			if (err < 0) {
 				cb->args[3] = j + 1;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 85190e69297..4750d2b74d7 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -89,7 +89,7 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
89 nf->low_thresh = 0; 89 nf->low_thresh = 0;
90 90
91 local_bh_disable(); 91 local_bh_disable();
92 inet_frag_evictor(nf, f); 92 inet_frag_evictor(nf, f, true);
93 local_bh_enable(); 93 local_bh_enable();
94} 94}
95EXPORT_SYMBOL(inet_frags_exit_net); 95EXPORT_SYMBOL(inet_frags_exit_net);
@@ -158,11 +158,16 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
158} 158}
159EXPORT_SYMBOL(inet_frag_destroy); 159EXPORT_SYMBOL(inet_frag_destroy);
160 160
161int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f) 161int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
162{ 162{
163 struct inet_frag_queue *q; 163 struct inet_frag_queue *q;
164 int work, evicted = 0; 164 int work, evicted = 0;
165 165
166 if (!force) {
167 if (atomic_read(&nf->mem) <= nf->high_thresh)
168 return 0;
169 }
170
166 work = atomic_read(&nf->mem) - nf->low_thresh; 171 work = atomic_read(&nf->mem) - nf->low_thresh;
167 while (work > 0) { 172 while (work > 0) {
168 read_lock(&f->lock); 173 read_lock(&f->lock);
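The new `force` flag centralizes the memory-pressure check that callers used to duplicate: non-forced callers on the per-packet path get a cheap early return until usage crosses `high_thresh` (the ip_fragment.c hunk below drops its caller-side copy of that test), while teardown passes `force=true` right after zeroing the thresholds so the namespace drains unconditionally. A standalone model of the contract; `struct frags` and `evict()` are illustrative stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct frags {
        int mem;                /* stands in for atomic nf->mem */
        int low_thresh;
        int high_thresh;
};

static int evict(struct frags *nf, bool force)
{
        int evicted = 0;

        /* Post-patch contract: a no-op under high_thresh unless forced. */
        if (!force && nf->mem <= nf->high_thresh)
                return 0;
        while (nf->mem > nf->low_thresh) {
                nf->mem--;      /* stands in for freeing one frag queue */
                evicted++;
        }
        return evicted;
}

int main(void)
{
        struct frags nf = { .mem = 10, .low_thresh = 4, .high_thresh = 16 };

        printf("%d\n", evict(&nf, false));      /* 0: below high_thresh */
        printf("%d\n", evict(&nf, true));       /* 6: forced drain to low_thresh */
        return 0;
}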
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index e1e0a4e8fd3..000e3d239d6 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -194,7 +194,7 @@ void __init inet_initpeers(void)
194 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, 194 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
195 NULL); 195 NULL);
196 196
197 INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker); 197 INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
198} 198}
199 199
200static int addr_compare(const struct inetpeer_addr *a, 200static int addr_compare(const struct inetpeer_addr *a,
@@ -510,7 +510,10 @@ relookup:
510 secure_ipv6_id(daddr->addr.a6)); 510 secure_ipv6_id(daddr->addr.a6));
511 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 511 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
512 p->rate_tokens = 0; 512 p->rate_tokens = 0;
513 p->rate_last = 0; 513 /* 60*HZ is arbitrary, but chosen high enough so that the first
514 * calculation of tokens is at its maximum.
515 */
516 p->rate_last = jiffies - 60*HZ;
514 INIT_LIST_HEAD(&p->gc_list); 517 INIT_LIST_HEAD(&p->gc_list);
515 518
516 /* Link the node. */ 519 /* Link the node. */
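Backdating `rate_last` by a minute means the very first rate-limit computation already sees 60*HZ elapsed ticks, so the token bucket starts at its cap rather than empty and the first ICMP event is never spuriously suppressed. A standalone model of that arithmetic; HZ and the burst cap here are hypothetical, and in-tree the consumer of `rate_tokens`/`rate_last` is `inet_peer_xrlim_allow()`:

#include <stdio.h>

#define HZ 100                  /* ticks per second, build-dependent */
#define TOKEN_MAX (6 * HZ)      /* hypothetical burst cap */

static long tokens;
static long rate_last;

/* Earn one token per elapsed tick, clamp to the cap, then spend. */
static int rl_allow(long now, long cost)
{
        tokens += now - rate_last;
        rate_last = now;
        if (tokens > TOKEN_MAX)
                tokens = TOKEN_MAX;
        if (tokens < cost)
                return 0;
        tokens -= cost;
        return 1;
}

int main(void)
{
        long now = 12345;

        rate_last = now - 60 * HZ;      /* the patch: pretend the last event
                                         * was a minute ago -> bucket full */
        printf("first event allowed: %d\n", rl_allow(now, HZ)); /* 1 */
        return 0;
}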
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409..448e6854682 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -219,7 +219,7 @@ static void ip_evictor(struct net *net)
219{ 219{
220 int evicted; 220 int evicted;
221 221
222 evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); 222 evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
223 if (evicted) 223 if (evicted)
224 IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); 224 IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
225} 225}
@@ -523,6 +523,10 @@ found:
523 if (offset == 0) 523 if (offset == 0)
524 qp->q.last_in |= INET_FRAG_FIRST_IN; 524 qp->q.last_in |= INET_FRAG_FIRST_IN;
525 525
526 if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
527 skb->len + ihl > qp->q.max_size)
528 qp->q.max_size = skb->len + ihl;
529
526 if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && 530 if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
527 qp->q.meat == qp->q.len) 531 qp->q.meat == qp->q.len)
528 return ip_frag_reasm(qp, prev, dev); 532 return ip_frag_reasm(qp, prev, dev);
@@ -646,9 +650,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
646 head->next = NULL; 650 head->next = NULL;
647 head->dev = dev; 651 head->dev = dev;
648 head->tstamp = qp->q.stamp; 652 head->tstamp = qp->q.stamp;
653 IPCB(head)->frag_max_size = qp->q.max_size;
649 654
650 iph = ip_hdr(head); 655 iph = ip_hdr(head);
651 iph->frag_off = 0; 656 /* max_size != 0 implies at least one fragment had IP_DF set */
657 iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
652 iph->tot_len = htons(len); 658 iph->tot_len = htons(len);
653 iph->tos |= ecn; 659 iph->tos |= ecn;
654 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 660 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
@@ -678,8 +684,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
678 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); 684 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
679 685
680 /* Start by cleaning up the memory. */ 686 /* Start by cleaning up the memory. */
681 if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) 687 ip_evictor(net);
682 ip_evictor(net);
683 688
684 /* Lookup (or create) queue header */ 689 /* Lookup (or create) queue header */
685 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { 690 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
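These hunks are one half of a pair: reassembly now records the largest DF-marked fragment in `qp->q.max_size`, stamps it into `IPCB(head)->frag_max_size`, and re-marks the reassembled header with DF; the ip_output.c hunk further below consumes that value when deciding whether re-fragmentation is allowed. A standalone model of the combined check; the function name is illustrative:

#include <stdbool.h>
#include <stdio.h>

/* Fragmentation must fail (and ICMP FRAG_NEEDED fire) when DF is set
 * and may not be overridden, or when the largest DF-marked fragment
 * seen at reassembly already exceeds the outgoing MTU. */
static bool may_fragment(bool df, bool local_df,
                         unsigned int frag_max_size, unsigned int mtu)
{
        if (df && !local_df)
                return false;
        if (frag_max_size && frag_max_size > mtu)
                return false;
        return true;
}

int main(void)
{
        printf("%d\n", may_fragment(false, false, 0, 1500));    /* 1: plain case */
        printf("%d\n", may_fragment(false, false, 1400, 1300)); /* 0: signal PMTU */
        return 0;
}

The net effect is that a box which defragments in the middle (e.g. for netfilter) no longer silently re-fragments a DF stream on boundaries larger than the ones the sender chose.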
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b062a98574f..7240f8e2dd4 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -120,6 +120,10 @@
120 Alexey Kuznetsov. 120 Alexey Kuznetsov.
121 */ 121 */
122 122
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
123static struct rtnl_link_ops ipgre_link_ops __read_mostly; 127static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124static int ipgre_tunnel_init(struct net_device *dev); 128static int ipgre_tunnel_init(struct net_device *dev);
125static void ipgre_tunnel_setup(struct net_device *dev); 129static void ipgre_tunnel_setup(struct net_device *dev);
@@ -204,7 +208,9 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
204 tot->rx_crc_errors = dev->stats.rx_crc_errors; 208 tot->rx_crc_errors = dev->stats.rx_crc_errors;
205 tot->rx_fifo_errors = dev->stats.rx_fifo_errors; 209 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
206 tot->rx_length_errors = dev->stats.rx_length_errors; 210 tot->rx_length_errors = dev->stats.rx_length_errors;
211 tot->rx_frame_errors = dev->stats.rx_frame_errors;
207 tot->rx_errors = dev->stats.rx_errors; 212 tot->rx_errors = dev->stats.rx_errors;
213
208 tot->tx_fifo_errors = dev->stats.tx_fifo_errors; 214 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209 tot->tx_carrier_errors = dev->stats.tx_carrier_errors; 215 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210 tot->tx_dropped = dev->stats.tx_dropped; 216 tot->tx_dropped = dev->stats.tx_dropped;
@@ -214,11 +220,25 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
214 return tot; 220 return tot;
215} 221}
216 222
 223/* Does the key in the tunnel parameters match the packet? */
224static bool ipgre_key_match(const struct ip_tunnel_parm *p,
225 __be16 flags, __be32 key)
226{
227 if (p->i_flags & GRE_KEY) {
228 if (flags & GRE_KEY)
229 return key == p->i_key;
230 else
231 return false; /* key expected, none present */
232 } else
233 return !(flags & GRE_KEY);
234}
235
 217/* Given src, dst and key, find the appropriate tunnel for input. */ 236
218 237
219static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, 238static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
220 __be32 remote, __be32 local, 239 __be32 remote, __be32 local,
221 __be32 key, __be16 gre_proto) 240 __be16 flags, __be32 key,
241 __be16 gre_proto)
222{ 242{
223 struct net *net = dev_net(dev); 243 struct net *net = dev_net(dev);
224 int link = dev->ifindex; 244 int link = dev->ifindex;
@@ -233,10 +253,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
233 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { 253 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
234 if (local != t->parms.iph.saddr || 254 if (local != t->parms.iph.saddr ||
235 remote != t->parms.iph.daddr || 255 remote != t->parms.iph.daddr ||
236 key != t->parms.i_key ||
237 !(t->dev->flags & IFF_UP)) 256 !(t->dev->flags & IFF_UP))
238 continue; 257 continue;
239 258
259 if (!ipgre_key_match(&t->parms, flags, key))
260 continue;
261
240 if (t->dev->type != ARPHRD_IPGRE && 262 if (t->dev->type != ARPHRD_IPGRE &&
241 t->dev->type != dev_type) 263 t->dev->type != dev_type)
242 continue; 264 continue;
@@ -257,10 +279,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
257 279
258 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { 280 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
259 if (remote != t->parms.iph.daddr || 281 if (remote != t->parms.iph.daddr ||
260 key != t->parms.i_key ||
261 !(t->dev->flags & IFF_UP)) 282 !(t->dev->flags & IFF_UP))
262 continue; 283 continue;
263 284
285 if (!ipgre_key_match(&t->parms, flags, key))
286 continue;
287
264 if (t->dev->type != ARPHRD_IPGRE && 288 if (t->dev->type != ARPHRD_IPGRE &&
265 t->dev->type != dev_type) 289 t->dev->type != dev_type)
266 continue; 290 continue;
@@ -283,10 +307,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
283 if ((local != t->parms.iph.saddr && 307 if ((local != t->parms.iph.saddr &&
284 (local != t->parms.iph.daddr || 308 (local != t->parms.iph.daddr ||
285 !ipv4_is_multicast(local))) || 309 !ipv4_is_multicast(local))) ||
286 key != t->parms.i_key ||
287 !(t->dev->flags & IFF_UP)) 310 !(t->dev->flags & IFF_UP))
288 continue; 311 continue;
289 312
313 if (!ipgre_key_match(&t->parms, flags, key))
314 continue;
315
290 if (t->dev->type != ARPHRD_IPGRE && 316 if (t->dev->type != ARPHRD_IPGRE &&
291 t->dev->type != dev_type) 317 t->dev->type != dev_type)
292 continue; 318 continue;
@@ -489,6 +515,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
489 const int code = icmp_hdr(skb)->code; 515 const int code = icmp_hdr(skb)->code;
490 struct ip_tunnel *t; 516 struct ip_tunnel *t;
491 __be16 flags; 517 __be16 flags;
518 __be32 key = 0;
492 519
493 flags = p[0]; 520 flags = p[0];
494 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 521 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
@@ -505,6 +532,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
505 if (skb_headlen(skb) < grehlen) 532 if (skb_headlen(skb) < grehlen)
506 return; 533 return;
507 534
535 if (flags & GRE_KEY)
536 key = *(((__be32 *)p) + (grehlen / 4) - 1);
537
508 switch (type) { 538 switch (type) {
509 default: 539 default:
510 case ICMP_PARAMETERPROB: 540 case ICMP_PARAMETERPROB:
@@ -533,49 +563,34 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
533 break; 563 break;
534 } 564 }
535 565
536 rcu_read_lock();
537 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, 566 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
538 flags & GRE_KEY ? 567 flags, key, p[1]);
539 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 568
540 p[1]);
541 if (t == NULL) 569 if (t == NULL)
542 goto out; 570 return;
543 571
544 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 572 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
545 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 573 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
546 t->parms.link, 0, IPPROTO_GRE, 0); 574 t->parms.link, 0, IPPROTO_GRE, 0);
547 goto out; 575 return;
548 } 576 }
549 if (type == ICMP_REDIRECT) { 577 if (type == ICMP_REDIRECT) {
550 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, 578 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
551 IPPROTO_GRE, 0); 579 IPPROTO_GRE, 0);
552 goto out; 580 return;
553 } 581 }
554 if (t->parms.iph.daddr == 0 || 582 if (t->parms.iph.daddr == 0 ||
555 ipv4_is_multicast(t->parms.iph.daddr)) 583 ipv4_is_multicast(t->parms.iph.daddr))
556 goto out; 584 return;
557 585
558 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 586 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
559 goto out; 587 return;
560 588
561 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) 589 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
562 t->err_count++; 590 t->err_count++;
563 else 591 else
564 t->err_count = 1; 592 t->err_count = 1;
565 t->err_time = jiffies; 593 t->err_time = jiffies;
566out:
567 rcu_read_unlock();
568}
569
570static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
571{
572 if (INET_ECN_is_ce(iph->tos)) {
573 if (skb->protocol == htons(ETH_P_IP)) {
574 IP_ECN_set_ce(ip_hdr(skb));
575 } else if (skb->protocol == htons(ETH_P_IPV6)) {
576 IP6_ECN_set_ce(ipv6_hdr(skb));
577 }
578 }
579} 594}
580 595
581static inline u8 596static inline u8
@@ -600,9 +615,10 @@ static int ipgre_rcv(struct sk_buff *skb)
600 struct ip_tunnel *tunnel; 615 struct ip_tunnel *tunnel;
601 int offset = 4; 616 int offset = 4;
602 __be16 gre_proto; 617 __be16 gre_proto;
618 int err;
603 619
604 if (!pskb_may_pull(skb, 16)) 620 if (!pskb_may_pull(skb, 16))
605 goto drop_nolock; 621 goto drop;
606 622
607 iph = ip_hdr(skb); 623 iph = ip_hdr(skb);
608 h = skb->data; 624 h = skb->data;
@@ -613,7 +629,7 @@ static int ipgre_rcv(struct sk_buff *skb)
613 - We do not support routing headers. 629 - We do not support routing headers.
614 */ 630 */
615 if (flags&(GRE_VERSION|GRE_ROUTING)) 631 if (flags&(GRE_VERSION|GRE_ROUTING))
616 goto drop_nolock; 632 goto drop;
617 633
618 if (flags&GRE_CSUM) { 634 if (flags&GRE_CSUM) {
619 switch (skb->ip_summed) { 635 switch (skb->ip_summed) {
@@ -641,10 +657,10 @@ static int ipgre_rcv(struct sk_buff *skb)
641 657
642 gre_proto = *(__be16 *)(h + 2); 658 gre_proto = *(__be16 *)(h + 2);
643 659
644 rcu_read_lock(); 660 tunnel = ipgre_tunnel_lookup(skb->dev,
645 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 661 iph->saddr, iph->daddr, flags, key,
646 iph->saddr, iph->daddr, key, 662 gre_proto);
647 gre_proto))) { 663 if (tunnel) {
648 struct pcpu_tstats *tstats; 664 struct pcpu_tstats *tstats;
649 665
650 secpath_reset(skb); 666 secpath_reset(skb);
@@ -703,27 +719,33 @@ static int ipgre_rcv(struct sk_buff *skb)
703 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 719 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
704 } 720 }
705 721
722 __skb_tunnel_rx(skb, tunnel->dev);
723
724 skb_reset_network_header(skb);
725 err = IP_ECN_decapsulate(iph, skb);
726 if (unlikely(err)) {
727 if (log_ecn_error)
728 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
729 &iph->saddr, iph->tos);
730 if (err > 1) {
731 ++tunnel->dev->stats.rx_frame_errors;
732 ++tunnel->dev->stats.rx_errors;
733 goto drop;
734 }
735 }
736
706 tstats = this_cpu_ptr(tunnel->dev->tstats); 737 tstats = this_cpu_ptr(tunnel->dev->tstats);
707 u64_stats_update_begin(&tstats->syncp); 738 u64_stats_update_begin(&tstats->syncp);
708 tstats->rx_packets++; 739 tstats->rx_packets++;
709 tstats->rx_bytes += skb->len; 740 tstats->rx_bytes += skb->len;
710 u64_stats_update_end(&tstats->syncp); 741 u64_stats_update_end(&tstats->syncp);
711 742
712 __skb_tunnel_rx(skb, tunnel->dev); 743 gro_cells_receive(&tunnel->gro_cells, skb);
713
714 skb_reset_network_header(skb);
715 ipgre_ecn_decapsulate(iph, skb);
716
717 netif_rx(skb);
718
719 rcu_read_unlock();
720 return 0; 744 return 0;
721 } 745 }
722 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 746 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
723 747
724drop: 748drop:
725 rcu_read_unlock();
726drop_nolock:
727 kfree_skb(skb); 749 kfree_skb(skb);
728 return 0; 750 return 0;
729} 751}
@@ -745,6 +767,10 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
745 __be32 dst; 767 __be32 dst;
746 int mtu; 768 int mtu;
747 769
770 if (skb->ip_summed == CHECKSUM_PARTIAL &&
771 skb_checksum_help(skb))
772 goto tx_error;
773
748 if (dev->type == ARPHRD_ETHER) 774 if (dev->type == ARPHRD_ETHER)
749 IPCB(skb)->flags = 0; 775 IPCB(skb)->flags = 0;
750 776
@@ -1292,10 +1318,18 @@ static const struct net_device_ops ipgre_netdev_ops = {
1292 1318
1293static void ipgre_dev_free(struct net_device *dev) 1319static void ipgre_dev_free(struct net_device *dev)
1294{ 1320{
1321 struct ip_tunnel *tunnel = netdev_priv(dev);
1322
1323 gro_cells_destroy(&tunnel->gro_cells);
1295 free_percpu(dev->tstats); 1324 free_percpu(dev->tstats);
1296 free_netdev(dev); 1325 free_netdev(dev);
1297} 1326}
1298 1327
1328#define GRE_FEATURES (NETIF_F_SG | \
1329 NETIF_F_FRAGLIST | \
1330 NETIF_F_HIGHDMA | \
1331 NETIF_F_HW_CSUM)
1332
1299static void ipgre_tunnel_setup(struct net_device *dev) 1333static void ipgre_tunnel_setup(struct net_device *dev)
1300{ 1334{
1301 dev->netdev_ops = &ipgre_netdev_ops; 1335 dev->netdev_ops = &ipgre_netdev_ops;
@@ -1309,12 +1343,16 @@ static void ipgre_tunnel_setup(struct net_device *dev)
1309 dev->addr_len = 4; 1343 dev->addr_len = 4;
1310 dev->features |= NETIF_F_NETNS_LOCAL; 1344 dev->features |= NETIF_F_NETNS_LOCAL;
1311 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1345 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1346
1347 dev->features |= GRE_FEATURES;
1348 dev->hw_features |= GRE_FEATURES;
1312} 1349}
1313 1350
1314static int ipgre_tunnel_init(struct net_device *dev) 1351static int ipgre_tunnel_init(struct net_device *dev)
1315{ 1352{
1316 struct ip_tunnel *tunnel; 1353 struct ip_tunnel *tunnel;
1317 struct iphdr *iph; 1354 struct iphdr *iph;
1355 int err;
1318 1356
1319 tunnel = netdev_priv(dev); 1357 tunnel = netdev_priv(dev);
1320 iph = &tunnel->parms.iph; 1358 iph = &tunnel->parms.iph;
@@ -1341,6 +1379,12 @@ static int ipgre_tunnel_init(struct net_device *dev)
1341 if (!dev->tstats) 1379 if (!dev->tstats)
1342 return -ENOMEM; 1380 return -ENOMEM;
1343 1381
1382 err = gro_cells_init(&tunnel->gro_cells, dev);
1383 if (err) {
1384 free_percpu(dev->tstats);
1385 return err;
1386 }
1387
1344 return 0; 1388 return 0;
1345} 1389}
1346 1390
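Several threads run through the ip_gre.c diff: lookups now take the GRE flag word so keyed and unkeyed tunnels can no longer shadow each other, per-handler RCU locking moves out to the callers, the ECN fixup is replaced by the shared `IP_ECN_decapsulate()` with ratelimited logging, and the receive path feeds `gro_cells_receive()` instead of `netif_rx()`. The key change is the subtlest: the old bare `key != t->parms.i_key` test treated 0 as "no key", while `ipgre_key_match()` distinguishes an absent key from a present key whose value is 0. A standalone model of the new truth table; the flag value stands in for the kernel's __be16 GRE_KEY constant:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GRE_KEY 0x2000  /* stand-in value for the model */

/* Mirrors ipgre_key_match(): a keyed tunnel accepts only packets
 * carrying the same key; an unkeyed tunnel accepts only unkeyed
 * packets.  A key whose numeric value is 0 is still a key. */
static bool key_match(uint16_t tunnel_flags, uint32_t tunnel_key,
                      uint16_t pkt_flags, uint32_t pkt_key)
{
        if (tunnel_flags & GRE_KEY)
                return (pkt_flags & GRE_KEY) && pkt_key == tunnel_key;

        return !(pkt_flags & GRE_KEY);
}

int main(void)
{
        printf("%d\n", key_match(GRE_KEY, 0, GRE_KEY, 0)); /* 1: key 0 == key 0 */
        printf("%d\n", key_match(GRE_KEY, 0, 0, 0));       /* 0: key expected, none sent */
        printf("%d\n", key_match(0, 0, GRE_KEY, 0));       /* 0: unexpected key */
        return 0;
}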
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 147ccc3e93d..24a29a39e9a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -467,7 +467,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
467 467
468 iph = ip_hdr(skb); 468 iph = ip_hdr(skb);
469 469
470 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { 470 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
471 (IPCB(skb)->frag_max_size &&
472 IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
471 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 473 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
472 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 474 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
473 htonl(ip_skb_dst_mtu(skb))); 475 htonl(ip_skb_dst_mtu(skb)));
@@ -791,6 +793,7 @@ static int __ip_append_data(struct sock *sk,
791 struct flowi4 *fl4, 793 struct flowi4 *fl4,
792 struct sk_buff_head *queue, 794 struct sk_buff_head *queue,
793 struct inet_cork *cork, 795 struct inet_cork *cork,
796 struct page_frag *pfrag,
794 int getfrag(void *from, char *to, int offset, 797 int getfrag(void *from, char *to, int offset,
795 int len, int odd, struct sk_buff *skb), 798 int len, int odd, struct sk_buff *skb),
796 void *from, int length, int transhdrlen, 799 void *from, int length, int transhdrlen,
@@ -985,47 +988,30 @@ alloc_new_skb:
985 } 988 }
986 } else { 989 } else {
987 int i = skb_shinfo(skb)->nr_frags; 990 int i = skb_shinfo(skb)->nr_frags;
988 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
989 struct page *page = cork->page;
990 int off = cork->off;
991 unsigned int left;
992
993 if (page && (left = PAGE_SIZE - off) > 0) {
994 if (copy >= left)
995 copy = left;
996 if (page != skb_frag_page(frag)) {
997 if (i == MAX_SKB_FRAGS) {
998 err = -EMSGSIZE;
999 goto error;
1000 }
1001 skb_fill_page_desc(skb, i, page, off, 0);
1002 skb_frag_ref(skb, i);
1003 frag = &skb_shinfo(skb)->frags[i];
1004 }
1005 } else if (i < MAX_SKB_FRAGS) {
1006 if (copy > PAGE_SIZE)
1007 copy = PAGE_SIZE;
1008 page = alloc_pages(sk->sk_allocation, 0);
1009 if (page == NULL) {
1010 err = -ENOMEM;
1011 goto error;
1012 }
1013 cork->page = page;
1014 cork->off = 0;
1015 991
1016 skb_fill_page_desc(skb, i, page, 0, 0); 992 err = -ENOMEM;
1017 frag = &skb_shinfo(skb)->frags[i]; 993 if (!sk_page_frag_refill(sk, pfrag))
1018 } else {
1019 err = -EMSGSIZE;
1020 goto error;
1021 }
1022 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1023 offset, copy, skb->len, skb) < 0) {
1024 err = -EFAULT;
1025 goto error; 994 goto error;
995
996 if (!skb_can_coalesce(skb, i, pfrag->page,
997 pfrag->offset)) {
998 err = -EMSGSIZE;
999 if (i == MAX_SKB_FRAGS)
1000 goto error;
1001
1002 __skb_fill_page_desc(skb, i, pfrag->page,
1003 pfrag->offset, 0);
1004 skb_shinfo(skb)->nr_frags = ++i;
1005 get_page(pfrag->page);
1026 } 1006 }
1027 cork->off += copy; 1007 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1028 skb_frag_size_add(frag, copy); 1008 if (getfrag(from,
1009 page_address(pfrag->page) + pfrag->offset,
1010 offset, copy, skb->len, skb) < 0)
1011 goto error_efault;
1012
1013 pfrag->offset += copy;
1014 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1029 skb->len += copy; 1015 skb->len += copy;
1030 skb->data_len += copy; 1016 skb->data_len += copy;
1031 skb->truesize += copy; 1017 skb->truesize += copy;
@@ -1037,6 +1023,8 @@ alloc_new_skb:
1037 1023
1038 return 0; 1024 return 0;
1039 1025
1026error_efault:
1027 err = -EFAULT;
1040error: 1028error:
1041 cork->length -= length; 1029 cork->length -= length;
1042 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1030 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1077,8 +1065,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1077 cork->dst = &rt->dst; 1065 cork->dst = &rt->dst;
1078 cork->length = 0; 1066 cork->length = 0;
1079 cork->tx_flags = ipc->tx_flags; 1067 cork->tx_flags = ipc->tx_flags;
1080 cork->page = NULL;
1081 cork->off = 0;
1082 1068
1083 return 0; 1069 return 0;
1084} 1070}
@@ -1115,7 +1101,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1115 transhdrlen = 0; 1101 transhdrlen = 0;
1116 } 1102 }
1117 1103
1118 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, 1104 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1105 sk_page_frag(sk), getfrag,
1119 from, length, transhdrlen, flags); 1106 from, length, transhdrlen, flags);
1120} 1107}
1121 1108
@@ -1338,10 +1325,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1338 iph->ihl = 5; 1325 iph->ihl = 5;
1339 iph->tos = inet->tos; 1326 iph->tos = inet->tos;
1340 iph->frag_off = df; 1327 iph->frag_off = df;
1341 ip_select_ident(iph, &rt->dst, sk);
1342 iph->ttl = ttl; 1328 iph->ttl = ttl;
1343 iph->protocol = sk->sk_protocol; 1329 iph->protocol = sk->sk_protocol;
1344 ip_copy_addrs(iph, fl4); 1330 ip_copy_addrs(iph, fl4);
1331 ip_select_ident(iph, &rt->dst, sk);
1345 1332
1346 if (opt) { 1333 if (opt) {
1347 iph->ihl += opt->optlen>>2; 1334 iph->ihl += opt->optlen>>2;
@@ -1437,7 +1424,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
1437 if (err) 1424 if (err)
1438 return ERR_PTR(err); 1425 return ERR_PTR(err);
1439 1426
1440 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, 1427 err = __ip_append_data(sk, fl4, &queue, &cork,
1428 &current->task_frag, getfrag,
1441 from, length, transhdrlen, flags); 1429 from, length, transhdrlen, flags);
1442 if (err) { 1430 if (err) {
1443 __ip_flush_pending_frames(sk, &queue, &cork); 1431 __ip_flush_pending_frames(sk, &queue, &cork);
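The `__ip_append_data()` rewrite drops the per-cork `page`/`off` bookkeeping in favor of a caller-supplied `struct page_frag`: `ip_append_data()` passes `sk_page_frag(sk)`, while `ip_make_skb()` always runs in process context and uses `&current->task_frag` directly. The copy step then reduces to `sk_page_frag_refill()` plus an `skb_can_coalesce()` test against the tail fragment. A hedged sketch of the selector helper this relies on, as it appeared around this series:

#include <linux/sched.h>
#include <net/sock.h>

/* Sleeping contexts can share the per-task fragment cache; atomic
 * allocations fall back to a per-socket cache so they never race
 * with the owning task's allocator state. */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
        if (sk->sk_allocation & __GFP_WAIT)
                return &current->task_frag;

        return &sk->sk_frag;
}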
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3511ffba7bd..978bca4818a 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -304,7 +304,6 @@ static int vti_err(struct sk_buff *skb, u32 info)
304 304
305 err = -ENOENT; 305 err = -ENOENT;
306 306
307 rcu_read_lock();
308 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 307 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
309 if (t == NULL) 308 if (t == NULL)
310 goto out; 309 goto out;
@@ -326,7 +325,6 @@ static int vti_err(struct sk_buff *skb, u32 info)
326 t->err_count = 1; 325 t->err_count = 1;
327 t->err_time = jiffies; 326 t->err_time = jiffies;
328out: 327out:
329 rcu_read_unlock();
330 return err; 328 return err;
331} 329}
332 330
@@ -336,7 +334,6 @@ static int vti_rcv(struct sk_buff *skb)
336 struct ip_tunnel *tunnel; 334 struct ip_tunnel *tunnel;
337 const struct iphdr *iph = ip_hdr(skb); 335 const struct iphdr *iph = ip_hdr(skb);
338 336
339 rcu_read_lock();
340 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 337 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
341 if (tunnel != NULL) { 338 if (tunnel != NULL) {
342 struct pcpu_tstats *tstats; 339 struct pcpu_tstats *tstats;
@@ -348,10 +345,8 @@ static int vti_rcv(struct sk_buff *skb)
348 u64_stats_update_end(&tstats->syncp); 345 u64_stats_update_end(&tstats->syncp);
349 346
350 skb->dev = tunnel->dev; 347 skb->dev = tunnel->dev;
351 rcu_read_unlock();
352 return 1; 348 return 1;
353 } 349 }
354 rcu_read_unlock();
355 350
356 return -1; 351 return -1;
357} 352}
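These vti hunks, like the gre and ipip ones around them, delete per-handler rcu_read_lock()/rcu_read_unlock() pairs: by this point in the series the receive path and the ICMP error handlers already run inside an RCU read-side section in their callers, so the nested sections only added overhead. An illustrative sketch, not from the patch, of how a lookup can assert that precondition instead of re-taking the lock (`rcu_read_lock_held()` and `WARN_ON_ONCE()` are standard kernel facilities):

#include <linux/rcupdate.h>

static struct ip_tunnel *vti_tunnel_lookup_rcu(struct net *net,
                                               __be32 remote, __be32 local)
{
        /* Document the new contract: the caller holds the RCU read lock. */
        WARN_ON_ONCE(!rcu_read_lock_held());

        /* ... hash-bucket walk via rcu_dereference(), as before ... */
        return NULL;    /* placeholder body for the sketch */
}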
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 67e8a6b086e..798358b1071 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -583,6 +583,17 @@ static void __init ic_rarp_send_if(struct ic_device *d)
583#endif 583#endif
584 584
585/* 585/*
586 * Predefine Nameservers
587 */
588static inline void __init ic_nameservers_predef(void)
589{
590 int i;
591
592 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
593 ic_nameservers[i] = NONE;
594}
595
596/*
586 * DHCP/BOOTP support. 597 * DHCP/BOOTP support.
587 */ 598 */
588 599
@@ -747,10 +758,7 @@ static void __init ic_bootp_init_ext(u8 *e)
747 */ 758 */
748static inline void __init ic_bootp_init(void) 759static inline void __init ic_bootp_init(void)
749{ 760{
750 int i; 761 ic_nameservers_predef();
751
752 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
753 ic_nameservers[i] = NONE;
754 762
755 dev_add_pack(&bootp_packet_type); 763 dev_add_pack(&bootp_packet_type);
756} 764}
@@ -1379,6 +1387,7 @@ static int __init ip_auto_config(void)
1379 int retries = CONF_OPEN_RETRIES; 1387 int retries = CONF_OPEN_RETRIES;
1380#endif 1388#endif
1381 int err; 1389 int err;
1390 unsigned int i;
1382 1391
1383#ifdef CONFIG_PROC_FS 1392#ifdef CONFIG_PROC_FS
1384 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); 1393 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1499,7 +1508,15 @@ static int __init ip_auto_config(void)
1499 &ic_servaddr, &root_server_addr, root_server_path); 1508 &ic_servaddr, &root_server_addr, root_server_path);
1500 if (ic_dev_mtu) 1509 if (ic_dev_mtu)
1501 pr_cont(", mtu=%d", ic_dev_mtu); 1510 pr_cont(", mtu=%d", ic_dev_mtu);
1502 pr_cont("\n"); 1511 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
1512 if (ic_nameservers[i] != NONE) {
1513 pr_info(" nameserver%u=%pI4",
1514 i, &ic_nameservers[i]);
1515 break;
1516 }
1517 for (i++; i < CONF_NAMESERVERS_MAX; i++)
1518 if (ic_nameservers[i] != NONE)
1519 pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]);
1503#endif /* !SILENT */ 1520#endif /* !SILENT */
1504 1521
1505 return 0; 1522 return 0;
@@ -1570,6 +1587,8 @@ static int __init ip_auto_config_setup(char *addrs)
1570 return 1; 1587 return 1;
1571 } 1588 }
1572 1589
1590 ic_nameservers_predef();
1591
1573 /* Parse string for static IP assignment. */ 1592 /* Parse string for static IP assignment. */
1574 ip = addrs; 1593 ip = addrs;
1575 while (ip && *ip) { 1594 while (ip && *ip) {
@@ -1613,6 +1632,20 @@ static int __init ip_auto_config_setup(char *addrs)
1613 ic_enable = 0; 1632 ic_enable = 0;
1614 } 1633 }
1615 break; 1634 break;
1635 case 7:
1636 if (CONF_NAMESERVERS_MAX >= 1) {
1637 ic_nameservers[0] = in_aton(ip);
1638 if (ic_nameservers[0] == ANY)
1639 ic_nameservers[0] = NONE;
1640 }
1641 break;
1642 case 8:
1643 if (CONF_NAMESERVERS_MAX >= 2) {
1644 ic_nameservers[1] = in_aton(ip);
1645 if (ic_nameservers[1] == ANY)
1646 ic_nameservers[1] = NONE;
1647 }
1648 break;
1616 } 1649 }
1617 } 1650 }
1618 ip = cp; 1651 ip = cp;
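Cases 7 and 8 extend the `ip=` boot parameter with two nameserver slots, following the existing positional fields (client, server, gateway, netmask, hostname, device, autoconf). A usage sketch with made-up addresses:

        ip=10.0.0.2:10.0.0.1:10.0.0.1:255.255.255.0:nfshost:eth0:off:10.0.0.53:10.0.0.54

An all-zeros entry (the parser's ANY) is normalized back to NONE, and the `ic_nameservers_predef()` call added before the parse loop resets every slot first, so naming fewer than CONF_NAMESERVERS_MAX servers leaves the remainder cleanly unset for the printout logic above.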
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 99af1f0cc65..e15b45297c0 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -120,6 +120,10 @@
120#define HASH_SIZE 16 120#define HASH_SIZE 16
121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
122 122
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
123static int ipip_net_id __read_mostly; 127static int ipip_net_id __read_mostly;
124struct ipip_net { 128struct ipip_net {
125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; 129 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
@@ -365,8 +369,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
365 } 369 }
366 370
367 err = -ENOENT; 371 err = -ENOENT;
368
369 rcu_read_lock();
370 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 372 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
371 if (t == NULL) 373 if (t == NULL)
372 goto out; 374 goto out;
@@ -398,34 +400,22 @@ static int ipip_err(struct sk_buff *skb, u32 info)
398 t->err_count = 1; 400 t->err_count = 1;
399 t->err_time = jiffies; 401 t->err_time = jiffies;
400out: 402out:
401 rcu_read_unlock();
402 return err;
403}
404
405static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
406 struct sk_buff *skb)
407{
408 struct iphdr *inner_iph = ip_hdr(skb);
409 403
410 if (INET_ECN_is_ce(outer_iph->tos)) 404 return err;
411 IP_ECN_set_ce(inner_iph);
412} 405}
413 406
414static int ipip_rcv(struct sk_buff *skb) 407static int ipip_rcv(struct sk_buff *skb)
415{ 408{
416 struct ip_tunnel *tunnel; 409 struct ip_tunnel *tunnel;
417 const struct iphdr *iph = ip_hdr(skb); 410 const struct iphdr *iph = ip_hdr(skb);
411 int err;
418 412
419 rcu_read_lock();
420 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 413 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
421 if (tunnel != NULL) { 414 if (tunnel != NULL) {
422 struct pcpu_tstats *tstats; 415 struct pcpu_tstats *tstats;
423 416
424 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 417 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
425 rcu_read_unlock(); 418 goto drop;
426 kfree_skb(skb);
427 return 0;
428 }
429 419
430 secpath_reset(skb); 420 secpath_reset(skb);
431 421
@@ -434,24 +424,35 @@ static int ipip_rcv(struct sk_buff *skb)
434 skb->protocol = htons(ETH_P_IP); 424 skb->protocol = htons(ETH_P_IP);
435 skb->pkt_type = PACKET_HOST; 425 skb->pkt_type = PACKET_HOST;
436 426
427 __skb_tunnel_rx(skb, tunnel->dev);
428
429 err = IP_ECN_decapsulate(iph, skb);
430 if (unlikely(err)) {
431 if (log_ecn_error)
432 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
433 &iph->saddr, iph->tos);
434 if (err > 1) {
435 ++tunnel->dev->stats.rx_frame_errors;
436 ++tunnel->dev->stats.rx_errors;
437 goto drop;
438 }
439 }
440
437 tstats = this_cpu_ptr(tunnel->dev->tstats); 441 tstats = this_cpu_ptr(tunnel->dev->tstats);
438 u64_stats_update_begin(&tstats->syncp); 442 u64_stats_update_begin(&tstats->syncp);
439 tstats->rx_packets++; 443 tstats->rx_packets++;
440 tstats->rx_bytes += skb->len; 444 tstats->rx_bytes += skb->len;
441 u64_stats_update_end(&tstats->syncp); 445 u64_stats_update_end(&tstats->syncp);
442 446
443 __skb_tunnel_rx(skb, tunnel->dev);
444
445 ipip_ecn_decapsulate(iph, skb);
446
447 netif_rx(skb); 447 netif_rx(skb);
448
449 rcu_read_unlock();
450 return 0; 448 return 0;
451 } 449 }
452 rcu_read_unlock();
453 450
454 return -1; 451 return -1;
452
453drop:
454 kfree_skb(skb);
455 return 0;
455} 456}
456 457
457/* 458/*
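Here, as in ip_gre.c above, the open-coded ECN fixup is replaced by the shared `IP_ECN_decapsulate()` plus a uniform error policy gated by the new `log_ecn_error` parameter. A standalone model of the three-way verdict that error handling implies (the 0/1/>1 split is taken from the code above; the textual causes are the usual ECN-decap cases and are an interpretation, not quoted from the patch):

#include <stdio.h>

/* 0: ECN bits consistent, deliver.  1: suspicious (typically non-ECT
 * inner under an ECT outer), log if log_ecn_error and still deliver.
 * >1: outer CE with an inner that cannot carry it, so log, bump
 * rx_frame_errors/rx_errors and drop. */
static const char *ecn_verdict(int err)
{
        if (err == 0)
                return "deliver";
        if (err == 1)
                return "log and deliver";
        return "log, count rx_frame_errors, drop";
}

int main(void)
{
        for (int err = 0; err <= 2; err++)
                printf("err=%d -> %s\n", err, ecn_verdict(err));
        return 0;
}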
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8eec8f4a053..6168c4dc58b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
124static struct kmem_cache *mrt_cachep __read_mostly; 124static struct kmem_cache *mrt_cachep __read_mostly;
125 125
126static struct mr_table *ipmr_new_table(struct net *net, u32 id); 126static struct mr_table *ipmr_new_table(struct net *net, u32 id);
127static void ipmr_free_table(struct mr_table *mrt);
128
127static int ip_mr_forward(struct net *net, struct mr_table *mrt, 129static int ip_mr_forward(struct net *net, struct mr_table *mrt,
128 struct sk_buff *skb, struct mfc_cache *cache, 130 struct sk_buff *skb, struct mfc_cache *cache,
129 int local); 131 int local);
@@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
131 struct sk_buff *pkt, vifi_t vifi, int assert); 133 struct sk_buff *pkt, vifi_t vifi, int assert);
132static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 134static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
133 struct mfc_cache *c, struct rtmsg *rtm); 135 struct mfc_cache *c, struct rtmsg *rtm);
136static void mroute_clean_tables(struct mr_table *mrt);
134static void ipmr_expire_process(unsigned long arg); 137static void ipmr_expire_process(unsigned long arg);
135 138
136#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES 139#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -218,7 +221,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
218 return 0; 221 return 0;
219} 222}
220 223
221static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { 224static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
222 .family = RTNL_FAMILY_IPMR, 225 .family = RTNL_FAMILY_IPMR,
223 .rule_size = sizeof(struct ipmr_rule), 226 .rule_size = sizeof(struct ipmr_rule),
224 .addr_size = sizeof(u32), 227 .addr_size = sizeof(u32),
@@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
271 274
272 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { 275 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
273 list_del(&mrt->list); 276 list_del(&mrt->list);
274 kfree(mrt); 277 ipmr_free_table(mrt);
275 } 278 }
276 fib_rules_unregister(net->ipv4.mr_rules_ops); 279 fib_rules_unregister(net->ipv4.mr_rules_ops);
277} 280}
@@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net)
299 302
300static void __net_exit ipmr_rules_exit(struct net *net) 303static void __net_exit ipmr_rules_exit(struct net *net)
301{ 304{
302 kfree(net->ipv4.mrt); 305 ipmr_free_table(net->ipv4.mrt);
303} 306}
304#endif 307#endif
305 308
@@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
336 return mrt; 339 return mrt;
337} 340}
338 341
342static void ipmr_free_table(struct mr_table *mrt)
343{
344 del_timer_sync(&mrt->ipmr_expire_timer);
345 mroute_clean_tables(mrt);
346 kfree(mrt);
347}
348
339/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ 349/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
340 350
341static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) 351static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
@@ -616,7 +626,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
616 e->error = -ETIMEDOUT; 626 e->error = -ETIMEDOUT;
617 memset(&e->msg, 0, sizeof(e->msg)); 627 memset(&e->msg, 0, sizeof(e->msg));
618 628
619 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 629 rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
620 } else { 630 } else {
621 kfree_skb(skb); 631 kfree_skb(skb);
622 } 632 }
@@ -860,7 +870,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
860 memset(&e->msg, 0, sizeof(e->msg)); 870 memset(&e->msg, 0, sizeof(e->msg));
861 } 871 }
862 872
863 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 873 rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
864 } else { 874 } else {
865 ip_mr_forward(net, mrt, skb, c, 0); 875 ip_mr_forward(net, mrt, skb, c, 0);
866 } 876 }
@@ -1798,7 +1808,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1798 .flowi4_oif = (rt_is_output_route(rt) ? 1808 .flowi4_oif = (rt_is_output_route(rt) ?
1799 skb->dev->ifindex : 0), 1809 skb->dev->ifindex : 0),
1800 .flowi4_iif = (rt_is_output_route(rt) ? 1810 .flowi4_iif = (rt_is_output_route(rt) ?
1801 net->loopback_dev->ifindex : 1811 LOOPBACK_IFINDEX :
1802 skb->dev->ifindex), 1812 skb->dev->ifindex),
1803 .flowi4_mark = skb->mark, 1813 .flowi4_mark = skb->mark,
1804 }; 1814 };
@@ -2107,12 +2117,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
2107} 2117}
2108 2118
2109static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 2119static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2110 u32 pid, u32 seq, struct mfc_cache *c) 2120 u32 portid, u32 seq, struct mfc_cache *c)
2111{ 2121{
2112 struct nlmsghdr *nlh; 2122 struct nlmsghdr *nlh;
2113 struct rtmsg *rtm; 2123 struct rtmsg *rtm;
2114 2124
2115 nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); 2125 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2116 if (nlh == NULL) 2126 if (nlh == NULL)
2117 return -EMSGSIZE; 2127 return -EMSGSIZE;
2118 2128
@@ -2166,7 +2176,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2166 if (e < s_e) 2176 if (e < s_e)
2167 goto next_entry; 2177 goto next_entry;
2168 if (ipmr_fill_mroute(mrt, skb, 2178 if (ipmr_fill_mroute(mrt, skb,
2169 NETLINK_CB(cb->skb).pid, 2179 NETLINK_CB(cb->skb).portid,
2170 cb->nlh->nlmsg_seq, 2180 cb->nlh->nlmsg_seq,
2171 mfc) < 0) 2181 mfc) < 0)
2172 goto done; 2182 goto done;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index ed1b3678319..4c0cf63dd92 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -72,43 +72,6 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
72} 72}
73EXPORT_SYMBOL(ip_route_me_harder); 73EXPORT_SYMBOL(ip_route_me_harder);
74 74
75#ifdef CONFIG_XFRM
76int ip_xfrm_me_harder(struct sk_buff *skb)
77{
78 struct flowi fl;
79 unsigned int hh_len;
80 struct dst_entry *dst;
81
82 if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
83 return 0;
84 if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
85 return -1;
86
87 dst = skb_dst(skb);
88 if (dst->xfrm)
89 dst = ((struct xfrm_dst *)dst)->route;
90 dst_hold(dst);
91
92 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
93 if (IS_ERR(dst))
94 return -1;
95
96 skb_dst_drop(skb);
97 skb_dst_set(skb, dst);
98
99 /* Change in oif may mean change in hh_len. */
100 hh_len = skb_dst(skb)->dev->hard_header_len;
101 if (skb_headroom(skb) < hh_len &&
102 pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
103 return -1;
104 return 0;
105}
106EXPORT_SYMBOL(ip_xfrm_me_harder);
107#endif
108
109void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
110EXPORT_SYMBOL(ip_nat_decode_session);
111
112/* 75/*
 113 * Extra routing may be needed on local out, as the QUEUE target never 76
114 * returns control to the table. 77 * returns control to the table.
@@ -225,12 +188,12 @@ static const struct nf_afinfo nf_ip_afinfo = {
225 .route_key_size = sizeof(struct ip_rt_info), 188 .route_key_size = sizeof(struct ip_rt_info),
226}; 189};
227 190
228static int ipv4_netfilter_init(void) 191static int __init ipv4_netfilter_init(void)
229{ 192{
230 return nf_register_afinfo(&nf_ip_afinfo); 193 return nf_register_afinfo(&nf_ip_afinfo);
231} 194}
232 195
233static void ipv4_netfilter_fini(void) 196static void __exit ipv4_netfilter_fini(void)
234{ 197{
235 nf_unregister_afinfo(&nf_ip_afinfo); 198 nf_unregister_afinfo(&nf_ip_afinfo);
236} 199}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index fcc543cd987..d8d6f2a5bf1 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -143,25 +143,22 @@ config IP_NF_TARGET_ULOG
143 To compile it as a module, choose M here. If unsure, say N. 143 To compile it as a module, choose M here. If unsure, say N.
144 144
145# NAT + specific targets: nf_conntrack 145# NAT + specific targets: nf_conntrack
146config NF_NAT 146config NF_NAT_IPV4
147 tristate "Full NAT" 147 tristate "IPv4 NAT"
148 depends on NF_CONNTRACK_IPV4 148 depends on NF_CONNTRACK_IPV4
149 default m if NETFILTER_ADVANCED=n 149 default m if NETFILTER_ADVANCED=n
150 select NF_NAT
150 help 151 help
151 The Full NAT option allows masquerading, port forwarding and other 152 The IPv4 NAT option allows masquerading, port forwarding and other
152 forms of full Network Address Port Translation. It is controlled by 153 forms of full Network Address Port Translation. It is controlled by
153 the `nat' table in iptables: see the man page for iptables(8). 154 the `nat' table in iptables: see the man page for iptables(8).
154 155
155 To compile it as a module, choose M here. If unsure, say N. 156 To compile it as a module, choose M here. If unsure, say N.
156 157
157config NF_NAT_NEEDED 158if NF_NAT_IPV4
158 bool
159 depends on NF_NAT
160 default y
161 159
162config IP_NF_TARGET_MASQUERADE 160config IP_NF_TARGET_MASQUERADE
163 tristate "MASQUERADE target support" 161 tristate "MASQUERADE target support"
164 depends on NF_NAT
165 default m if NETFILTER_ADVANCED=n 162 default m if NETFILTER_ADVANCED=n
166 help 163 help
167 Masquerading is a special case of NAT: all outgoing connections are 164 Masquerading is a special case of NAT: all outgoing connections are
@@ -174,30 +171,27 @@ config IP_NF_TARGET_MASQUERADE
174 171
175config IP_NF_TARGET_NETMAP 172config IP_NF_TARGET_NETMAP
176 tristate "NETMAP target support" 173 tristate "NETMAP target support"
177 depends on NF_NAT
178 depends on NETFILTER_ADVANCED 174 depends on NETFILTER_ADVANCED
179 help 175 select NETFILTER_XT_TARGET_NETMAP
180 NETMAP is an implementation of static 1:1 NAT mapping of network 176 ---help---
181 addresses. It maps the network address part, while keeping the host 177 This is a backwards-compat option for the user's convenience
182 address part intact. 178 (e.g. when running oldconfig). It selects
183 179 CONFIG_NETFILTER_XT_TARGET_NETMAP.
184 To compile it as a module, choose M here. If unsure, say N.
185 180
186config IP_NF_TARGET_REDIRECT 181config IP_NF_TARGET_REDIRECT
187 tristate "REDIRECT target support" 182 tristate "REDIRECT target support"
188 depends on NF_NAT
189 depends on NETFILTER_ADVANCED 183 depends on NETFILTER_ADVANCED
190 help 184 select NETFILTER_XT_TARGET_REDIRECT
191 REDIRECT is a special case of NAT: all incoming connections are 185 ---help---
192 mapped onto the incoming interface's address, causing the packets to 186 This is a backwards-compat option for the user's convenience
193 come to the local machine instead of passing through. This is 187 (e.g. when running oldconfig). It selects
194 useful for transparent proxies. 188 CONFIG_NETFILTER_XT_TARGET_REDIRECT.
195 189
196 To compile it as a module, choose M here. If unsure, say N. 190endif
197 191
198config NF_NAT_SNMP_BASIC 192config NF_NAT_SNMP_BASIC
199 tristate "Basic SNMP-ALG support" 193 tristate "Basic SNMP-ALG support"
200 depends on NF_CONNTRACK_SNMP && NF_NAT 194 depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4
201 depends on NETFILTER_ADVANCED 195 depends on NETFILTER_ADVANCED
202 default NF_NAT && NF_CONNTRACK_SNMP 196 default NF_NAT && NF_CONNTRACK_SNMP
203 ---help--- 197 ---help---
@@ -219,61 +213,21 @@ config NF_NAT_SNMP_BASIC
219# <expr> '&&' <expr> (6) 213# <expr> '&&' <expr> (6)
220# 214#
221# (6) Returns the result of min(/expr/, /expr/). 215# (6) Returns the result of min(/expr/, /expr/).
222config NF_NAT_PROTO_DCCP
223 tristate
224 depends on NF_NAT && NF_CT_PROTO_DCCP
225 default NF_NAT && NF_CT_PROTO_DCCP
226 216
227config NF_NAT_PROTO_GRE 217config NF_NAT_PROTO_GRE
228 tristate 218 tristate
229 depends on NF_NAT && NF_CT_PROTO_GRE 219 depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
230
231config NF_NAT_PROTO_UDPLITE
232 tristate
233 depends on NF_NAT && NF_CT_PROTO_UDPLITE
234 default NF_NAT && NF_CT_PROTO_UDPLITE
235
236config NF_NAT_PROTO_SCTP
237 tristate
238 default NF_NAT && NF_CT_PROTO_SCTP
239 depends on NF_NAT && NF_CT_PROTO_SCTP
240 select LIBCRC32C
241
242config NF_NAT_FTP
243 tristate
244 depends on NF_CONNTRACK && NF_NAT
245 default NF_NAT && NF_CONNTRACK_FTP
246
247config NF_NAT_IRC
248 tristate
249 depends on NF_CONNTRACK && NF_NAT
250 default NF_NAT && NF_CONNTRACK_IRC
251
252config NF_NAT_TFTP
253 tristate
254 depends on NF_CONNTRACK && NF_NAT
255 default NF_NAT && NF_CONNTRACK_TFTP
256
257config NF_NAT_AMANDA
258 tristate
259 depends on NF_CONNTRACK && NF_NAT
260 default NF_NAT && NF_CONNTRACK_AMANDA
261 220
262config NF_NAT_PPTP 221config NF_NAT_PPTP
263 tristate 222 tristate
264 depends on NF_CONNTRACK && NF_NAT 223 depends on NF_CONNTRACK && NF_NAT_IPV4
265 default NF_NAT && NF_CONNTRACK_PPTP 224 default NF_NAT_IPV4 && NF_CONNTRACK_PPTP
266 select NF_NAT_PROTO_GRE 225 select NF_NAT_PROTO_GRE
267 226
268config NF_NAT_H323 227config NF_NAT_H323
269 tristate 228 tristate
270 depends on NF_CONNTRACK && NF_NAT 229 depends on NF_CONNTRACK && NF_NAT_IPV4
271 default NF_NAT && NF_CONNTRACK_H323 230 default NF_NAT_IPV4 && NF_CONNTRACK_H323
272
273config NF_NAT_SIP
274 tristate
275 depends on NF_CONNTRACK && NF_NAT
276 default NF_NAT && NF_CONNTRACK_SIP
277 231
278# mangle + specific targets 232# mangle + specific targets
279config IP_NF_MANGLE 233config IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index c20674dc945..007b128eecc 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,32 +10,22 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15
16# connection tracking 13# connection tracking
17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 14obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
18 15
19obj-$(CONFIG_NF_NAT) += nf_nat.o 16nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
17obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
20 18
21# defrag 19# defrag
22obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
23 21
24# NAT helpers (nf_conntrack) 22# NAT helpers (nf_conntrack)
25obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
26obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
27obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o 23obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
28obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
29obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o 24obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
30obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
31obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o 25obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
32obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
33 26
34# NAT protocols (nf_nat) 27# NAT protocols (nf_nat)
35obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
36obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o 28obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
37obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
38obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
39 29
40# generic IP tables 30# generic IP tables
41obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o 31obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
@@ -43,7 +33,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
43# the three instances of ip_tables 33# the three instances of ip_tables
44obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o 34obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
45obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o 35obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
46obj-$(CONFIG_NF_NAT) += iptable_nat.o 36obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o
47obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o 37obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 38obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 39
@@ -55,8 +45,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
55obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 45obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
56obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o 46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
57obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o 47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
58obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
59obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
60obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
61obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 49obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
62 50
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index cbb6a1a6f6f..5d5d4d1be9c 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -19,9 +19,9 @@
19#include <net/ip.h> 19#include <net/ip.h>
20#include <net/checksum.h> 20#include <net/checksum.h>
21#include <net/route.h> 21#include <net/route.h>
22#include <net/netfilter/nf_nat_rule.h>
23#include <linux/netfilter_ipv4.h> 22#include <linux/netfilter_ipv4.h>
24#include <linux/netfilter/x_tables.h> 23#include <linux/netfilter/x_tables.h>
24#include <net/netfilter/nf_nat.h>
25 25
26MODULE_LICENSE("GPL"); 26MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -49,7 +49,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
49 struct nf_conn *ct; 49 struct nf_conn *ct;
50 struct nf_conn_nat *nat; 50 struct nf_conn_nat *nat;
51 enum ip_conntrack_info ctinfo; 51 enum ip_conntrack_info ctinfo;
52 struct nf_nat_ipv4_range newrange; 52 struct nf_nat_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 53 const struct nf_nat_ipv4_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc, nh; 55 __be32 newsrc, nh;
@@ -80,10 +80,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
80 nat->masq_index = par->out->ifindex; 80 nat->masq_index = par->out->ifindex;
81 81
82 /* Transfer from original range. */ 82 /* Transfer from original range. */
83 newrange = ((struct nf_nat_ipv4_range) 83 memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
84 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS, 84 memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
85 newsrc, newsrc, 85 newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
86 mr->range[0].min, mr->range[0].max }); 86 newrange.min_addr.ip = newsrc;
87 newrange.max_addr.ip = newsrc;
88 newrange.min_proto = mr->range[0].min;
89 newrange.max_proto = mr->range[0].max;
87 90
88 /* Hand modified range to generic setup. */ 91 /* Hand modified range to generic setup. */
89 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); 92 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
@@ -96,7 +99,8 @@ device_cmp(struct nf_conn *i, void *ifindex)
96 99
97 if (!nat) 100 if (!nat)
98 return 0; 101 return 0;
99 102 if (nf_ct_l3num(i) != NFPROTO_IPV4)
103 return 0;
100 return nat->masq_index == (int)(long)ifindex; 104 return nat->masq_index == (int)(long)ifindex;
101} 105}
102 106
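MASQUERADE now fills the family-independent `nf_nat_range` (union addresses plus `min_proto`/`max_proto`) instead of casting together the IPv4-only struct. A hypothetical helper, not part of the patch, spelling out the conversion the hunk performs inline:

#include <linux/string.h>
#include <net/netfilter/nf_nat.h>

/* Widen an IPv4-only compat range into the family-independent
 * nf_nat_range used by the shared NAT core; non-IPv4 address bytes
 * stay zeroed, exactly as the memset()s in the hunk leave them. */
static void nf_nat_ipv4_range_to_range(const struct nf_nat_ipv4_range *in,
                                       struct nf_nat_range *out)
{
        memset(out, 0, sizeof(*out));
        out->flags       = in->flags;
        out->min_addr.ip = in->min_ip;
        out->max_addr.ip = in->max_ip;
        out->min_proto   = in->min;
        out->max_proto   = in->max;
}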
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
deleted file mode 100644
index b5bfbbabf70..00000000000
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ /dev/null
@@ -1,98 +0,0 @@
1/* NETMAP - static NAT mapping of IP network addresses (1:1).
2 * The mapping can be applied to source (POSTROUTING),
3 * destination (PREROUTING), or both (with separate rules).
4 */
5
6/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/ip.h>
14#include <linux/module.h>
15#include <linux/netdevice.h>
16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h>
18#include <linux/netfilter/x_tables.h>
19#include <net/netfilter/nf_nat_rule.h>
20
21MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
24
25static int netmap_tg_check(const struct xt_tgchk_param *par)
26{
27 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
28
29 if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) {
30 pr_debug("bad MAP_IPS.\n");
31 return -EINVAL;
32 }
33 if (mr->rangesize != 1) {
34 pr_debug("bad rangesize %u.\n", mr->rangesize);
35 return -EINVAL;
36 }
37 return 0;
38}
39
40static unsigned int
41netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
42{
43 struct nf_conn *ct;
44 enum ip_conntrack_info ctinfo;
45 __be32 new_ip, netmask;
46 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
47 struct nf_nat_ipv4_range newrange;
48
49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
50 par->hooknum == NF_INET_POST_ROUTING ||
51 par->hooknum == NF_INET_LOCAL_OUT ||
52 par->hooknum == NF_INET_LOCAL_IN);
53 ct = nf_ct_get(skb, &ctinfo);
54
55 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
56
57 if (par->hooknum == NF_INET_PRE_ROUTING ||
58 par->hooknum == NF_INET_LOCAL_OUT)
59 new_ip = ip_hdr(skb)->daddr & ~netmask;
60 else
61 new_ip = ip_hdr(skb)->saddr & ~netmask;
62 new_ip |= mr->range[0].min_ip & netmask;
63
64 newrange = ((struct nf_nat_ipv4_range)
65 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
66 new_ip, new_ip,
67 mr->range[0].min, mr->range[0].max });
68
69 /* Hand modified range to generic setup. */
70 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
71}
72
73static struct xt_target netmap_tg_reg __read_mostly = {
74 .name = "NETMAP",
75 .family = NFPROTO_IPV4,
76 .target = netmap_tg,
77 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
78 .table = "nat",
79 .hooks = (1 << NF_INET_PRE_ROUTING) |
80 (1 << NF_INET_POST_ROUTING) |
81 (1 << NF_INET_LOCAL_OUT) |
82 (1 << NF_INET_LOCAL_IN),
83 .checkentry = netmap_tg_check,
84 .me = THIS_MODULE
85};
86
87static int __init netmap_tg_init(void)
88{
89 return xt_register_target(&netmap_tg_reg);
90}
91
92static void __exit netmap_tg_exit(void)
93{
94 xt_unregister_target(&netmap_tg_reg);
95}
96
97module_init(netmap_tg_init);
98module_exit(netmap_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
deleted file mode 100644
index 7c0103a5203..00000000000
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Redirect.  Simple mapping which alters dst to a local IP address. */
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/netdevice.h>
-#include <linux/if.h>
-#include <linux/inetdevice.h>
-#include <net/protocol.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat_rule.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
-
-/* FIXME: Take multiple ranges --RR */
-static int redirect_tg_check(const struct xt_tgchk_param *par)
-{
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
-	if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
-		pr_debug("bad MAP_IPS.\n");
-		return -EINVAL;
-	}
-	if (mr->rangesize != 1) {
-		pr_debug("bad rangesize %u.\n", mr->rangesize);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static unsigned int
-redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	__be32 newdst;
-	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_ipv4_range newrange;
-
-	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
-		     par->hooknum == NF_INET_LOCAL_OUT);
-
-	ct = nf_ct_get(skb, &ctinfo);
-	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
-
-	/* Local packets: make them go to loopback */
-	if (par->hooknum == NF_INET_LOCAL_OUT)
-		newdst = htonl(0x7F000001);
-	else {
-		struct in_device *indev;
-		struct in_ifaddr *ifa;
-
-		newdst = 0;
-
-		rcu_read_lock();
-		indev = __in_dev_get_rcu(skb->dev);
-		if (indev && (ifa = indev->ifa_list))
-			newdst = ifa->ifa_local;
-		rcu_read_unlock();
-
-		if (!newdst)
-			return NF_DROP;
-	}
-
-	/* Transfer from original range. */
-	newrange = ((struct nf_nat_ipv4_range)
-		{ mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
-		  newdst, newdst,
-		  mr->range[0].min, mr->range[0].max });
-
-	/* Hand modified range to generic setup. */
-	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
-}
-
-static struct xt_target redirect_tg_reg __read_mostly = {
-	.name		= "REDIRECT",
-	.family		= NFPROTO_IPV4,
-	.target		= redirect_tg,
-	.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat),
-	.table		= "nat",
-	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
-	.checkentry	= redirect_tg_check,
-	.me		= THIS_MODULE,
-};
-
-static int __init redirect_tg_init(void)
-{
-	return xt_register_target(&redirect_tg_reg);
-}
-
-static void __exit redirect_tg_exit(void)
-{
-	xt_unregister_target(&redirect_tg_reg);
-}
-
-module_init(redirect_tg_init);
-module_exit(redirect_tg_exit);
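
[Annotation: the destination-selection rule that the deleted redirect_tg() above implemented is simple: locally generated packets are pointed at 127.0.0.1, everything else at the first address configured on the receiving interface, and a zero address means drop. A minimal user-space sketch of just that rule; pick_redirect_dst() and first_ifa_addr are illustrative stand-ins, not kernel APIs.]

#include <stdint.h>
#include <arpa/inet.h>

enum hook { HOOK_LOCAL_OUT, HOOK_PRE_ROUTING };

/* first_ifa_addr stands in for the __in_dev_get_rcu()/ifa_list walk;
 * it is the interface's first configured IPv4 address, or 0 if none. */
static uint32_t pick_redirect_dst(enum hook h, uint32_t first_ifa_addr)
{
	if (h == HOOK_LOCAL_OUT)
		return htonl(0x7F000001);	/* 127.0.0.1 */
	return first_ifa_addr;			/* 0 -> caller drops */
}
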
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 1109f7f6c25..b5ef3cba225 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -396,8 +396,7 @@ static int __init ulog_tg_init(void)
 	for (i = 0; i < ULOG_MAXNLGROUPS; i++)
 		setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
 
-	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
-					THIS_MODULE, &cfg);
+	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
 	if (!nflognl)
 		return -ENOMEM;
 
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 31371be8174..c30130062cd 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			return ipv4_is_local_multicast(iph->daddr) ^ invert;
 		flow.flowi4_iif = 0;
 	} else {
-		flow.flowi4_iif = dev_net(par->in)->loopback_dev->ifindex;
+		flow.flowi4_iif = LOOPBACK_IFINDEX;
 	}
 
 	flow.daddr = iph->saddr;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 851acec852d..6b3da5cf54e 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -69,9 +69,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
 	net->ipv4.iptable_filter =
 		ipt_register_table(net, &packet_filter, repl);
 	kfree(repl);
-	if (IS_ERR(net->ipv4.iptable_filter))
-		return PTR_ERR(net->ipv4.iptable_filter);
-	return 0;
+	return PTR_RET(net->ipv4.iptable_filter);
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
@@ -96,14 +94,10 @@ static int __init iptable_filter_init(void)
 	filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
 	if (IS_ERR(filter_ops)) {
 		ret = PTR_ERR(filter_ops);
-		goto cleanup_table;
+		unregister_pernet_subsys(&iptable_filter_net_ops);
 	}
 
 	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&iptable_filter_net_ops);
-	return ret;
 }
 
 static void __exit iptable_filter_fini(void)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index aef5d1fbe77..85d88f20644 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -104,9 +104,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
 	net->ipv4.iptable_mangle =
 		ipt_register_table(net, &packet_mangler, repl);
 	kfree(repl);
-	if (IS_ERR(net->ipv4.iptable_mangle))
-		return PTR_ERR(net->ipv4.iptable_mangle);
-	return 0;
+	return PTR_RET(net->ipv4.iptable_mangle);
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
@@ -131,14 +129,10 @@ static int __init iptable_mangle_init(void)
 	mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
 	if (IS_ERR(mangle_ops)) {
 		ret = PTR_ERR(mangle_ops);
-		goto cleanup_table;
+		unregister_pernet_subsys(&iptable_mangle_net_ops);
 	}
 
 	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&iptable_mangle_net_ops);
-	return ret;
 }
 
 static void __exit iptable_mangle_fini(void)
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/iptable_nat.c
index 3828a422982..9e0ffaf1d94 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -1,84 +1,71 @@
 /* (C) 1999-2001 Paul `Rusty' Russell
  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/types.h>
-#include <linux/icmp.h>
-#include <linux/gfp.h>
-#include <linux/ip.h>
+
+#include <linux/module.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/ip.h>
 #include <net/ip.h>
-#include <net/checksum.h>
-#include <linux/spinlock.h>
 
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
 #include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+
+static const struct xt_table nf_nat_ipv4_table = {
+	.name		= "nat",
+	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+};
 
-#ifdef CONFIG_XFRM
-static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
+static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
 {
-	struct flowi4 *fl4 = &fl->u.ip4;
-	const struct nf_conn *ct;
-	const struct nf_conntrack_tuple *t;
-	enum ip_conntrack_info ctinfo;
-	enum ip_conntrack_dir dir;
-	unsigned long statusbit;
-
-	ct = nf_ct_get(skb, &ctinfo);
-	if (ct == NULL)
-		return;
-	dir = CTINFO2DIR(ctinfo);
-	t = &ct->tuplehash[dir].tuple;
-
-	if (dir == IP_CT_DIR_ORIGINAL)
-		statusbit = IPS_DST_NAT;
-	else
-		statusbit = IPS_SRC_NAT;
-
-	if (ct->status & statusbit) {
-		fl4->daddr = t->dst.u3.ip;
-		if (t->dst.protonum == IPPROTO_TCP ||
-		    t->dst.protonum == IPPROTO_UDP ||
-		    t->dst.protonum == IPPROTO_UDPLITE ||
-		    t->dst.protonum == IPPROTO_DCCP ||
-		    t->dst.protonum == IPPROTO_SCTP)
-			fl4->fl4_dport = t->dst.u.tcp.port;
-	}
+	/* Force range to this IP; let proto decide mapping for
+	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+	 */
+	struct nf_nat_range range;
+
+	range.flags = 0;
+	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+		 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
+	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+}
 
-	statusbit ^= IPS_NAT_MASK;
+static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
+				     const struct net_device *in,
+				     const struct net_device *out,
+				     struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+	unsigned int ret;
 
-	if (ct->status & statusbit) {
-		fl4->saddr = t->src.u3.ip;
-		if (t->dst.protonum == IPPROTO_TCP ||
-		    t->dst.protonum == IPPROTO_UDP ||
-		    t->dst.protonum == IPPROTO_UDPLITE ||
-		    t->dst.protonum == IPPROTO_DCCP ||
-		    t->dst.protonum == IPPROTO_SCTP)
-			fl4->fl4_sport = t->src.u.tcp.port;
+	ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
+	if (ret == NF_ACCEPT) {
+		if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
+			ret = alloc_null_binding(ct, hooknum);
 	}
+	return ret;
 }
-#endif
 
 static unsigned int
-nf_nat_fn(unsigned int hooknum,
-	  struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  int (*okfn)(struct sk_buff *))
+nf_nat_ipv4_fn(unsigned int hooknum,
+	       struct sk_buff *skb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       int (*okfn)(struct sk_buff *))
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -87,14 +74,16 @@ nf_nat_fn(unsigned int hooknum,
 	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
 
 	/* We never see fragments: conntrack defrags on pre-routing
-	   and local-out, and nf_nat_out protects post-routing. */
+	 * and local-out, and nf_nat_out protects post-routing.
+	 */
 	NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
 
 	ct = nf_ct_get(skb, &ctinfo);
 	/* Can't track?  It's not due to stress, or conntrack would
-	   have dropped it.  Hence it's the user's responsibilty to
-	   packet filter it out, or implement conntrack/NAT for that
-	   protocol. 8) --RR */
+	 * have dropped it.  Hence it's the user's responsibilty to
+	 * packet filter it out, or implement conntrack/NAT for that
+	 * protocol. 8) --RR
+	 */
 	if (!ct)
 		return NF_ACCEPT;
 
@@ -118,17 +107,17 @@ nf_nat_fn(unsigned int hooknum,
 	case IP_CT_RELATED:
 	case IP_CT_RELATED_REPLY:
 		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
-			if (!nf_nat_icmp_reply_translation(ct, ctinfo,
-							   hooknum, skb))
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
 				return NF_DROP;
 			else
 				return NF_ACCEPT;
 		}
 		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
 	case IP_CT_NEW:
-
 		/* Seen it before?  This can happen for loopback, retrans,
-		   or local packets.. */
+		 * or local packets.
+		 */
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
@@ -151,16 +140,16 @@ nf_nat_fn(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_in(unsigned int hooknum,
-	  struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  int (*okfn)(struct sk_buff *))
+nf_nat_ipv4_in(unsigned int hooknum,
+	       struct sk_buff *skb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       int (*okfn)(struct sk_buff *))
 {
 	unsigned int ret;
 	__be32 daddr = ip_hdr(skb)->daddr;
 
-	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    daddr != ip_hdr(skb)->daddr)
 		skb_dst_drop(skb);
@@ -169,11 +158,11 @@ nf_nat_in(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_out(unsigned int hooknum,
-	   struct sk_buff *skb,
-	   const struct net_device *in,
-	   const struct net_device *out,
-	   int (*okfn)(struct sk_buff *))
+nf_nat_ipv4_out(unsigned int hooknum,
+		struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		int (*okfn)(struct sk_buff *))
 {
 #ifdef CONFIG_XFRM
 	const struct nf_conn *ct;
@@ -186,29 +175,30 @@ nf_nat_out(unsigned int hooknum,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 
 		if ((ct->tuplehash[dir].tuple.src.u3.ip !=
 		     ct->tuplehash[!dir].tuple.dst.u3.ip) ||
 		    (ct->tuplehash[dir].tuple.src.u.all !=
-		     ct->tuplehash[!dir].tuple.dst.u.all)
-		   )
-			return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP;
+		     ct->tuplehash[!dir].tuple.dst.u.all))
+			if (nf_xfrm_me_harder(skb, AF_INET) < 0)
+				ret = NF_DROP;
 	}
 #endif
 	return ret;
 }
 
 static unsigned int
-nf_nat_local_fn(unsigned int hooknum,
-		struct sk_buff *skb,
-		const struct net_device *in,
-		const struct net_device *out,
-		int (*okfn)(struct sk_buff *))
+nf_nat_ipv4_local_fn(unsigned int hooknum,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
 {
 	const struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -219,7 +209,7 @@ nf_nat_local_fn(unsigned int hooknum,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -230,21 +220,20 @@ nf_nat_local_fn(unsigned int hooknum,
 			ret = NF_DROP;
 	}
 #ifdef CONFIG_XFRM
-	else if (ct->tuplehash[dir].tuple.dst.u.all !=
+	else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+		 ct->tuplehash[dir].tuple.dst.u.all !=
 		 ct->tuplehash[!dir].tuple.src.u.all)
-		if (ip_xfrm_me_harder(skb))
+		if (nf_xfrm_me_harder(skb, AF_INET) < 0)
 			ret = NF_DROP;
 #endif
 	}
 	return ret;
 }
 
-/* We must be after connection tracking and before packet filtering. */
-
-static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
+static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
 	/* Before packet filtering, change destination */
 	{
-		.hook		= nf_nat_in,
+		.hook		= nf_nat_ipv4_in,
 		.owner		= THIS_MODULE,
 		.pf		= NFPROTO_IPV4,
 		.hooknum	= NF_INET_PRE_ROUTING,
@@ -252,7 +241,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
 	},
 	/* After packet filtering, change source */
 	{
-		.hook		= nf_nat_out,
+		.hook		= nf_nat_ipv4_out,
 		.owner		= THIS_MODULE,
 		.pf		= NFPROTO_IPV4,
 		.hooknum	= NF_INET_POST_ROUTING,
@@ -260,7 +249,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
 	},
 	/* Before packet filtering, change destination */
 	{
-		.hook		= nf_nat_local_fn,
+		.hook		= nf_nat_ipv4_local_fn,
 		.owner		= THIS_MODULE,
 		.pf		= NFPROTO_IPV4,
 		.hooknum	= NF_INET_LOCAL_OUT,
@@ -268,7 +257,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
 	},
 	/* After packet filtering, change source */
 	{
-		.hook		= nf_nat_fn,
+		.hook		= nf_nat_ipv4_fn,
 		.owner		= THIS_MODULE,
 		.pf		= NFPROTO_IPV4,
 		.hooknum	= NF_INET_LOCAL_IN,
@@ -276,51 +265,56 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
 	},
 };
 
-static int __init nf_nat_standalone_init(void)
+static int __net_init iptable_nat_net_init(struct net *net)
 {
-	int ret = 0;
+	struct ipt_replace *repl;
+
+	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
+	if (repl == NULL)
+		return -ENOMEM;
+	net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+	kfree(repl);
+	if (IS_ERR(net->ipv4.nat_table))
+		return PTR_ERR(net->ipv4.nat_table);
+	return 0;
+}
 
-	need_ipv4_conntrack();
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+	ipt_unregister_table(net, net->ipv4.nat_table);
+}
 
-#ifdef CONFIG_XFRM
-	BUG_ON(ip_nat_decode_session != NULL);
-	RCU_INIT_POINTER(ip_nat_decode_session, nat_decode_session);
-#endif
-	ret = nf_nat_rule_init();
-	if (ret < 0) {
-		pr_err("nf_nat_init: can't setup rules.\n");
-		goto cleanup_decode_session;
-	}
-	ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
-	if (ret < 0) {
-		pr_err("nf_nat_init: can't register hooks.\n");
-		goto cleanup_rule_init;
-	}
-	return ret;
+static struct pernet_operations iptable_nat_net_ops = {
+	.init	= iptable_nat_net_init,
+	.exit	= iptable_nat_net_exit,
+};
 
- cleanup_rule_init:
-	nf_nat_rule_cleanup();
- cleanup_decode_session:
-#ifdef CONFIG_XFRM
-	RCU_INIT_POINTER(ip_nat_decode_session, NULL);
-	synchronize_net();
-#endif
-	return ret;
+static int __init iptable_nat_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&iptable_nat_net_ops);
+	if (err < 0)
+		goto err1;
+
+	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+	if (err < 0)
+		goto err2;
+	return 0;
+
+err2:
+	unregister_pernet_subsys(&iptable_nat_net_ops);
+err1:
+	return err;
 }
 
-static void __exit nf_nat_standalone_fini(void)
+static void __exit iptable_nat_exit(void)
 {
-	nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
-	nf_nat_rule_cleanup();
-#ifdef CONFIG_XFRM
-	RCU_INIT_POINTER(ip_nat_decode_session, NULL);
-	synchronize_net();
-#endif
-	/* Conntrack caches are unregistered in nf_conntrack_cleanup */
+	nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+	unregister_pernet_subsys(&iptable_nat_net_ops);
 }
 
-module_init(nf_nat_standalone_init);
-module_exit(nf_nat_standalone_fini);
+module_init(iptable_nat_init);
+module_exit(iptable_nat_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat");
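
[Annotation: this file leans on the hook-to-manip convention: PRE_ROUTING and LOCAL_OUT hooks rewrite the destination, POST_ROUTING and LOCAL_IN rewrite the source. A minimal sketch of that mapping, assuming the usual semantics of the kernel's HOOK2MANIP() macro; hook2manip() and the enums below are illustrative stand-ins, not kernel definitions.]

enum manip { MANIP_SRC, MANIP_DST };
enum hook { PRE_ROUTING, LOCAL_IN, FORWARD, LOCAL_OUT, POST_ROUTING };

/* Packets entering or leaving the local stack before routing get their
 * destination rewritten (DNAT/redirect); packets past routing or about
 * to be delivered locally get their source rewritten (SNAT/masquerade). */
static enum manip hook2manip(enum hook h)
{
	return (h == POST_ROUTING || h == LOCAL_IN) ? MANIP_SRC : MANIP_DST;
}
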
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 07fb710cd72..03d9696d3c6 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -48,9 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
 	net->ipv4.iptable_raw =
 		ipt_register_table(net, &packet_raw, repl);
 	kfree(repl);
-	if (IS_ERR(net->ipv4.iptable_raw))
-		return PTR_ERR(net->ipv4.iptable_raw);
-	return 0;
+	return PTR_RET(net->ipv4.iptable_raw);
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
@@ -75,14 +73,10 @@ static int __init iptable_raw_init(void)
 	rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
 	if (IS_ERR(rawtable_ops)) {
 		ret = PTR_ERR(rawtable_ops);
-		goto cleanup_table;
+		unregister_pernet_subsys(&iptable_raw_net_ops);
 	}
 
 	return ret;
-
- cleanup_table:
-	unregister_pernet_subsys(&iptable_raw_net_ops);
-	return ret;
 }
 
 static void __exit iptable_raw_fini(void)
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index be45bdc4c60..b283d8e2601 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -66,10 +66,7 @@ static int __net_init iptable_security_net_init(struct net *net)
 	net->ipv4.iptable_security =
 		ipt_register_table(net, &security_table, repl);
 	kfree(repl);
-	if (IS_ERR(net->ipv4.iptable_security))
-		return PTR_ERR(net->ipv4.iptable_security);
-
-	return 0;
+	return PTR_RET(net->ipv4.iptable_security);
 }
 
 static void __net_exit iptable_security_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index e7ff2dcab6c..fcdd0c2406e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -29,11 +29,6 @@
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
 #include <net/netfilter/nf_log.h>
 
-int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
-			      struct nf_conn *ct,
-			      enum ip_conntrack_info ctinfo);
-EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);
-
 static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
 			      struct nf_conntrack_tuple *tuple)
 {
@@ -149,7 +144,8 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
 		typeof(nf_nat_seq_adjust_hook) seq_adjust;
 
 		seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
-		if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) {
+		if (!seq_adjust ||
+		    !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
 			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
 			return NF_DROP;
 		}
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
deleted file mode 100644
index 3c04d24e297..00000000000
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Amanda extension for TCP NAT alteration.
- * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
- * based on a copy of HW's ip_nat_irc.c as well as other modules
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/udp.h>
-
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <linux/netfilter/nf_conntrack_amanda.h>
-
-MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
-MODULE_DESCRIPTION("Amanda NAT helper");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_amanda");
-
-static unsigned int help(struct sk_buff *skb,
-			 enum ip_conntrack_info ctinfo,
-			 unsigned int matchoff,
-			 unsigned int matchlen,
-			 struct nf_conntrack_expect *exp)
-{
-	char buffer[sizeof("65535")];
-	u_int16_t port;
-	unsigned int ret;
-
-	/* Connection comes from client. */
-	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
-	exp->dir = IP_CT_DIR_ORIGINAL;
-
-	/* When you see the packet, we need to NAT it the same as the
-	 * this one (ie. same IP: it will be TCP and master is UDP). */
-	exp->expectfn = nf_nat_follow_master;
-
-	/* Try to get same port: if not, try to change it. */
-	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
-		int res;
-
-		exp->tuple.dst.u.tcp.port = htons(port);
-		res = nf_ct_expect_related(exp);
-		if (res == 0)
-			break;
-		else if (res != -EBUSY) {
-			port = 0;
-			break;
-		}
-	}
-
-	if (port == 0)
-		return NF_DROP;
-
-	sprintf(buffer, "%u", port);
-	ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
-				       matchoff, matchlen,
-				       buffer, strlen(buffer));
-	if (ret != NF_ACCEPT)
-		nf_ct_unexpect_related(exp);
-	return ret;
-}
-
-static void __exit nf_nat_amanda_fini(void)
-{
-	RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
-	synchronize_rcu();
-}
-
-static int __init nf_nat_amanda_init(void)
-{
-	BUG_ON(nf_nat_amanda_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_amanda_hook, help);
-	return 0;
-}
-
-module_init(nf_nat_amanda_init);
-module_exit(nf_nat_amanda_fini);
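
[Annotation: the port-search loop in help() above is a pattern shared by several of the NAT helpers in this commit (the FTP helper below uses the same shape): keep the port the client asked for if the expectation registers, otherwise walk upward until one sticks or the 16-bit counter wraps to 0. A standalone sketch under that assumption; try_register() stands in for nf_ct_expect_related().]

#include <stdint.h>
#include <errno.h>

static uint16_t find_expect_port(uint16_t wanted,
				 int (*try_register)(uint16_t))
{
	uint16_t port;

	for (port = wanted; port != 0; port++) {
		int res = try_register(port);

		if (res == 0)
			return port;	/* registered: keep this port */
		if (res != -EBUSY)
			return 0;	/* hard error: caller drops */
	}
	return 0;			/* wrapped to 0: range exhausted */
}
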
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
deleted file mode 100644
index 44b082fd48a..00000000000
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ /dev/null
@@ -1,763 +0,0 @@
-/* NAT for netfilter; shared with compatibility layer. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/skbuff.h>
-#include <linux/gfp.h>
-#include <net/checksum.h>
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/tcp.h>  /* For tcp_prot in getorigdst */
-#include <linux/icmp.h>
-#include <linux/udp.h>
-#include <linux/jhash.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-
-static DEFINE_SPINLOCK(nf_nat_lock);
-
-static struct nf_conntrack_l3proto *l3proto __read_mostly;
-
-#define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
-						__read_mostly;
-
-static inline const struct nf_nat_protocol *
-__nf_nat_proto_find(u_int8_t protonum)
-{
-	return rcu_dereference(nf_nat_protos[protonum]);
-}
-
-/* We keep an extra hash for each conntrack, for fast searching. */
-static inline unsigned int
-hash_by_src(const struct net *net, u16 zone,
-	    const struct nf_conntrack_tuple *tuple)
-{
-	unsigned int hash;
-
-	/* Original src, to ensure we map it consistently if poss. */
-	hash = jhash_3words((__force u32)tuple->src.u3.ip,
-			    (__force u32)tuple->src.u.all ^ zone,
-			    tuple->dst.protonum, nf_conntrack_hash_rnd);
-	return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
-}
-
-/* Is this tuple already taken? (not by us) */
-int
-nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
-		  const struct nf_conn *ignored_conntrack)
-{
-	/* Conntrack tracking doesn't keep track of outgoing tuples; only
-	   incoming ones.  NAT means they don't have a fixed mapping,
-	   so we invert the tuple and look for the incoming reply.
-
-	   We could keep a separate hash if this proves too slow. */
-	struct nf_conntrack_tuple reply;
-
-	nf_ct_invert_tuplepr(&reply, tuple);
-	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
-}
-EXPORT_SYMBOL(nf_nat_used_tuple);
-
-/* If we source map this tuple so reply looks like reply_tuple, will
- * that meet the constraints of range. */
-static int
-in_range(const struct nf_conntrack_tuple *tuple,
-	 const struct nf_nat_ipv4_range *range)
-{
-	const struct nf_nat_protocol *proto;
-	int ret = 0;
-
-	/* If we are supposed to map IPs, then we must be in the
-	   range specified, otherwise let this drag us onto a new src IP. */
-	if (range->flags & NF_NAT_RANGE_MAP_IPS) {
-		if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
-		    ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
-			return 0;
-	}
-
-	rcu_read_lock();
-	proto = __nf_nat_proto_find(tuple->dst.protonum);
-	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
-	    proto->in_range(tuple, NF_NAT_MANIP_SRC,
-			    &range->min, &range->max))
-		ret = 1;
-	rcu_read_unlock();
-
-	return ret;
-}
-
-static inline int
-same_src(const struct nf_conn *ct,
-	 const struct nf_conntrack_tuple *tuple)
-{
-	const struct nf_conntrack_tuple *t;
-
-	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	return (t->dst.protonum == tuple->dst.protonum &&
-		t->src.u3.ip == tuple->src.u3.ip &&
-		t->src.u.all == tuple->src.u.all);
-}
-
-/* Only called for SRC manip */
-static int
-find_appropriate_src(struct net *net, u16 zone,
-		     const struct nf_conntrack_tuple *tuple,
-		     struct nf_conntrack_tuple *result,
-		     const struct nf_nat_ipv4_range *range)
-{
-	unsigned int h = hash_by_src(net, zone, tuple);
-	const struct nf_conn_nat *nat;
-	const struct nf_conn *ct;
-	const struct hlist_node *n;
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
-		ct = nat->ct;
-		if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
-			/* Copy source part from reply tuple. */
-			nf_ct_invert_tuplepr(result,
-				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-			result->dst = tuple->dst;
-
-			if (in_range(result, range)) {
-				rcu_read_unlock();
-				return 1;
-			}
-		}
-	}
-	rcu_read_unlock();
-	return 0;
-}
-
-/* For [FUTURE] fragmentation handling, we want the least-used
-   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
-   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
-   1-65535, we don't do pro-rata allocation based on ports; we choose
-   the ip with the lowest src-ip/dst-ip/proto usage.
-*/
-static void
-find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
-		    const struct nf_nat_ipv4_range *range,
-		    const struct nf_conn *ct,
-		    enum nf_nat_manip_type maniptype)
-{
-	__be32 *var_ipp;
-	/* Host order */
-	u_int32_t minip, maxip, j;
-
-	/* No IP mapping?  Do nothing. */
-	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
-		return;
-
-	if (maniptype == NF_NAT_MANIP_SRC)
-		var_ipp = &tuple->src.u3.ip;
-	else
-		var_ipp = &tuple->dst.u3.ip;
-
-	/* Fast path: only one choice. */
-	if (range->min_ip == range->max_ip) {
-		*var_ipp = range->min_ip;
-		return;
-	}
-
-	/* Hashing source and destination IPs gives a fairly even
-	 * spread in practice (if there are a small number of IPs
-	 * involved, there usually aren't that many connections
-	 * anyway).  The consistency means that servers see the same
-	 * client coming from the same IP (some Internet Banking sites
-	 * like this), even across reboots. */
-	minip = ntohl(range->min_ip);
-	maxip = ntohl(range->max_ip);
-	j = jhash_2words((__force u32)tuple->src.u3.ip,
-			 range->flags & NF_NAT_RANGE_PERSISTENT ?
-				0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
-	j = ((u64)j * (maxip - minip + 1)) >> 32;
-	*var_ipp = htonl(minip + j);
-}
-
-/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
- * we change the source to map into the range. For NF_INET_PRE_ROUTING
- * and NF_INET_LOCAL_OUT, we change the destination to map into the
- * range. It might not be possible to get a unique tuple, but we try.
- * At worst (or if we race), we will end up with a final duplicate in
- * __ip_conntrack_confirm and drop the packet. */
-static void
-get_unique_tuple(struct nf_conntrack_tuple *tuple,
-		 const struct nf_conntrack_tuple *orig_tuple,
-		 const struct nf_nat_ipv4_range *range,
-		 struct nf_conn *ct,
-		 enum nf_nat_manip_type maniptype)
-{
-	struct net *net = nf_ct_net(ct);
-	const struct nf_nat_protocol *proto;
-	u16 zone = nf_ct_zone(ct);
-
-	/* 1) If this srcip/proto/src-proto-part is currently mapped,
-	   and that same mapping gives a unique tuple within the given
-	   range, use that.
-
-	   This is only required for source (ie. NAT/masq) mappings.
-	   So far, we don't do local source mappings, so multiple
-	   manips not an issue. */
-	if (maniptype == NF_NAT_MANIP_SRC &&
-	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
-		/* try the original tuple first */
-		if (in_range(orig_tuple, range)) {
-			if (!nf_nat_used_tuple(orig_tuple, ct)) {
-				*tuple = *orig_tuple;
-				return;
-			}
-		} else if (find_appropriate_src(net, zone, orig_tuple, tuple,
-						range)) {
-			pr_debug("get_unique_tuple: Found current src map\n");
-			if (!nf_nat_used_tuple(tuple, ct))
-				return;
-		}
-	}
-
-	/* 2) Select the least-used IP/proto combination in the given
-	   range. */
-	*tuple = *orig_tuple;
-	find_best_ips_proto(zone, tuple, range, ct, maniptype);
-
-	/* 3) The per-protocol part of the manip is made to map into
-	   the range to make a unique tuple. */
-
-	rcu_read_lock();
-	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
-
-	/* Only bother mapping if it's not already in range and unique */
-	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
-		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
-			if (proto->in_range(tuple, maniptype, &range->min,
-					    &range->max) &&
-			    (range->min.all == range->max.all ||
-			     !nf_nat_used_tuple(tuple, ct)))
-				goto out;
-		} else if (!nf_nat_used_tuple(tuple, ct)) {
-			goto out;
-		}
-	}
-
-	/* Last change: get protocol to try to obtain unique tuple. */
-	proto->unique_tuple(tuple, range, maniptype, ct);
-out:
-	rcu_read_unlock();
-}
-
-unsigned int
-nf_nat_setup_info(struct nf_conn *ct,
-		  const struct nf_nat_ipv4_range *range,
-		  enum nf_nat_manip_type maniptype)
-{
-	struct net *net = nf_ct_net(ct);
-	struct nf_conntrack_tuple curr_tuple, new_tuple;
-	struct nf_conn_nat *nat;
-
-	/* nat helper or nfctnetlink also setup binding */
-	nat = nfct_nat(ct);
-	if (!nat) {
-		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
-		if (nat == NULL) {
-			pr_debug("failed to add NAT extension\n");
-			return NF_ACCEPT;
-		}
-	}
-
-	NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
-		     maniptype == NF_NAT_MANIP_DST);
-	BUG_ON(nf_nat_initialized(ct, maniptype));
-
-	/* What we've got will look like inverse of reply. Normally
-	   this is what is in the conntrack, except for prior
-	   manipulations (future optimization: if num_manips == 0,
-	   orig_tp =
-	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
-	nf_ct_invert_tuplepr(&curr_tuple,
-			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
-	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
-
-	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
-		struct nf_conntrack_tuple reply;
-
-		/* Alter conntrack table so will recognize replies. */
-		nf_ct_invert_tuplepr(&reply, &new_tuple);
-		nf_conntrack_alter_reply(ct, &reply);
-
-		/* Non-atomic: we own this at the moment. */
-		if (maniptype == NF_NAT_MANIP_SRC)
-			ct->status |= IPS_SRC_NAT;
-		else
-			ct->status |= IPS_DST_NAT;
-	}
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		unsigned int srchash;
-
-		srchash = hash_by_src(net, nf_ct_zone(ct),
-				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-		spin_lock_bh(&nf_nat_lock);
-		/* nf_conntrack_alter_reply might re-allocate extension area */
-		nat = nfct_nat(ct);
-		nat->ct = ct;
-		hlist_add_head_rcu(&nat->bysource,
-				   &net->ipv4.nat_bysource[srchash]);
-		spin_unlock_bh(&nf_nat_lock);
-	}
-
-	/* It's done. */
-	if (maniptype == NF_NAT_MANIP_DST)
-		ct->status |= IPS_DST_NAT_DONE;
-	else
-		ct->status |= IPS_SRC_NAT_DONE;
-
-	return NF_ACCEPT;
-}
-EXPORT_SYMBOL(nf_nat_setup_info);
-
-/* Returns true if succeeded. */
-static bool
-manip_pkt(u_int16_t proto,
-	  struct sk_buff *skb,
-	  unsigned int iphdroff,
-	  const struct nf_conntrack_tuple *target,
-	  enum nf_nat_manip_type maniptype)
-{
-	struct iphdr *iph;
-	const struct nf_nat_protocol *p;
-
-	if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
-		return false;
-
-	iph = (void *)skb->data + iphdroff;
-
-	/* Manipulate protcol part. */
-
-	/* rcu_read_lock()ed by nf_hook_slow */
-	p = __nf_nat_proto_find(proto);
-	if (!p->manip_pkt(skb, iphdroff, target, maniptype))
-		return false;
-
-	iph = (void *)skb->data + iphdroff;
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
-		iph->saddr = target->src.u3.ip;
-	} else {
-		csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
-		iph->daddr = target->dst.u3.ip;
-	}
-	return true;
-}
-
-/* Do packet manipulations according to nf_nat_setup_info. */
-unsigned int nf_nat_packet(struct nf_conn *ct,
-			   enum ip_conntrack_info ctinfo,
-			   unsigned int hooknum,
-			   struct sk_buff *skb)
-{
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	unsigned long statusbit;
-	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
-
-	if (mtype == NF_NAT_MANIP_SRC)
-		statusbit = IPS_SRC_NAT;
-	else
-		statusbit = IPS_DST_NAT;
-
-	/* Invert if this is reply dir. */
-	if (dir == IP_CT_DIR_REPLY)
-		statusbit ^= IPS_NAT_MASK;
-
-	/* Non-atomic: these bits don't change. */
-	if (ct->status & statusbit) {
-		struct nf_conntrack_tuple target;
-
-		/* We are aiming to look like inverse of other direction. */
-		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-
-		if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
-			return NF_DROP;
-	}
-	return NF_ACCEPT;
-}
-EXPORT_SYMBOL_GPL(nf_nat_packet);
-
-/* Dir is direction ICMP is coming from (opposite to packet it contains) */
-int nf_nat_icmp_reply_translation(struct nf_conn *ct,
-				  enum ip_conntrack_info ctinfo,
-				  unsigned int hooknum,
-				  struct sk_buff *skb)
-{
-	struct {
-		struct icmphdr icmp;
-		struct iphdr ip;
-	} *inside;
-	struct nf_conntrack_tuple target;
-	int hdrlen = ip_hdrlen(skb);
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	unsigned long statusbit;
-	enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
-
-	if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
-		return 0;
-
-	inside = (void *)skb->data + hdrlen;
-
-	/* We're actually going to mangle it beyond trivial checksum
-	   adjustment, so make sure the current checksum is correct. */
-	if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
-		return 0;
-
-	/* Must be RELATED */
-	NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
-		     skb->nfctinfo == IP_CT_RELATED_REPLY);
-
-	/* Redirects on non-null nats must be dropped, else they'll
-	   start talking to each other without our translation, and be
-	   confused... --RR */
-	if (inside->icmp.type == ICMP_REDIRECT) {
-		/* If NAT isn't finished, assume it and drop. */
-		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
-			return 0;
-
-		if (ct->status & IPS_NAT_MASK)
-			return 0;
-	}
-
-	if (manip == NF_NAT_MANIP_SRC)
-		statusbit = IPS_SRC_NAT;
-	else
-		statusbit = IPS_DST_NAT;
-
-	/* Invert if this is reply dir. */
-	if (dir == IP_CT_DIR_REPLY)
-		statusbit ^= IPS_NAT_MASK;
-
-	if (!(ct->status & statusbit))
-		return 1;
-
-	pr_debug("icmp_reply_translation: translating error %p manip %u "
-		 "dir %s\n", skb, manip,
-		 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
-
-	/* Change inner back to look like incoming packet.  We do the
-	   opposite manip on this hook to normal, because it might not
-	   pass all hooks (locally-generated ICMP).  Consider incoming
-	   packet: PREROUTING (DST manip), routing produces ICMP, goes
-	   through POSTROUTING (which must correct the DST manip). */
-	if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
-		       &ct->tuplehash[!dir].tuple, !manip))
-		return 0;
-
-	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		/* Reloading "inside" here since manip_pkt inner. */
-		inside = (void *)skb->data + hdrlen;
-		inside->icmp.checksum = 0;
-		inside->icmp.checksum =
-			csum_fold(skb_checksum(skb, hdrlen,
-					       skb->len - hdrlen, 0));
-	}
-
-	/* Change outer to look the reply to an incoming packet
-	 * (proto 0 means don't invert per-proto part). */
-	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-	if (!manip_pkt(0, skb, 0, &target, manip))
-		return 0;
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
-
-/* Protocol registration. */
-int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
-{
-	int ret = 0;
-
-	spin_lock_bh(&nf_nat_lock);
-	if (rcu_dereference_protected(
-			nf_nat_protos[proto->protonum],
-			lockdep_is_held(&nf_nat_lock)
-			) != &nf_nat_unknown_protocol) {
-		ret = -EBUSY;
-		goto out;
-	}
-	RCU_INIT_POINTER(nf_nat_protos[proto->protonum], proto);
- out:
-	spin_unlock_bh(&nf_nat_lock);
-	return ret;
-}
-EXPORT_SYMBOL(nf_nat_protocol_register);
-
-/* No one stores the protocol anywhere; simply delete it. */
-void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
-{
-	spin_lock_bh(&nf_nat_lock);
-	RCU_INIT_POINTER(nf_nat_protos[proto->protonum],
-			 &nf_nat_unknown_protocol);
-	spin_unlock_bh(&nf_nat_lock);
-	synchronize_rcu();
-}
-EXPORT_SYMBOL(nf_nat_protocol_unregister);
-
-/* No one using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
-	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
-
-	if (nat == NULL || nat->ct == NULL)
-		return;
-
-	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
-
-	spin_lock_bh(&nf_nat_lock);
-	hlist_del_rcu(&nat->bysource);
-	spin_unlock_bh(&nf_nat_lock);
-}
-
-static void nf_nat_move_storage(void *new, void *old)
-{
-	struct nf_conn_nat *new_nat = new;
-	struct nf_conn_nat *old_nat = old;
-	struct nf_conn *ct = old_nat->ct;
-
-	if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
-		return;
-
-	spin_lock_bh(&nf_nat_lock);
-	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
-	spin_unlock_bh(&nf_nat_lock);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
-	.len		= sizeof(struct nf_conn_nat),
-	.align		= __alignof__(struct nf_conn_nat),
-	.destroy	= nf_nat_cleanup_conntrack,
-	.move		= nf_nat_move_storage,
-	.id		= NF_CT_EXT_NAT,
-	.flags		= NF_CT_EXT_F_PREALLOC,
-};
-
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
-	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
-	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
-};
-
-static int nfnetlink_parse_nat_proto(struct nlattr *attr,
-				     const struct nf_conn *ct,
-				     struct nf_nat_ipv4_range *range)
-{
-	struct nlattr *tb[CTA_PROTONAT_MAX+1];
-	const struct nf_nat_protocol *npt;
-	int err;
-
-	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
-	if (err < 0)
-		return err;
-
-	rcu_read_lock();
-	npt = __nf_nat_proto_find(nf_ct_protonum(ct));
-	if (npt->nlattr_to_range)
-		err = npt->nlattr_to_range(tb, range);
-	rcu_read_unlock();
-	return err;
-}
-
-static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
-	[CTA_NAT_MINIP]		= { .type = NLA_U32 },
-	[CTA_NAT_MAXIP]		= { .type = NLA_U32 },
-	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
-};
-
-static int
-nfnetlink_parse_nat(const struct nlattr *nat,
-		    const struct nf_conn *ct, struct nf_nat_ipv4_range *range)
-{
-	struct nlattr *tb[CTA_NAT_MAX+1];
-	int err;
-
-	memset(range, 0, sizeof(*range));
-
-	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
-	if (err < 0)
-		return err;
-
-	if (tb[CTA_NAT_MINIP])
-		range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
-
-	if (!tb[CTA_NAT_MAXIP])
-		range->max_ip = range->min_ip;
-	else
-		range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
-
-	if (range->min_ip)
-		range->flags |= NF_NAT_RANGE_MAP_IPS;
-
-	if (!tb[CTA_NAT_PROTO])
-		return 0;
-
-	err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
-	if (err < 0)
-		return err;
-
-	return 0;
-}
-
-static int
-nfnetlink_parse_nat_setup(struct nf_conn *ct,
-			  enum nf_nat_manip_type manip,
-			  const struct nlattr *attr)
-{
-	struct nf_nat_ipv4_range range;
-
-	if (nfnetlink_parse_nat(attr, ct, &range) < 0)
-		return -EINVAL;
-	if (nf_nat_initialized(ct, manip))
-		return -EEXIST;
-
-	return nf_nat_setup_info(ct, &range, manip);
-}
-#else
-static int
-nfnetlink_parse_nat_setup(struct nf_conn *ct,
-			  enum nf_nat_manip_type manip,
-			  const struct nlattr *attr)
-{
-	return -EOPNOTSUPP;
-}
-#endif
-
-static int __net_init nf_nat_net_init(struct net *net)
-{
-	/* Leave them the same for the moment. */
-	net->ipv4.nat_htable_size = net->ct.htable_size;
-	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
-	if (!net->ipv4.nat_bysource)
-		return -ENOMEM;
-	return 0;
-}
-
-/* Clear NAT section of all conntracks, in case we're loaded again. */
-static int clean_nat(struct nf_conn *i, void *data)
-{
-	struct nf_conn_nat *nat = nfct_nat(i);
-
-	if (!nat)
-		return 0;
-	memset(nat, 0, sizeof(*nat));
-	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
-	return 0;
-}
-
-static void __net_exit nf_nat_net_exit(struct net *net)
-{
-	nf_ct_iterate_cleanup(net, &clean_nat, NULL);
-	synchronize_rcu();
-	nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
-}
-
-static struct pernet_operations nf_nat_net_ops = {
-	.init = nf_nat_net_init,
-	.exit = nf_nat_net_exit,
-};
-
-static struct nf_ct_helper_expectfn follow_master_nat = {
-	.name		= "nat-follow-master",
-	.expectfn	= nf_nat_follow_master,
-};
-
-static struct nfq_ct_nat_hook nfq_ct_nat = {
-	.seq_adjust	= nf_nat_tcp_seq_adjust,
-};
-
-static int __init nf_nat_init(void)
-{
-	size_t i;
-	int ret;
-
-	need_ipv4_conntrack();
-
-	ret = nf_ct_extend_register(&nat_extend);
-	if (ret < 0) {
-		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
-		return ret;
-	}
-
-	ret = register_pernet_subsys(&nf_nat_net_ops);
-	if (ret < 0)
-		goto cleanup_extend;
-
-	/* Sew in builtin protocols. */
-	spin_lock_bh(&nf_nat_lock);
-	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
-		RCU_INIT_POINTER(nf_nat_protos[i], &nf_nat_unknown_protocol);
-	RCU_INIT_POINTER(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
-	RCU_INIT_POINTER(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
-	RCU_INIT_POINTER(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
-	spin_unlock_bh(&nf_nat_lock);
-
-	/* Initialize fake conntrack so that NAT will skip it */
-	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
-
-	l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
-
-	nf_ct_helper_expectfn_register(&follow_master_nat);
-
-	BUG_ON(nf_nat_seq_adjust_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
-	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
-	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
-			 nfnetlink_parse_nat_setup);
-	BUG_ON(nf_ct_nat_offset != NULL);
-	RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
-	RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
-	return 0;
-
- cleanup_extend:
-	nf_ct_extend_unregister(&nat_extend);
-	return ret;
-}
-
-static void __exit nf_nat_cleanup(void)
-{
-	unregister_pernet_subsys(&nf_nat_net_ops);
-	nf_ct_l3proto_put(l3proto);
-	nf_ct_extend_unregister(&nat_extend);
-	nf_ct_helper_expectfn_unregister(&follow_master_nat);
-	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
-	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
-	RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
-	RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
-	synchronize_net();
-}
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("nf-nat-ipv4");
-
-module_init(nf_nat_init);
-module_exit(nf_nat_cleanup);
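
[Annotation: the IP selection in the deleted find_best_ips_proto() above maps a 32-bit hash onto [minip, maxip] with a multiply-shift rather than a modulo, so the mapping is unbiased and the same client keeps hashing to the same NAT address across connections. A standalone user-space sketch of just that mapping; addresses are in host byte order and the hash input is left abstract.]

#include <stdint.h>

/* Scale a 32-bit hash from [0, 2^32) onto the inclusive range
 * [minip, maxip], mirroring j = ((u64)j * (maxip - minip + 1)) >> 32. */
static uint32_t pick_ip(uint32_t hash, uint32_t minip, uint32_t maxip)
{
	uint32_t j = (uint32_t)(((uint64_t)hash * (maxip - minip + 1)) >> 32);

	return minip + j;
}
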
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
deleted file mode 100644
index e462a957d08..00000000000
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/* FTP extension for TCP NAT alteration. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_ftp.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
-MODULE_DESCRIPTION("ftp NAT helper");
-MODULE_ALIAS("ip_nat_ftp");
-
-/* FIXME: Time out? --RR */
-
-static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
-			      char *buffer, size_t buflen,
-			      __be32 addr, u16 port)
-{
-	switch (type) {
-	case NF_CT_FTP_PORT:
-	case NF_CT_FTP_PASV:
-		return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
-				((unsigned char *)&addr)[0],
-				((unsigned char *)&addr)[1],
-				((unsigned char *)&addr)[2],
-				((unsigned char *)&addr)[3],
-				port >> 8,
-				port & 0xFF);
-	case NF_CT_FTP_EPRT:
-		return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
-	case NF_CT_FTP_EPSV:
-		return snprintf(buffer, buflen, "|||%u|", port);
-	}
-
-	return 0;
-}
-
-/* So, this packet has hit the connection tracking matching code.
-   Mangle it, and change the expectation to match the new version. */
-static unsigned int nf_nat_ftp(struct sk_buff *skb,
-			       enum ip_conntrack_info ctinfo,
-			       enum nf_ct_ftp_type type,
-			       unsigned int matchoff,
-			       unsigned int matchlen,
-			       struct nf_conntrack_expect *exp)
-{
-	__be32 newip;
-	u_int16_t port;
-	int dir = CTINFO2DIR(ctinfo);
-	struct nf_conn *ct = exp->master;
-	char buffer[sizeof("|1|255.255.255.255|65535|")];
-	unsigned int buflen;
-
-	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
-
-	/* Connection will come from wherever this packet goes, hence !dir */
-	newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
-	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
-	exp->dir = !dir;
-
-	/* When you see the packet, we need to NAT it the same as the
-	 * this one. */
-	exp->expectfn = nf_nat_follow_master;
-
-	/* Try to get same port: if not, try to change it. */
-	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
-		int ret;
-
-		exp->tuple.dst.u.tcp.port = htons(port);
-		ret = nf_ct_expect_related(exp);
-		if (ret == 0)
-			break;
-		else if (ret != -EBUSY) {
-			port = 0;
-			break;
-		}
-	}
-
-	if (port == 0)
-		return NF_DROP;
-
-	buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
-	if (!buflen)
-		goto out;
-
-	pr_debug("calling nf_nat_mangle_tcp_packet\n");
-
-	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
-				      matchlen, buffer, buflen))
-		goto out;
-
-	return NF_ACCEPT;
-
-out:
-	nf_ct_unexpect_related(exp);
-	return NF_DROP;
-}
-
-static void __exit nf_nat_ftp_fini(void)
-{
-	RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
-	synchronize_rcu();
-}
-
-static int __init nf_nat_ftp_init(void)
-{
-	BUG_ON(nf_nat_ftp_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
-	return 0;
-}
-
-/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
-{
-	printk(KERN_INFO KBUILD_MODNAME
-	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
-	return 0;
-}
-module_param_call(ports, warn_set, NULL, NULL, 0);
-
-module_init(nf_nat_ftp_init);
-module_exit(nf_nat_ftp_fini);
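
[Annotation: nf_nat_ftp_fmt_cmd() above rewrites the address/port argument of FTP control commands; the classic PORT/PASV form encodes the IPv4 address and the 16-bit port as six comma-separated decimal bytes. A user-space sketch of just that encoding, with addr in network byte order and port in host byte order; fmt_port_arg() is an illustrative name, not a kernel API.]

#include <stdio.h>
#include <stdint.h>

static int fmt_port_arg(char *buf, size_t len, uint32_t addr, uint16_t port)
{
	const unsigned char *a = (const unsigned char *)&addr;

	/* e.g. 192.168.0.1 port 20000 -> "192,168,0,1,78,32" */
	return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
			a[0], a[1], a[2], a[3], port >> 8, port & 0xFF);
}
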
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index c6784a18c1c..9c3db10b22d 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -15,13 +15,12 @@
 
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <linux/netfilter/nf_conntrack_h323.h>
 
 /****************************************************************************/
-static int set_addr(struct sk_buff *skb,
+static int set_addr(struct sk_buff *skb, unsigned int protoff,
 		    unsigned char **data, int dataoff,
 		    unsigned int addroff, __be32 ip, __be16 port)
 {
@@ -40,7 +39,7 @@ static int set_addr(struct sk_buff *skb,
 
 	if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
 		if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
-					      addroff, sizeof(buf),
+					      protoff, addroff, sizeof(buf),
 					      (char *) &buf, sizeof(buf))) {
 			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
 			return -1;
@@ -54,7 +53,7 @@ static int set_addr(struct sk_buff *skb,
 		*data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
 	} else {
 		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
-					      addroff, sizeof(buf),
+					      protoff, addroff, sizeof(buf),
 					      (char *) &buf, sizeof(buf))) {
 			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
 			return -1;
@@ -69,22 +68,22 @@ static int set_addr(struct sk_buff *skb,
69} 68}
70 69
71/****************************************************************************/ 70/****************************************************************************/
72static int set_h225_addr(struct sk_buff *skb, 71static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
73 unsigned char **data, int dataoff, 72 unsigned char **data, int dataoff,
74 TransportAddress *taddr, 73 TransportAddress *taddr,
75 union nf_inet_addr *addr, __be16 port) 74 union nf_inet_addr *addr, __be16 port)
76{ 75{
77 return set_addr(skb, data, dataoff, taddr->ipAddress.ip, 76 return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
78 addr->ip, port); 77 addr->ip, port);
79} 78}
80 79
81/****************************************************************************/ 80/****************************************************************************/
82static int set_h245_addr(struct sk_buff *skb, 81static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
83 unsigned char **data, int dataoff, 82 unsigned char **data, int dataoff,
84 H245_TransportAddress *taddr, 83 H245_TransportAddress *taddr,
85 union nf_inet_addr *addr, __be16 port) 84 union nf_inet_addr *addr, __be16 port)
86{ 85{
87 return set_addr(skb, data, dataoff, 86 return set_addr(skb, protoff, data, dataoff,
88 taddr->unicastAddress.iPAddress.network, 87 taddr->unicastAddress.iPAddress.network,
89 addr->ip, port); 88 addr->ip, port);
90} 89}
@@ -92,7 +91,7 @@ static int set_h245_addr(struct sk_buff *skb,
92/****************************************************************************/ 91/****************************************************************************/
93static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, 92static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
94 enum ip_conntrack_info ctinfo, 93 enum ip_conntrack_info ctinfo,
95 unsigned char **data, 94 unsigned int protoff, unsigned char **data,
96 TransportAddress *taddr, int count) 95 TransportAddress *taddr, int count)
97{ 96{
98 const struct nf_ct_h323_master *info = nfct_help_data(ct); 97 const struct nf_ct_h323_master *info = nfct_help_data(ct);
@@ -118,7 +117,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
118 &addr.ip, port, 117 &addr.ip, port,
119 &ct->tuplehash[!dir].tuple.dst.u3.ip, 118 &ct->tuplehash[!dir].tuple.dst.u3.ip,
120 info->sig_port[!dir]); 119 info->sig_port[!dir]);
121 return set_h225_addr(skb, data, 0, &taddr[i], 120 return set_h225_addr(skb, protoff, data, 0,
121 &taddr[i],
122 &ct->tuplehash[!dir]. 122 &ct->tuplehash[!dir].
123 tuple.dst.u3, 123 tuple.dst.u3,
124 info->sig_port[!dir]); 124 info->sig_port[!dir]);
@@ -129,7 +129,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
129 &addr.ip, port, 129 &addr.ip, port,
130 &ct->tuplehash[!dir].tuple.src.u3.ip, 130 &ct->tuplehash[!dir].tuple.src.u3.ip,
131 info->sig_port[!dir]); 131 info->sig_port[!dir]);
132 return set_h225_addr(skb, data, 0, &taddr[i], 132 return set_h225_addr(skb, protoff, data, 0,
133 &taddr[i],
133 &ct->tuplehash[!dir]. 134 &ct->tuplehash[!dir].
134 tuple.src.u3, 135 tuple.src.u3,
135 info->sig_port[!dir]); 136 info->sig_port[!dir]);
@@ -143,7 +144,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
143/****************************************************************************/ 144/****************************************************************************/
144static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, 145static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
145 enum ip_conntrack_info ctinfo, 146 enum ip_conntrack_info ctinfo,
146 unsigned char **data, 147 unsigned int protoff, unsigned char **data,
147 TransportAddress *taddr, int count) 148 TransportAddress *taddr, int count)
148{ 149{
149 int dir = CTINFO2DIR(ctinfo); 150 int dir = CTINFO2DIR(ctinfo);
@@ -159,7 +160,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
159 &addr.ip, ntohs(port), 160 &addr.ip, ntohs(port),
160 &ct->tuplehash[!dir].tuple.dst.u3.ip, 161 &ct->tuplehash[!dir].tuple.dst.u3.ip,
161 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); 162 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
162 return set_h225_addr(skb, data, 0, &taddr[i], 163 return set_h225_addr(skb, protoff, data, 0, &taddr[i],
163 &ct->tuplehash[!dir].tuple.dst.u3, 164 &ct->tuplehash[!dir].tuple.dst.u3,
164 ct->tuplehash[!dir].tuple. 165 ct->tuplehash[!dir].tuple.
165 dst.u.udp.port); 166 dst.u.udp.port);
@@ -172,7 +173,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
172/****************************************************************************/ 173/****************************************************************************/
173static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, 174static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
174 enum ip_conntrack_info ctinfo, 175 enum ip_conntrack_info ctinfo,
175 unsigned char **data, int dataoff, 176 unsigned int protoff, unsigned char **data, int dataoff,
176 H245_TransportAddress *taddr, 177 H245_TransportAddress *taddr,
177 __be16 port, __be16 rtp_port, 178 __be16 port, __be16 rtp_port,
178 struct nf_conntrack_expect *rtp_exp, 179 struct nf_conntrack_expect *rtp_exp,
@@ -244,7 +245,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
244 } 245 }
245 246
246 /* Modify signal */ 247 /* Modify signal */
247 if (set_h245_addr(skb, data, dataoff, taddr, 248 if (set_h245_addr(skb, protoff, data, dataoff, taddr,
248 &ct->tuplehash[!dir].tuple.dst.u3, 249 &ct->tuplehash[!dir].tuple.dst.u3,
249 htons((port & htons(1)) ? nated_port + 1 : 250 htons((port & htons(1)) ? nated_port + 1 :
250 nated_port)) == 0) { 251 nated_port)) == 0) {
@@ -275,7 +276,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
275/****************************************************************************/ 276/****************************************************************************/
276static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, 277static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
277 enum ip_conntrack_info ctinfo, 278 enum ip_conntrack_info ctinfo,
278 unsigned char **data, int dataoff, 279 unsigned int protoff, unsigned char **data, int dataoff,
279 H245_TransportAddress *taddr, __be16 port, 280 H245_TransportAddress *taddr, __be16 port,
280 struct nf_conntrack_expect *exp) 281 struct nf_conntrack_expect *exp)
281{ 282{
@@ -307,7 +308,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
307 } 308 }
308 309
309 /* Modify signal */ 310 /* Modify signal */
310 if (set_h245_addr(skb, data, dataoff, taddr, 311 if (set_h245_addr(skb, protoff, data, dataoff, taddr,
311 &ct->tuplehash[!dir].tuple.dst.u3, 312 &ct->tuplehash[!dir].tuple.dst.u3,
312 htons(nated_port)) < 0) { 313 htons(nated_port)) < 0) {
313 nf_ct_unexpect_related(exp); 314 nf_ct_unexpect_related(exp);
@@ -326,7 +327,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
326/****************************************************************************/ 327/****************************************************************************/
327static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, 328static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
328 enum ip_conntrack_info ctinfo, 329 enum ip_conntrack_info ctinfo,
329 unsigned char **data, int dataoff, 330 unsigned int protoff, unsigned char **data, int dataoff,
330 TransportAddress *taddr, __be16 port, 331 TransportAddress *taddr, __be16 port,
331 struct nf_conntrack_expect *exp) 332 struct nf_conntrack_expect *exp)
332{ 333{
@@ -363,7 +364,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
363 } 364 }
364 365
365 /* Modify signal */ 366 /* Modify signal */
366 if (set_h225_addr(skb, data, dataoff, taddr, 367 if (set_h225_addr(skb, protoff, data, dataoff, taddr,
367 &ct->tuplehash[!dir].tuple.dst.u3, 368 &ct->tuplehash[!dir].tuple.dst.u3,
368 htons(nated_port)) == 0) { 369 htons(nated_port)) == 0) {
369 /* Save ports */ 370 /* Save ports */
@@ -390,7 +391,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
390static void ip_nat_q931_expect(struct nf_conn *new, 391static void ip_nat_q931_expect(struct nf_conn *new,
391 struct nf_conntrack_expect *this) 392 struct nf_conntrack_expect *this)
392{ 393{
393 struct nf_nat_ipv4_range range; 394 struct nf_nat_range range;
394 395
395 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ 396 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
396 nf_nat_follow_master(new, this); 397 nf_nat_follow_master(new, this);
@@ -402,21 +403,23 @@ static void ip_nat_q931_expect(struct nf_conn *new,
402 403
403 /* Change src to where master sends to */ 404 /* Change src to where master sends to */
404 range.flags = NF_NAT_RANGE_MAP_IPS; 405 range.flags = NF_NAT_RANGE_MAP_IPS;
405 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; 406 range.min_addr = range.max_addr =
407 new->tuplehash[!this->dir].tuple.src.u3;
406 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); 408 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
407 409
408 /* For DST manip, map port here to where it's expected. */ 410 /* For DST manip, map port here to where it's expected. */
409 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 411 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
410 range.min = range.max = this->saved_proto; 412 range.min_proto = range.max_proto = this->saved_proto;
411 range.min_ip = range.max_ip = 413 range.min_addr = range.max_addr =
412 new->master->tuplehash[!this->dir].tuple.src.u3.ip; 414 new->master->tuplehash[!this->dir].tuple.src.u3;
413 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 415 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
414} 416}
415 417
416/****************************************************************************/ 418/****************************************************************************/
417static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, 419static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
418 enum ip_conntrack_info ctinfo, 420 enum ip_conntrack_info ctinfo,
419 unsigned char **data, TransportAddress *taddr, int idx, 421 unsigned int protoff, unsigned char **data,
422 TransportAddress *taddr, int idx,
420 __be16 port, struct nf_conntrack_expect *exp) 423 __be16 port, struct nf_conntrack_expect *exp)
421{ 424{
422 struct nf_ct_h323_master *info = nfct_help_data(ct); 425 struct nf_ct_h323_master *info = nfct_help_data(ct);
@@ -453,7 +456,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
453 } 456 }
454 457
455 /* Modify signal */ 458 /* Modify signal */
456 if (set_h225_addr(skb, data, 0, &taddr[idx], 459 if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
457 &ct->tuplehash[!dir].tuple.dst.u3, 460 &ct->tuplehash[!dir].tuple.dst.u3,
458 htons(nated_port)) == 0) { 461 htons(nated_port)) == 0) {
459 /* Save ports */ 462 /* Save ports */
@@ -464,7 +467,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
464 if (idx > 0 && 467 if (idx > 0 &&
465 get_h225_addr(ct, *data, &taddr[0], &addr, &port) && 468 get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
466 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { 469 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
467 set_h225_addr(skb, data, 0, &taddr[0], 470 set_h225_addr(skb, protoff, data, 0, &taddr[0],
468 &ct->tuplehash[!dir].tuple.dst.u3, 471 &ct->tuplehash[!dir].tuple.dst.u3,
469 info->sig_port[!dir]); 472 info->sig_port[!dir]);
470 } 473 }
@@ -487,26 +490,28 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
487static void ip_nat_callforwarding_expect(struct nf_conn *new, 490static void ip_nat_callforwarding_expect(struct nf_conn *new,
488 struct nf_conntrack_expect *this) 491 struct nf_conntrack_expect *this)
489{ 492{
490 struct nf_nat_ipv4_range range; 493 struct nf_nat_range range;
491 494
492 /* This must be a fresh one. */ 495 /* This must be a fresh one. */
493 BUG_ON(new->status & IPS_NAT_DONE_MASK); 496 BUG_ON(new->status & IPS_NAT_DONE_MASK);
494 497
495 /* Change src to where master sends to */ 498 /* Change src to where master sends to */
496 range.flags = NF_NAT_RANGE_MAP_IPS; 499 range.flags = NF_NAT_RANGE_MAP_IPS;
497 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; 500 range.min_addr = range.max_addr =
501 new->tuplehash[!this->dir].tuple.src.u3;
498 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); 502 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
499 503
500 /* For DST manip, map port here to where it's expected. */ 504 /* For DST manip, map port here to where it's expected. */
501 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 505 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
502 range.min = range.max = this->saved_proto; 506 range.min_proto = range.max_proto = this->saved_proto;
503 range.min_ip = range.max_ip = this->saved_ip; 507 range.min_addr = range.max_addr = this->saved_addr;
504 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 508 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
505} 509}
506 510
507/****************************************************************************/ 511/****************************************************************************/
508static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, 512static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
509 enum ip_conntrack_info ctinfo, 513 enum ip_conntrack_info ctinfo,
514 unsigned int protoff,
510 unsigned char **data, int dataoff, 515 unsigned char **data, int dataoff,
511 TransportAddress *taddr, __be16 port, 516 TransportAddress *taddr, __be16 port,
512 struct nf_conntrack_expect *exp) 517 struct nf_conntrack_expect *exp)
@@ -515,7 +520,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
515 u_int16_t nated_port; 520 u_int16_t nated_port;
516 521
517 /* Set expectations for NAT */ 522 /* Set expectations for NAT */
518 exp->saved_ip = exp->tuple.dst.u3.ip; 523 exp->saved_addr = exp->tuple.dst.u3;
519 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; 524 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
520 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; 525 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
521 exp->expectfn = ip_nat_callforwarding_expect; 526 exp->expectfn = ip_nat_callforwarding_expect;
@@ -541,7 +546,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
541 } 546 }
542 547
543 /* Modify signal */ 548 /* Modify signal */
544 if (!set_h225_addr(skb, data, dataoff, taddr, 549 if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
545 &ct->tuplehash[!dir].tuple.dst.u3, 550 &ct->tuplehash[!dir].tuple.dst.u3,
546 htons(nated_port)) == 0) { 551 htons(nated_port)) == 0) {
547 nf_ct_unexpect_related(exp); 552 nf_ct_unexpect_related(exp);
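
The hunks above repeat one mechanical conversion: every expect function moves from the IPv4-only struct nf_nat_ipv4_range (min_ip/max_ip as __be32, min/max for the protocol part) to the family-independent struct nf_nat_range (min_addr/max_addr as a union wide enough for IPv6, min_proto/max_proto). A toy user-space model of the rename, with simplified stand-ins for the kernel types; field names match the diff, the types do not:

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for the kernel types touched in this diff */
union inet_addr { uint32_t ip; uint8_t ip6[16]; };

struct old_ipv4_range {			/* models struct nf_nat_ipv4_range */
	unsigned int flags;
	uint32_t min_ip, max_ip;	/* IPv4 only */
	uint16_t min, max;		/* port/id/key */
};

struct new_range {			/* models struct nf_nat_range */
	unsigned int flags;
	union inet_addr min_addr, max_addr;	/* v4 or v6 */
	uint16_t min_proto, max_proto;
};

int main(void)
{
	struct old_ipv4_range o = { .min_ip = 0x0a000001, .max_ip = 0x0a000001,
				    .min = 1720, .max = 1720 };
	/* The conversion is a pure field rename; no information changes. */
	struct new_range n = { .flags = o.flags,
			       .min_addr.ip = o.min_ip, .max_addr.ip = o.max_ip,
			       .min_proto = o.min, .max_proto = o.max };

	printf("%08x:%u -> %08x:%u\n", o.min_ip, o.min, n.min_addr.ip, n.min_proto);
	return 0;
}
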
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
deleted file mode 100644
index 2e59ad0b90c..00000000000
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ /dev/null
@@ -1,458 +0,0 @@
1/* ip_nat_helper.c - generic support functions for NAT helpers
2 *
3 * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
4 * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10#include <linux/module.h>
11#include <linux/gfp.h>
12#include <linux/kmod.h>
13#include <linux/types.h>
14#include <linux/timer.h>
15#include <linux/skbuff.h>
16#include <linux/tcp.h>
17#include <linux/udp.h>
18#include <net/checksum.h>
19#include <net/tcp.h>
20#include <net/route.h>
21
22#include <linux/netfilter_ipv4.h>
23#include <net/netfilter/nf_conntrack.h>
24#include <net/netfilter/nf_conntrack_helper.h>
25#include <net/netfilter/nf_conntrack_ecache.h>
26#include <net/netfilter/nf_conntrack_expect.h>
27#include <net/netfilter/nf_nat.h>
28#include <net/netfilter/nf_nat_protocol.h>
29#include <net/netfilter/nf_nat_core.h>
30#include <net/netfilter/nf_nat_helper.h>
31
32#define DUMP_OFFSET(x) \
33 pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
34 x->offset_before, x->offset_after, x->correction_pos);
35
36static DEFINE_SPINLOCK(nf_nat_seqofs_lock);
37
38/* Setup TCP sequence correction given this change at this sequence */
39static inline void
40adjust_tcp_sequence(u32 seq,
41 int sizediff,
42 struct nf_conn *ct,
43 enum ip_conntrack_info ctinfo)
44{
45 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
46 struct nf_conn_nat *nat = nfct_nat(ct);
47 struct nf_nat_seq *this_way = &nat->seq[dir];
48
49 pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
50 seq, sizediff);
51
52 pr_debug("adjust_tcp_sequence: Seq_offset before: ");
53 DUMP_OFFSET(this_way);
54
55 spin_lock_bh(&nf_nat_seqofs_lock);
56
57 /* SYN adjust. If it's uninitialized, or this is after last
58 * correction, record it: we don't handle more than one
59 * adjustment in the window, but do deal with common case of a
60 * retransmit */
61 if (this_way->offset_before == this_way->offset_after ||
62 before(this_way->correction_pos, seq)) {
63 this_way->correction_pos = seq;
64 this_way->offset_before = this_way->offset_after;
65 this_way->offset_after += sizediff;
66 }
67 spin_unlock_bh(&nf_nat_seqofs_lock);
68
69 pr_debug("adjust_tcp_sequence: Seq_offset after: ");
70 DUMP_OFFSET(this_way);
71}
72
73/* Get the offset value, for conntrack */
74s16 nf_nat_get_offset(const struct nf_conn *ct,
75 enum ip_conntrack_dir dir,
76 u32 seq)
77{
78 struct nf_conn_nat *nat = nfct_nat(ct);
79 struct nf_nat_seq *this_way;
80 s16 offset;
81
82 if (!nat)
83 return 0;
84
85 this_way = &nat->seq[dir];
86 spin_lock_bh(&nf_nat_seqofs_lock);
87 offset = after(seq, this_way->correction_pos)
88 ? this_way->offset_after : this_way->offset_before;
89 spin_unlock_bh(&nf_nat_seqofs_lock);
90
91 return offset;
92}
93EXPORT_SYMBOL_GPL(nf_nat_get_offset);
94
95/* Frobs data inside this packet, which is linear. */
96static void mangle_contents(struct sk_buff *skb,
97 unsigned int dataoff,
98 unsigned int match_offset,
99 unsigned int match_len,
100 const char *rep_buffer,
101 unsigned int rep_len)
102{
103 unsigned char *data;
104
105 BUG_ON(skb_is_nonlinear(skb));
106 data = skb_network_header(skb) + dataoff;
107
108 /* move post-replacement */
109 memmove(data + match_offset + rep_len,
110 data + match_offset + match_len,
111 skb->tail - (skb->network_header + dataoff +
112 match_offset + match_len));
113
114 /* insert data from buffer */
115 memcpy(data + match_offset, rep_buffer, rep_len);
116
117 /* update skb info */
118 if (rep_len > match_len) {
119 pr_debug("nf_nat_mangle_packet: Extending packet by "
120 "%u from %u bytes\n", rep_len - match_len, skb->len);
121 skb_put(skb, rep_len - match_len);
122 } else {
123		pr_debug("nf_nat_mangle_packet: Shrinking packet by "
124 "%u from %u bytes\n", match_len - rep_len, skb->len);
125 __skb_trim(skb, skb->len + rep_len - match_len);
126 }
127
128 /* fix IP hdr checksum information */
129 ip_hdr(skb)->tot_len = htons(skb->len);
130 ip_send_check(ip_hdr(skb));
131}
132
133/* Unusual, but possible case. */
134static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
135{
136 if (skb->len + extra > 65535)
137 return 0;
138
139 if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC))
140 return 0;
141
142 return 1;
143}
144
145void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
146 __be32 seq, s16 off)
147{
148 if (!off)
149 return;
150 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
151 adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
152 nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155
156void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
157 u32 ctinfo, int off)
158{
159 const struct tcphdr *th;
160
161 if (nf_ct_protonum(ct) != IPPROTO_TCP)
162 return;
163
164 th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb));
165 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
166}
167EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
168
169static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
170 int datalen, __sum16 *check, int oldlen)
171{
172 struct rtable *rt = skb_rtable(skb);
173
174 if (skb->ip_summed != CHECKSUM_PARTIAL) {
175 if (!(rt->rt_flags & RTCF_LOCAL) &&
176 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
177 skb->ip_summed = CHECKSUM_PARTIAL;
178 skb->csum_start = skb_headroom(skb) +
179 skb_network_offset(skb) +
180 iph->ihl * 4;
181 skb->csum_offset = (void *)check - data;
182 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
183 datalen, iph->protocol, 0);
184 } else {
185 *check = 0;
186 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
187 datalen, iph->protocol,
188 csum_partial(data, datalen,
189 0));
190 if (iph->protocol == IPPROTO_UDP && !*check)
191 *check = CSUM_MANGLED_0;
192 }
193 } else
194 inet_proto_csum_replace2(check, skb,
195 htons(oldlen), htons(datalen), 1);
196}
197
198/* Generic function for mangling variable-length address changes inside
199 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
200 * command in FTP).
201 *
202 * Takes care about all the nasty sequence number changes, checksumming,
203 * skb enlargement, ...
204 *
205 * */
206int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
207 struct nf_conn *ct,
208 enum ip_conntrack_info ctinfo,
209 unsigned int match_offset,
210 unsigned int match_len,
211 const char *rep_buffer,
212 unsigned int rep_len, bool adjust)
213{
214 struct iphdr *iph;
215 struct tcphdr *tcph;
216 int oldlen, datalen;
217
218 if (!skb_make_writable(skb, skb->len))
219 return 0;
220
221 if (rep_len > match_len &&
222 rep_len - match_len > skb_tailroom(skb) &&
223 !enlarge_skb(skb, rep_len - match_len))
224 return 0;
225
226 SKB_LINEAR_ASSERT(skb);
227
228 iph = ip_hdr(skb);
229 tcph = (void *)iph + iph->ihl*4;
230
231 oldlen = skb->len - iph->ihl*4;
232 mangle_contents(skb, iph->ihl*4 + tcph->doff*4,
233 match_offset, match_len, rep_buffer, rep_len);
234
235 datalen = skb->len - iph->ihl*4;
236 nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
237
238 if (adjust && rep_len != match_len)
239 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
240 (int)rep_len - (int)match_len);
241
242 return 1;
243}
244EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
245
246/* Generic function for mangling variable-length address changes inside
247 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
248 * command in the Amanda protocol)
249 *
250 * Takes care about all the nasty sequence number changes, checksumming,
251 * skb enlargement, ...
252 *
253 * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
254 * should be fairly easy to do.
255 */
256int
257nf_nat_mangle_udp_packet(struct sk_buff *skb,
258 struct nf_conn *ct,
259 enum ip_conntrack_info ctinfo,
260 unsigned int match_offset,
261 unsigned int match_len,
262 const char *rep_buffer,
263 unsigned int rep_len)
264{
265 struct iphdr *iph;
266 struct udphdr *udph;
267 int datalen, oldlen;
268
269 if (!skb_make_writable(skb, skb->len))
270 return 0;
271
272 if (rep_len > match_len &&
273 rep_len - match_len > skb_tailroom(skb) &&
274 !enlarge_skb(skb, rep_len - match_len))
275 return 0;
276
277 iph = ip_hdr(skb);
278 udph = (void *)iph + iph->ihl*4;
279
280 oldlen = skb->len - iph->ihl*4;
281 mangle_contents(skb, iph->ihl*4 + sizeof(*udph),
282 match_offset, match_len, rep_buffer, rep_len);
283
284 /* update the length of the UDP packet */
285 datalen = skb->len - iph->ihl*4;
286 udph->len = htons(datalen);
287
288 /* fix udp checksum if udp checksum was previously calculated */
289 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
290 return 1;
291
292 nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
293
294 return 1;
295}
296EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
297
298/* Adjust one found SACK option including checksum correction */
299static void
300sack_adjust(struct sk_buff *skb,
301 struct tcphdr *tcph,
302 unsigned int sackoff,
303 unsigned int sackend,
304 struct nf_nat_seq *natseq)
305{
306 while (sackoff < sackend) {
307 struct tcp_sack_block_wire *sack;
308 __be32 new_start_seq, new_end_seq;
309
310 sack = (void *)skb->data + sackoff;
311 if (after(ntohl(sack->start_seq) - natseq->offset_before,
312 natseq->correction_pos))
313 new_start_seq = htonl(ntohl(sack->start_seq)
314 - natseq->offset_after);
315 else
316 new_start_seq = htonl(ntohl(sack->start_seq)
317 - natseq->offset_before);
318
319 if (after(ntohl(sack->end_seq) - natseq->offset_before,
320 natseq->correction_pos))
321 new_end_seq = htonl(ntohl(sack->end_seq)
322 - natseq->offset_after);
323 else
324 new_end_seq = htonl(ntohl(sack->end_seq)
325 - natseq->offset_before);
326
327 pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
328 ntohl(sack->start_seq), new_start_seq,
329 ntohl(sack->end_seq), new_end_seq);
330
331 inet_proto_csum_replace4(&tcph->check, skb,
332 sack->start_seq, new_start_seq, 0);
333 inet_proto_csum_replace4(&tcph->check, skb,
334 sack->end_seq, new_end_seq, 0);
335 sack->start_seq = new_start_seq;
336 sack->end_seq = new_end_seq;
337 sackoff += sizeof(*sack);
338 }
339}
340
341/* TCP SACK sequence number adjustment */
342static inline unsigned int
343nf_nat_sack_adjust(struct sk_buff *skb,
344 struct tcphdr *tcph,
345 struct nf_conn *ct,
346 enum ip_conntrack_info ctinfo)
347{
348 unsigned int dir, optoff, optend;
349 struct nf_conn_nat *nat = nfct_nat(ct);
350
351 optoff = ip_hdrlen(skb) + sizeof(struct tcphdr);
352 optend = ip_hdrlen(skb) + tcph->doff * 4;
353
354 if (!skb_make_writable(skb, optend))
355 return 0;
356
357 dir = CTINFO2DIR(ctinfo);
358
359 while (optoff < optend) {
360 /* Usually: option, length. */
361 unsigned char *op = skb->data + optoff;
362
363 switch (op[0]) {
364 case TCPOPT_EOL:
365 return 1;
366 case TCPOPT_NOP:
367 optoff++;
368 continue;
369 default:
370 /* no partial options */
371 if (optoff + 1 == optend ||
372 optoff + op[1] > optend ||
373 op[1] < 2)
374 return 0;
375 if (op[0] == TCPOPT_SACK &&
376 op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
377 ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
378 sack_adjust(skb, tcph, optoff+2,
379 optoff+op[1], &nat->seq[!dir]);
380 optoff += op[1];
381 }
382 }
383 return 1;
384}
385
386/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
387int
388nf_nat_seq_adjust(struct sk_buff *skb,
389 struct nf_conn *ct,
390 enum ip_conntrack_info ctinfo)
391{
392 struct tcphdr *tcph;
393 int dir;
394 __be32 newseq, newack;
395 s16 seqoff, ackoff;
396 struct nf_conn_nat *nat = nfct_nat(ct);
397 struct nf_nat_seq *this_way, *other_way;
398
399 dir = CTINFO2DIR(ctinfo);
400
401 this_way = &nat->seq[dir];
402 other_way = &nat->seq[!dir];
403
404 if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
405 return 0;
406
407 tcph = (void *)skb->data + ip_hdrlen(skb);
408 if (after(ntohl(tcph->seq), this_way->correction_pos))
409 seqoff = this_way->offset_after;
410 else
411 seqoff = this_way->offset_before;
412
413 if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
414 other_way->correction_pos))
415 ackoff = other_way->offset_after;
416 else
417 ackoff = other_way->offset_before;
418
419 newseq = htonl(ntohl(tcph->seq) + seqoff);
420 newack = htonl(ntohl(tcph->ack_seq) - ackoff);
421
422 inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
423 inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
424
425 pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
426 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
427 ntohl(newack));
428
429 tcph->seq = newseq;
430 tcph->ack_seq = newack;
431
432 return nf_nat_sack_adjust(skb, tcph, ct, ctinfo);
433}
434
435/* Setup NAT on this expected conntrack so it follows master. */
436/* If we fail to get a free NAT slot, we'll get dropped on confirm */
437void nf_nat_follow_master(struct nf_conn *ct,
438 struct nf_conntrack_expect *exp)
439{
440 struct nf_nat_ipv4_range range;
441
442 /* This must be a fresh one. */
443 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
444
445 /* Change src to where master sends to */
446 range.flags = NF_NAT_RANGE_MAP_IPS;
447 range.min_ip = range.max_ip
448 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
449 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
450
451 /* For DST manip, map port here to where it's expected. */
452 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
453 range.min = range.max = exp->saved_proto;
454 range.min_ip = range.max_ip
455 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
456 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
457}
458EXPORT_SYMBOL(nf_nat_follow_master);
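
The core of the deleted helper is the sequence-number bookkeeping: each direction keeps a correction position plus the offset before and after it, and nf_nat_seq_adjust() picks one of the two offsets depending on whether a segment lies beyond the last mangling point. A compact user-space model of that decision, assuming the same three fields; after() here is the same 32-bit serial-number comparison the kernel uses:

#include <stdio.h>
#include <stdint.h>

struct nat_seq_model {
	uint32_t correction_pos;	/* seq of the last mangled segment */
	int16_t offset_before;		/* applies to seqs at or before it */
	int16_t offset_after;		/* applies to seqs after it */
};

static int after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;	/* serial-number arithmetic */
}

/* Record a size change of 'sizediff' bytes at sequence 'seq'
 * (mirrors adjust_tcp_sequence: only the newest change is kept,
 * so a retransmit of an older segment reuses the old offset). */
static void record(struct nat_seq_model *s, uint32_t seq, int sizediff)
{
	if (s->offset_before == s->offset_after ||
	    after(seq, s->correction_pos)) {
		s->correction_pos = seq;
		s->offset_before = s->offset_after;
		s->offset_after += sizediff;
	}
}

int main(void)
{
	struct nat_seq_model s = { 0 };

	record(&s, 1000, +4);	/* payload grew by 4 bytes at seq 1000 */
	/* A retransmit of seq 900 predates the change: old offset applies. */
	printf("seq 900 -> +%d, seq 2000 -> +%d\n",
	       after(900, s.correction_pos) ? s.offset_after : s.offset_before,
	       after(2000, s.correction_pos) ? s.offset_after : s.offset_before);
	return 0;
}
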
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
deleted file mode 100644
index 979ae165f4e..00000000000
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ /dev/null
@@ -1,99 +0,0 @@
1/* IRC extension for TCP NAT alteration.
2 *
3 * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
4 * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
5 * based on a copy of RR's ip_nat_ftp.c
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/moduleparam.h>
15#include <linux/tcp.h>
16#include <linux/kernel.h>
17
18#include <net/netfilter/nf_nat.h>
19#include <net/netfilter/nf_nat_helper.h>
20#include <net/netfilter/nf_nat_rule.h>
21#include <net/netfilter/nf_conntrack_helper.h>
22#include <net/netfilter/nf_conntrack_expect.h>
23#include <linux/netfilter/nf_conntrack_irc.h>
24
25MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
26MODULE_DESCRIPTION("IRC (DCC) NAT helper");
27MODULE_LICENSE("GPL");
28MODULE_ALIAS("ip_nat_irc");
29
30static unsigned int help(struct sk_buff *skb,
31 enum ip_conntrack_info ctinfo,
32 unsigned int matchoff,
33 unsigned int matchlen,
34 struct nf_conntrack_expect *exp)
35{
36	char buffer[sizeof("4294967295 65535")];
37 u_int32_t ip;
38 u_int16_t port;
39 unsigned int ret;
40
41 /* Reply comes from server. */
42 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
43 exp->dir = IP_CT_DIR_REPLY;
44 exp->expectfn = nf_nat_follow_master;
45
46 /* Try to get same port: if not, try to change it. */
47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
48 int ret;
49
50 exp->tuple.dst.u.tcp.port = htons(port);
51 ret = nf_ct_expect_related(exp);
52 if (ret == 0)
53 break;
54 else if (ret != -EBUSY) {
55 port = 0;
56 break;
57 }
58 }
59
60 if (port == 0)
61 return NF_DROP;
62
63 ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip);
64 sprintf(buffer, "%u %u", ip, port);
65 pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
66 buffer, &ip, port);
67
68 ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
69 matchoff, matchlen, buffer,
70 strlen(buffer));
71 if (ret != NF_ACCEPT)
72 nf_ct_unexpect_related(exp);
73 return ret;
74}
75
76static void __exit nf_nat_irc_fini(void)
77{
78 RCU_INIT_POINTER(nf_nat_irc_hook, NULL);
79 synchronize_rcu();
80}
81
82static int __init nf_nat_irc_init(void)
83{
84 BUG_ON(nf_nat_irc_hook != NULL);
85 RCU_INIT_POINTER(nf_nat_irc_hook, help);
86 return 0;
87}
88
89/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
90static int warn_set(const char *val, struct kernel_param *kp)
91{
92 printk(KERN_INFO KBUILD_MODNAME
93 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
94 return 0;
95}
96module_param_call(ports, warn_set, NULL, NULL, 0);
97
98module_init(nf_nat_irc_init);
99module_exit(nf_nat_irc_fini);
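
The IRC helper rewrites DCC offers, where the address travels as a single decimal number (the IPv4 address in host byte order) followed by the port, hence the sprintf(buffer, "%u %u", ip, port) above. A quick user-space check of that encoding, with an illustrative address and the IRC default port:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 192.168.1.2 as the single host-order integer a DCC offer carries */
	uint32_t ip = (192u << 24) | (168u << 16) | (1u << 8) | 2u;
	char buffer[sizeof("4294967295 65535")];

	snprintf(buffer, sizeof(buffer), "%u %u", (unsigned)ip, 6667);
	puts(buffer);	/* prints "3232235778 6667" */
	return 0;
}
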
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
new file mode 100644
index 00000000000..d8b2e14efdd
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -0,0 +1,281 @@
1/*
2 * (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
4 * (C) 2011 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/types.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/ip.h>
15#include <linux/icmp.h>
16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h>
18#include <net/secure_seq.h>
19#include <net/checksum.h>
20#include <net/route.h>
21#include <net/ip.h>
22
23#include <net/netfilter/nf_conntrack_core.h>
24#include <net/netfilter/nf_conntrack.h>
25#include <net/netfilter/nf_nat_core.h>
26#include <net/netfilter/nf_nat_l3proto.h>
27#include <net/netfilter/nf_nat_l4proto.h>
28
29static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
30
31#ifdef CONFIG_XFRM
32static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
33 const struct nf_conn *ct,
34 enum ip_conntrack_dir dir,
35 unsigned long statusbit,
36 struct flowi *fl)
37{
38 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
39 struct flowi4 *fl4 = &fl->u.ip4;
40
41 if (ct->status & statusbit) {
42 fl4->daddr = t->dst.u3.ip;
43 if (t->dst.protonum == IPPROTO_TCP ||
44 t->dst.protonum == IPPROTO_UDP ||
45 t->dst.protonum == IPPROTO_UDPLITE ||
46 t->dst.protonum == IPPROTO_DCCP ||
47 t->dst.protonum == IPPROTO_SCTP)
48 fl4->fl4_dport = t->dst.u.all;
49 }
50
51 statusbit ^= IPS_NAT_MASK;
52
53 if (ct->status & statusbit) {
54 fl4->saddr = t->src.u3.ip;
55 if (t->dst.protonum == IPPROTO_TCP ||
56 t->dst.protonum == IPPROTO_UDP ||
57 t->dst.protonum == IPPROTO_UDPLITE ||
58 t->dst.protonum == IPPROTO_DCCP ||
59 t->dst.protonum == IPPROTO_SCTP)
60 fl4->fl4_sport = t->src.u.all;
61 }
62}
63#endif /* CONFIG_XFRM */
64
65static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
66 const struct nf_nat_range *range)
67{
68 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
69 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
70}
71
72static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
73 __be16 dport)
74{
75 return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
76}
77
78static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
79 unsigned int iphdroff,
80 const struct nf_nat_l4proto *l4proto,
81 const struct nf_conntrack_tuple *target,
82 enum nf_nat_manip_type maniptype)
83{
84 struct iphdr *iph;
85 unsigned int hdroff;
86
87 if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
88 return false;
89
90 iph = (void *)skb->data + iphdroff;
91 hdroff = iphdroff + iph->ihl * 4;
92
93 if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
94 target, maniptype))
95 return false;
96 iph = (void *)skb->data + iphdroff;
97
98 if (maniptype == NF_NAT_MANIP_SRC) {
99 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
100 iph->saddr = target->src.u3.ip;
101 } else {
102 csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
103 iph->daddr = target->dst.u3.ip;
104 }
105 return true;
106}
107
108static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
109 unsigned int iphdroff, __sum16 *check,
110 const struct nf_conntrack_tuple *t,
111 enum nf_nat_manip_type maniptype)
112{
113 struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
114 __be32 oldip, newip;
115
116 if (maniptype == NF_NAT_MANIP_SRC) {
117 oldip = iph->saddr;
118 newip = t->src.u3.ip;
119 } else {
120 oldip = iph->daddr;
121 newip = t->dst.u3.ip;
122 }
123 inet_proto_csum_replace4(check, skb, oldip, newip, 1);
124}
125
126static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
127 u8 proto, void *data, __sum16 *check,
128 int datalen, int oldlen)
129{
130 const struct iphdr *iph = ip_hdr(skb);
131 struct rtable *rt = skb_rtable(skb);
132
133 if (skb->ip_summed != CHECKSUM_PARTIAL) {
134 if (!(rt->rt_flags & RTCF_LOCAL) &&
135 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
136 skb->ip_summed = CHECKSUM_PARTIAL;
137 skb->csum_start = skb_headroom(skb) +
138 skb_network_offset(skb) +
139 ip_hdrlen(skb);
140 skb->csum_offset = (void *)check - data;
141 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
142 datalen, proto, 0);
143 } else {
144 *check = 0;
145 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
146 datalen, proto,
147 csum_partial(data, datalen,
148 0));
149 if (proto == IPPROTO_UDP && !*check)
150 *check = CSUM_MANGLED_0;
151 }
152 } else
153 inet_proto_csum_replace2(check, skb,
154 htons(oldlen), htons(datalen), 1);
155}
156
157static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
158 struct nf_nat_range *range)
159{
160 if (tb[CTA_NAT_V4_MINIP]) {
161 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
162 range->flags |= NF_NAT_RANGE_MAP_IPS;
163 }
164
165 if (tb[CTA_NAT_V4_MAXIP])
166 range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
167 else
168 range->max_addr.ip = range->min_addr.ip;
169
170 return 0;
171}
172
173static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
174 .l3proto = NFPROTO_IPV4,
175 .in_range = nf_nat_ipv4_in_range,
176 .secure_port = nf_nat_ipv4_secure_port,
177 .manip_pkt = nf_nat_ipv4_manip_pkt,
178 .csum_update = nf_nat_ipv4_csum_update,
179 .csum_recalc = nf_nat_ipv4_csum_recalc,
180 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
181#ifdef CONFIG_XFRM
182 .decode_session = nf_nat_ipv4_decode_session,
183#endif
184};
185
186int nf_nat_icmp_reply_translation(struct sk_buff *skb,
187 struct nf_conn *ct,
188 enum ip_conntrack_info ctinfo,
189 unsigned int hooknum)
190{
191 struct {
192 struct icmphdr icmp;
193 struct iphdr ip;
194 } *inside;
195 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
196 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
197 unsigned int hdrlen = ip_hdrlen(skb);
198 const struct nf_nat_l4proto *l4proto;
199 struct nf_conntrack_tuple target;
200 unsigned long statusbit;
201
202 NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
203
204 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
205 return 0;
206 if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
207 return 0;
208
209 inside = (void *)skb->data + hdrlen;
210 if (inside->icmp.type == ICMP_REDIRECT) {
211 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
212 return 0;
213 if (ct->status & IPS_NAT_MASK)
214 return 0;
215 }
216
217 if (manip == NF_NAT_MANIP_SRC)
218 statusbit = IPS_SRC_NAT;
219 else
220 statusbit = IPS_DST_NAT;
221
222 /* Invert if this is reply direction */
223 if (dir == IP_CT_DIR_REPLY)
224 statusbit ^= IPS_NAT_MASK;
225
226 if (!(ct->status & statusbit))
227 return 1;
228
229 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
230 if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
231 l4proto, &ct->tuplehash[!dir].tuple, !manip))
232 return 0;
233
234 if (skb->ip_summed != CHECKSUM_PARTIAL) {
235 /* Reloading "inside" here since manip_pkt may reallocate */
236 inside = (void *)skb->data + hdrlen;
237 inside->icmp.checksum = 0;
238 inside->icmp.checksum =
239 csum_fold(skb_checksum(skb, hdrlen,
240 skb->len - hdrlen, 0));
241 }
242
243 /* Change outer to look like the reply to an incoming packet */
244 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
245 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
246 if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
247 return 0;
248
249 return 1;
250}
251EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
252
253static int __init nf_nat_l3proto_ipv4_init(void)
254{
255 int err;
256
257 err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
258 if (err < 0)
259 goto err1;
260 err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
261 if (err < 0)
262 goto err2;
263 return err;
264
265err2:
266 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
267err1:
268 return err;
269}
270
271static void __exit nf_nat_l3proto_ipv4_exit(void)
272{
273 nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
274 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
275}
276
277MODULE_LICENSE("GPL");
278MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
279
280module_init(nf_nat_l3proto_ipv4_init);
281module_exit(nf_nat_l3proto_ipv4_exit);
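
Both the XFRM decode hook and nf_nat_icmp_reply_translation() above lean on the same identity: IPS_SRC_NAT and IPS_DST_NAT are the two bits making up IPS_NAT_MASK, so XOR-ing a status bit with the mask turns "was the source rewritten?" into "was the destination rewritten?" when a packet is seen in the reply direction. A minimal demonstration; the constants are modeled on the conntrack flags, not taken from the header:

#include <stdio.h>

/* Bit layout modeled after the kernel's conntrack status flags */
#define SRC_NAT		0x10
#define DST_NAT		0x20
#define NAT_MASK	(SRC_NAT | DST_NAT)

int main(void)
{
	unsigned long statusbit = SRC_NAT;

	/* Original direction: asking about source NAT. */
	printf("orig:  %#lx\n", statusbit);

	/* Reply direction: XOR with the mask flips the question to
	 * destination NAT, because exactly the two NAT bits toggle. */
	statusbit ^= NAT_MASK;
	printf("reply: %#lx\n", statusbit);	/* now DST_NAT */
	return 0;
}
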
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 388140881eb..a06d7d74817 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -22,7 +22,6 @@
 
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_zones.h>
@@ -47,7 +46,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 	struct nf_conntrack_tuple t;
 	const struct nf_ct_pptp_master *ct_pptp_info;
 	const struct nf_nat_pptp *nat_pptp_info;
-	struct nf_nat_ipv4_range range;
+	struct nf_nat_range range;
 
 	ct_pptp_info = nfct_help_data(master);
 	nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
@@ -89,21 +88,21 @@ static void pptp_nat_expected(struct nf_conn *ct,
 
 	/* Change src to where master sends to */
 	range.flags = NF_NAT_RANGE_MAP_IPS;
-	range.min_ip = range.max_ip
-		= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+	range.min_addr = range.max_addr
+		= ct->master->tuplehash[!exp->dir].tuple.dst.u3;
 	if (exp->dir == IP_CT_DIR_ORIGINAL) {
 		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-		range.min = range.max = exp->saved_proto;
+		range.min_proto = range.max_proto = exp->saved_proto;
 	}
 	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 
 	/* For DST manip, map port here to where it's expected. */
 	range.flags = NF_NAT_RANGE_MAP_IPS;
-	range.min_ip = range.max_ip
-		= ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
+	range.min_addr = range.max_addr
+		= ct->master->tuplehash[!exp->dir].tuple.src.u3;
 	if (exp->dir == IP_CT_DIR_REPLY) {
 		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-		range.min = range.max = exp->saved_proto;
+		range.min_proto = range.max_proto = exp->saved_proto;
 	}
 	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
 }
@@ -113,6 +112,7 @@ static int
 pptp_outbound_pkt(struct sk_buff *skb,
 		  struct nf_conn *ct,
 		  enum ip_conntrack_info ctinfo,
+		  unsigned int protoff,
 		  struct PptpControlHeader *ctlh,
 		  union pptp_ctrl_union *pptpReq)
 
@@ -175,7 +175,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
 		 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
 
 	/* mangle packet */
-	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
 				     cid_off + sizeof(struct pptp_pkt_hdr) +
 				     sizeof(struct PptpControlHeader),
 				     sizeof(new_callid), (char *)&new_callid,
@@ -216,6 +216,7 @@ static int
 pptp_inbound_pkt(struct sk_buff *skb,
 		 struct nf_conn *ct,
 		 enum ip_conntrack_info ctinfo,
+		 unsigned int protoff,
 		 struct PptpControlHeader *ctlh,
 		 union pptp_ctrl_union *pptpReq)
 {
@@ -268,7 +269,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
 	pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
 		 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
 
-	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
 				     pcid_off + sizeof(struct pptp_pkt_hdr) +
 				     sizeof(struct PptpControlHeader),
 				     sizeof(new_pcid), (char *)&new_pcid,
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
deleted file mode 100644
index 9993bc93e10..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ /dev/null
@@ -1,114 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2008 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/random.h>
12#include <linux/ip.h>
13
14#include <linux/netfilter.h>
15#include <linux/export.h>
16#include <net/secure_seq.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_core.h>
19#include <net/netfilter/nf_nat_rule.h>
20#include <net/netfilter/nf_nat_protocol.h>
21
22bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
23 enum nf_nat_manip_type maniptype,
24 const union nf_conntrack_man_proto *min,
25 const union nf_conntrack_man_proto *max)
26{
27 __be16 port;
28
29 if (maniptype == NF_NAT_MANIP_SRC)
30 port = tuple->src.u.all;
31 else
32 port = tuple->dst.u.all;
33
34 return ntohs(port) >= ntohs(min->all) &&
35 ntohs(port) <= ntohs(max->all);
36}
37EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
38
39void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
40 const struct nf_nat_ipv4_range *range,
41 enum nf_nat_manip_type maniptype,
42 const struct nf_conn *ct,
43 u_int16_t *rover)
44{
45 unsigned int range_size, min, i;
46 __be16 *portptr;
47 u_int16_t off;
48
49 if (maniptype == NF_NAT_MANIP_SRC)
50 portptr = &tuple->src.u.all;
51 else
52 portptr = &tuple->dst.u.all;
53
54 /* If no range specified... */
55 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
56 /* If it's dst rewrite, can't change port */
57 if (maniptype == NF_NAT_MANIP_DST)
58 return;
59
60 if (ntohs(*portptr) < 1024) {
61 /* Loose convention: >> 512 is credential passing */
62 if (ntohs(*portptr) < 512) {
63 min = 1;
64 range_size = 511 - min + 1;
65 } else {
66 min = 600;
67 range_size = 1023 - min + 1;
68 }
69 } else {
70 min = 1024;
71 range_size = 65535 - 1024 + 1;
72 }
73 } else {
74 min = ntohs(range->min.all);
75 range_size = ntohs(range->max.all) - min + 1;
76 }
77
78 if (range->flags & NF_NAT_RANGE_PROTO_RANDOM)
79 off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip,
80 maniptype == NF_NAT_MANIP_SRC
81 ? tuple->dst.u.all
82 : tuple->src.u.all);
83 else
84 off = *rover;
85
86 for (i = 0; ; ++off) {
87 *portptr = htons(min + off % range_size);
88 if (++i != range_size && nf_nat_used_tuple(tuple, ct))
89 continue;
90 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM))
91 *rover = off;
92 return;
93 }
94 return;
95}
96EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
97
98#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
99int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
100 struct nf_nat_ipv4_range *range)
101{
102 if (tb[CTA_PROTONAT_PORT_MIN]) {
103 range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
104 range->max.all = range->min.tcp.port;
105 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
106 }
107 if (tb[CTA_PROTONAT_PORT_MAX]) {
108 range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
109 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
110 }
111 return 0;
112}
113EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
114#endif
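
When no explicit range is configured, the deleted nf_nat_proto_unique_tuple() preserves the "class" of the original source port: ports below 512 map into 1-511, 512-1023 into 600-1023, and everything else into 1024-65535, then walks that window until nf_nat_used_tuple() stops objecting. The bucket choice alone, as a runnable sketch of the logic above:

#include <stdio.h>

/* Mirror of the default (no NF_NAT_RANGE_PROTO_SPECIFIED) bucketing */
static void pick_window(unsigned port, unsigned *min, unsigned *range_size)
{
	if (port < 1024) {
		if (port < 512) {	/* "credential passing" ports */
			*min = 1;
			*range_size = 511 - 1 + 1;
		} else {
			*min = 600;
			*range_size = 1023 - 600 + 1;
		}
	} else {
		*min = 1024;
		*range_size = 65535 - 1024 + 1;
	}
}

int main(void)
{
	unsigned ports[] = { 80, 700, 33000 }, min, size;

	for (int i = 0; i < 3; i++) {
		pick_window(ports[i], &min, &size);
		printf("port %5u -> search %u..%u\n",
		       ports[i], min, min + size - 1);
	}
	return 0;
}
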
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
deleted file mode 100644
index 3f67138d187..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ /dev/null
@@ -1,106 +0,0 @@
1/*
2 * DCCP NAT protocol helper
3 *
4 * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 */
11
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/skbuff.h>
16#include <linux/ip.h>
17#include <linux/dccp.h>
18
19#include <net/netfilter/nf_conntrack.h>
20#include <net/netfilter/nf_nat.h>
21#include <net/netfilter/nf_nat_protocol.h>
22
23static u_int16_t dccp_port_rover;
24
25static void
26dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
27 const struct nf_nat_ipv4_range *range,
28 enum nf_nat_manip_type maniptype,
29 const struct nf_conn *ct)
30{
31 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
32 &dccp_port_rover);
33}
34
35static bool
36dccp_manip_pkt(struct sk_buff *skb,
37 unsigned int iphdroff,
38 const struct nf_conntrack_tuple *tuple,
39 enum nf_nat_manip_type maniptype)
40{
41 const struct iphdr *iph = (const void *)(skb->data + iphdroff);
42 struct dccp_hdr *hdr;
43 unsigned int hdroff = iphdroff + iph->ihl * 4;
44 __be32 oldip, newip;
45 __be16 *portptr, oldport, newport;
46 int hdrsize = 8; /* DCCP connection tracking guarantees this much */
47
48 if (skb->len >= hdroff + sizeof(struct dccp_hdr))
49 hdrsize = sizeof(struct dccp_hdr);
50
51 if (!skb_make_writable(skb, hdroff + hdrsize))
52 return false;
53
54 iph = (struct iphdr *)(skb->data + iphdroff);
55 hdr = (struct dccp_hdr *)(skb->data + hdroff);
56
57 if (maniptype == NF_NAT_MANIP_SRC) {
58 oldip = iph->saddr;
59 newip = tuple->src.u3.ip;
60 newport = tuple->src.u.dccp.port;
61 portptr = &hdr->dccph_sport;
62 } else {
63 oldip = iph->daddr;
64 newip = tuple->dst.u3.ip;
65 newport = tuple->dst.u.dccp.port;
66 portptr = &hdr->dccph_dport;
67 }
68
69 oldport = *portptr;
70 *portptr = newport;
71
72 if (hdrsize < sizeof(*hdr))
73 return true;
74
75 inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1);
76 inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
77 0);
78 return true;
79}
80
81static const struct nf_nat_protocol nf_nat_protocol_dccp = {
82 .protonum = IPPROTO_DCCP,
83 .manip_pkt = dccp_manip_pkt,
84 .in_range = nf_nat_proto_in_range,
85 .unique_tuple = dccp_unique_tuple,
86#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
87 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
88#endif
89};
90
91static int __init nf_nat_proto_dccp_init(void)
92{
93 return nf_nat_protocol_register(&nf_nat_protocol_dccp);
94}
95
96static void __exit nf_nat_proto_dccp_fini(void)
97{
98 nf_nat_protocol_unregister(&nf_nat_protocol_dccp);
99}
100
101module_init(nf_nat_proto_dccp_init);
102module_exit(nf_nat_proto_dccp_fini);
103
104MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
105MODULE_DESCRIPTION("DCCP NAT protocol helper");
106MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 46ba0b9ab98..ea44f02563b 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -28,8 +28,7 @@
 #include <linux/ip.h>
 
 #include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_l4proto.h>
 #include <linux/netfilter/nf_conntrack_proto_gre.h>
 
 MODULE_LICENSE("GPL");
@@ -38,8 +37,9 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
 
 /* generate unique tuple ... */
 static void
-gre_unique_tuple(struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_ipv4_range *range,
+gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
+		 struct nf_conntrack_tuple *tuple,
+		 const struct nf_nat_range *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
@@ -62,8 +62,8 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 		min = 1;
 		range_size = 0xffff;
 	} else {
-		min = ntohs(range->min.gre.key);
-		range_size = ntohs(range->max.gre.key) - min + 1;
+		min = ntohs(range->min_proto.gre.key);
+		range_size = ntohs(range->max_proto.gre.key) - min + 1;
 	}
 
 	pr_debug("min = %u, range_size = %u\n", min, range_size);
@@ -80,14 +80,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 
 /* manipulate a GRE packet according to maniptype */
 static bool
-gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
+gre_manip_pkt(struct sk_buff *skb,
+	      const struct nf_nat_l3proto *l3proto,
+	      unsigned int iphdroff, unsigned int hdroff,
 	      const struct nf_conntrack_tuple *tuple,
 	      enum nf_nat_manip_type maniptype)
 {
 	const struct gre_hdr *greh;
 	struct gre_hdr_pptp *pgreh;
-	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
-	unsigned int hdroff = iphdroff + iph->ihl * 4;
 
 	/* pgreh includes two optional 32bit fields which are not required
 	 * to be there. That's where the magic '8' comes from */
@@ -117,24 +117,24 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
 	return true;
 }
 
-static const struct nf_nat_protocol gre = {
-	.protonum		= IPPROTO_GRE,
+static const struct nf_nat_l4proto gre = {
+	.l4proto		= IPPROTO_GRE,
 	.manip_pkt		= gre_manip_pkt,
-	.in_range		= nf_nat_proto_in_range,
+	.in_range		= nf_nat_l4proto_in_range,
 	.unique_tuple		= gre_unique_tuple,
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
 
 static int __init nf_nat_proto_gre_init(void)
 {
-	return nf_nat_protocol_register(&gre);
+	return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
 }
 
 static void __exit nf_nat_proto_gre_fini(void)
 {
-	nf_nat_protocol_unregister(&gre);
+	nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
 }
 
 module_init(nf_nat_proto_gre_init);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index b35172851ba..eb303471bcf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -15,8 +15,7 @@
 #include <linux/netfilter.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_l4proto.h>
 
 static bool
 icmp_in_range(const struct nf_conntrack_tuple *tuple,
@@ -29,8 +28,9 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
 }
 
 static void
-icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_ipv4_range *range,
+icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+		  struct nf_conntrack_tuple *tuple,
+		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
@@ -38,13 +38,14 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 	unsigned int range_size;
 	unsigned int i;
 
-	range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
+	range_size = ntohs(range->max_proto.icmp.id) -
+		     ntohs(range->min_proto.icmp.id) + 1;
 	/* If no range specified... */
 	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
 		range_size = 0xFFFF;
 
 	for (i = 0; ; ++id) {
-		tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
+		tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
 					     (id % range_size));
 		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
 			return;
@@ -54,13 +55,12 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 
 static bool
 icmp_manip_pkt(struct sk_buff *skb,
-	       unsigned int iphdroff,
+	       const struct nf_nat_l3proto *l3proto,
+	       unsigned int iphdroff, unsigned int hdroff,
 	       const struct nf_conntrack_tuple *tuple,
 	       enum nf_nat_manip_type maniptype)
 {
-	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
 	struct icmphdr *hdr;
-	unsigned int hdroff = iphdroff + iph->ihl*4;
 
 	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
 		return false;
@@ -72,12 +72,12 @@ icmp_manip_pkt(struct sk_buff *skb,
 	return true;
 }
 
-const struct nf_nat_protocol nf_nat_protocol_icmp = {
-	.protonum		= IPPROTO_ICMP,
+const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
+	.l4proto		= IPPROTO_ICMP,
 	.manip_pkt		= icmp_manip_pkt,
 	.in_range		= icmp_in_range,
 	.unique_tuple		= icmp_unique_tuple,
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
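
Both conversions above follow the same pattern: the IPv4-only struct nf_nat_protocol becomes the family-independent struct nf_nat_l4proto, header offsets are computed by the new l3proto layer and passed in, and registration takes the address family explicitly. A minimal sketch of what a protocol module looks like after the rework, mirroring the gre/icmp hunks; "example", example_manip_pkt and example_unique_tuple are hypothetical placeholders, not part of this diff:

    /* Sketch of an L4 NAT protocol under the reworked API; callback
     * signatures follow the hunks above, names are illustrative. */
    static const struct nf_nat_l4proto example = {
    	.l4proto	= IPPROTO_UDP,		/* placeholder protocol number */
    	.manip_pkt	= example_manip_pkt,	/* hypothetical callbacks */
    	.in_range	= nf_nat_l4proto_in_range,
    	.unique_tuple	= example_unique_tuple,
    };

    static int __init example_init(void)
    {
    	/* registration is now keyed by address family */
    	return nf_nat_l4proto_register(NFPROTO_IPV4, &example);
    }

    static void __exit example_fini(void)
    {
    	nf_nat_l4proto_unregister(NFPROTO_IPV4, &example);
    }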
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
deleted file mode 100644
index 3cce9b6c1c2..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ /dev/null
@@ -1,96 +0,0 @@
1/*
2 * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/ip.h>
12#include <linux/sctp.h>
13#include <linux/module.h>
14#include <net/sctp/checksum.h>
15
16#include <net/netfilter/nf_nat_protocol.h>
17
18static u_int16_t nf_sctp_port_rover;
19
20static void
21sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
22 const struct nf_nat_ipv4_range *range,
23 enum nf_nat_manip_type maniptype,
24 const struct nf_conn *ct)
25{
26 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
27 &nf_sctp_port_rover);
28}
29
30static bool
31sctp_manip_pkt(struct sk_buff *skb,
32 unsigned int iphdroff,
33 const struct nf_conntrack_tuple *tuple,
34 enum nf_nat_manip_type maniptype)
35{
36 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
37 struct sk_buff *frag;
38 sctp_sctphdr_t *hdr;
39 unsigned int hdroff = iphdroff + iph->ihl*4;
40 __be32 oldip, newip;
41 __be32 crc32;
42
43 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
44 return false;
45
46 iph = (struct iphdr *)(skb->data + iphdroff);
47 hdr = (struct sctphdr *)(skb->data + hdroff);
48
49 if (maniptype == NF_NAT_MANIP_SRC) {
50 /* Get rid of src ip and src pt */
51 oldip = iph->saddr;
52 newip = tuple->src.u3.ip;
53 hdr->source = tuple->src.u.sctp.port;
54 } else {
55 /* Get rid of dst ip and dst pt */
56 oldip = iph->daddr;
57 newip = tuple->dst.u3.ip;
58 hdr->dest = tuple->dst.u.sctp.port;
59 }
60
61 crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
62 skb_walk_frags(skb, frag)
63 crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
64 crc32);
65 crc32 = sctp_end_cksum(crc32);
66 hdr->checksum = crc32;
67
68 return true;
69}
70
71static const struct nf_nat_protocol nf_nat_protocol_sctp = {
72 .protonum = IPPROTO_SCTP,
73 .manip_pkt = sctp_manip_pkt,
74 .in_range = nf_nat_proto_in_range,
75 .unique_tuple = sctp_unique_tuple,
76#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
77 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
78#endif
79};
80
81static int __init nf_nat_proto_sctp_init(void)
82{
83 return nf_nat_protocol_register(&nf_nat_protocol_sctp);
84}
85
86static void __exit nf_nat_proto_sctp_exit(void)
87{
88 nf_nat_protocol_unregister(&nf_nat_protocol_sctp);
89}
90
91module_init(nf_nat_proto_sctp_init);
92module_exit(nf_nat_proto_sctp_exit);
93
94MODULE_LICENSE("GPL");
95MODULE_DESCRIPTION("SCTP NAT protocol helper");
96MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
deleted file mode 100644
index 9fb4b4e72bb..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ /dev/null
@@ -1,91 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/export.h>
12#include <linux/ip.h>
13#include <linux/tcp.h>
14
15#include <linux/netfilter.h>
16#include <linux/netfilter/nfnetlink_conntrack.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20#include <net/netfilter/nf_nat_core.h>
21
22static u_int16_t tcp_port_rover;
23
24static void
25tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
26 const struct nf_nat_ipv4_range *range,
27 enum nf_nat_manip_type maniptype,
28 const struct nf_conn *ct)
29{
30 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
31}
32
33static bool
34tcp_manip_pkt(struct sk_buff *skb,
35 unsigned int iphdroff,
36 const struct nf_conntrack_tuple *tuple,
37 enum nf_nat_manip_type maniptype)
38{
39 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
40 struct tcphdr *hdr;
41 unsigned int hdroff = iphdroff + iph->ihl*4;
42 __be32 oldip, newip;
43 __be16 *portptr, newport, oldport;
44 int hdrsize = 8; /* TCP connection tracking guarantees this much */
45
46 /* this could be a inner header returned in icmp packet; in such
47 cases we cannot update the checksum field since it is outside of
48 the 8 bytes of transport layer headers we are guaranteed */
49 if (skb->len >= hdroff + sizeof(struct tcphdr))
50 hdrsize = sizeof(struct tcphdr);
51
52 if (!skb_make_writable(skb, hdroff + hdrsize))
53 return false;
54
55 iph = (struct iphdr *)(skb->data + iphdroff);
56 hdr = (struct tcphdr *)(skb->data + hdroff);
57
58 if (maniptype == NF_NAT_MANIP_SRC) {
59 /* Get rid of src ip and src pt */
60 oldip = iph->saddr;
61 newip = tuple->src.u3.ip;
62 newport = tuple->src.u.tcp.port;
63 portptr = &hdr->source;
64 } else {
65 /* Get rid of dst ip and dst pt */
66 oldip = iph->daddr;
67 newip = tuple->dst.u3.ip;
68 newport = tuple->dst.u.tcp.port;
69 portptr = &hdr->dest;
70 }
71
72 oldport = *portptr;
73 *portptr = newport;
74
75 if (hdrsize < sizeof(*hdr))
76 return true;
77
78 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
79 inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
80 return true;
81}
82
83const struct nf_nat_protocol nf_nat_protocol_tcp = {
84 .protonum = IPPROTO_TCP,
85 .manip_pkt = tcp_manip_pkt,
86 .in_range = nf_nat_proto_in_range,
87 .unique_tuple = tcp_unique_tuple,
88#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
89 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
90#endif
91};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
deleted file mode 100644
index 9883336e628..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ /dev/null
@@ -1,82 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/export.h>
11#include <linux/init.h>
12#include <linux/ip.h>
13#include <linux/udp.h>
14
15#include <linux/netfilter.h>
16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_core.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20
21static u_int16_t udp_port_rover;
22
23static void
24udp_unique_tuple(struct nf_conntrack_tuple *tuple,
25 const struct nf_nat_ipv4_range *range,
26 enum nf_nat_manip_type maniptype,
27 const struct nf_conn *ct)
28{
29 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
30}
31
32static bool
33udp_manip_pkt(struct sk_buff *skb,
34 unsigned int iphdroff,
35 const struct nf_conntrack_tuple *tuple,
36 enum nf_nat_manip_type maniptype)
37{
38 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
39 struct udphdr *hdr;
40 unsigned int hdroff = iphdroff + iph->ihl*4;
41 __be32 oldip, newip;
42 __be16 *portptr, newport;
43
44 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
45 return false;
46
47 iph = (struct iphdr *)(skb->data + iphdroff);
48 hdr = (struct udphdr *)(skb->data + hdroff);
49
50 if (maniptype == NF_NAT_MANIP_SRC) {
51 /* Get rid of src ip and src pt */
52 oldip = iph->saddr;
53 newip = tuple->src.u3.ip;
54 newport = tuple->src.u.udp.port;
55 portptr = &hdr->source;
56 } else {
57 /* Get rid of dst ip and dst pt */
58 oldip = iph->daddr;
59 newip = tuple->dst.u3.ip;
60 newport = tuple->dst.u.udp.port;
61 portptr = &hdr->dest;
62 }
63 if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
64 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
65 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
66 0);
67 if (!hdr->check)
68 hdr->check = CSUM_MANGLED_0;
69 }
70 *portptr = newport;
71 return true;
72}
73
74const struct nf_nat_protocol nf_nat_protocol_udp = {
75 .protonum = IPPROTO_UDP,
76 .manip_pkt = udp_manip_pkt,
77 .in_range = nf_nat_proto_in_range,
78 .unique_tuple = udp_unique_tuple,
79#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
80 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
81#endif
82};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index d24d10a7beb..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,98 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2008 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/init.h>
12#include <linux/ip.h>
13#include <linux/udp.h>
14
15#include <linux/netfilter.h>
16#include <linux/module.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_protocol.h>
19
20static u_int16_t udplite_port_rover;
21
22static void
23udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
24 const struct nf_nat_ipv4_range *range,
25 enum nf_nat_manip_type maniptype,
26 const struct nf_conn *ct)
27{
28 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
29 &udplite_port_rover);
30}
31
32static bool
33udplite_manip_pkt(struct sk_buff *skb,
34 unsigned int iphdroff,
35 const struct nf_conntrack_tuple *tuple,
36 enum nf_nat_manip_type maniptype)
37{
38 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
39 struct udphdr *hdr;
40 unsigned int hdroff = iphdroff + iph->ihl*4;
41 __be32 oldip, newip;
42 __be16 *portptr, newport;
43
44 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
45 return false;
46
47 iph = (struct iphdr *)(skb->data + iphdroff);
48 hdr = (struct udphdr *)(skb->data + hdroff);
49
50 if (maniptype == NF_NAT_MANIP_SRC) {
51 /* Get rid of src ip and src pt */
52 oldip = iph->saddr;
53 newip = tuple->src.u3.ip;
54 newport = tuple->src.u.udp.port;
55 portptr = &hdr->source;
56 } else {
57 /* Get rid of dst ip and dst pt */
58 oldip = iph->daddr;
59 newip = tuple->dst.u3.ip;
60 newport = tuple->dst.u.udp.port;
61 portptr = &hdr->dest;
62 }
63
64 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
65 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
66 if (!hdr->check)
67 hdr->check = CSUM_MANGLED_0;
68
69 *portptr = newport;
70 return true;
71}
72
73static const struct nf_nat_protocol nf_nat_protocol_udplite = {
74 .protonum = IPPROTO_UDPLITE,
75 .manip_pkt = udplite_manip_pkt,
76 .in_range = nf_nat_proto_in_range,
77 .unique_tuple = udplite_unique_tuple,
78#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
79 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
80#endif
81};
82
83static int __init nf_nat_proto_udplite_init(void)
84{
85 return nf_nat_protocol_register(&nf_nat_protocol_udplite);
86}
87
88static void __exit nf_nat_proto_udplite_fini(void)
89{
90 nf_nat_protocol_unregister(&nf_nat_protocol_udplite);
91}
92
93module_init(nf_nat_proto_udplite_init);
94module_exit(nf_nat_proto_udplite_fini);
95
96MODULE_LICENSE("GPL");
97MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
98MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
deleted file mode 100644
index e0afe8112b1..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/* The "unknown" protocol. This is what is used for protocols we
2 * don't understand. It's returned by ip_ct_find_proto().
3 */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/types.h>
14#include <linux/init.h>
15
16#include <linux/netfilter.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20
21static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
22 enum nf_nat_manip_type manip_type,
23 const union nf_conntrack_man_proto *min,
24 const union nf_conntrack_man_proto *max)
25{
26 return true;
27}
28
29static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
30 const struct nf_nat_ipv4_range *range,
31 enum nf_nat_manip_type maniptype,
32 const struct nf_conn *ct)
33{
34 /* Sorry: we can't help you; if it's not unique, we can't frob
35 anything. */
36 return;
37}
38
39static bool
40unknown_manip_pkt(struct sk_buff *skb,
41 unsigned int iphdroff,
42 const struct nf_conntrack_tuple *tuple,
43 enum nf_nat_manip_type maniptype)
44{
45 return true;
46}
47
48const struct nf_nat_protocol nf_nat_unknown_protocol = {
49 .manip_pkt = unknown_manip_pkt,
50 .in_range = unknown_in_range,
51 .unique_tuple = unknown_unique_tuple,
52};
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
deleted file mode 100644
index d2a9dc314e0..00000000000
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ /dev/null
@@ -1,214 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* Everything about the rules for NAT. */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/types.h>
12#include <linux/ip.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter_ipv4.h>
15#include <linux/module.h>
16#include <linux/kmod.h>
17#include <linux/skbuff.h>
18#include <linux/proc_fs.h>
19#include <linux/slab.h>
20#include <net/checksum.h>
21#include <net/route.h>
22#include <linux/bitops.h>
23
24#include <linux/netfilter_ipv4/ip_tables.h>
25#include <net/netfilter/nf_nat.h>
26#include <net/netfilter/nf_nat_core.h>
27#include <net/netfilter/nf_nat_rule.h>
28
29#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
30 (1 << NF_INET_POST_ROUTING) | \
31 (1 << NF_INET_LOCAL_OUT) | \
32 (1 << NF_INET_LOCAL_IN))
33
34static const struct xt_table nat_table = {
35 .name = "nat",
36 .valid_hooks = NAT_VALID_HOOKS,
37 .me = THIS_MODULE,
38 .af = NFPROTO_IPV4,
39};
40
41/* Source NAT */
42static unsigned int
43ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
44{
45 struct nf_conn *ct;
46 enum ip_conntrack_info ctinfo;
47 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
48
49 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
50 par->hooknum == NF_INET_LOCAL_IN);
51
52 ct = nf_ct_get(skb, &ctinfo);
53
54 /* Connection must be valid and new. */
55 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
56 ctinfo == IP_CT_RELATED_REPLY));
57 NF_CT_ASSERT(par->out != NULL);
58
59 return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_SRC);
60}
61
62static unsigned int
63ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
64{
65 struct nf_conn *ct;
66 enum ip_conntrack_info ctinfo;
67 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
68
69 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
70 par->hooknum == NF_INET_LOCAL_OUT);
71
72 ct = nf_ct_get(skb, &ctinfo);
73
74 /* Connection must be valid and new. */
75 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
76
77 return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_DST);
78}
79
80static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
81{
82 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
83
84 /* Must be a valid range */
85 if (mr->rangesize != 1) {
86 pr_info("SNAT: multiple ranges no longer supported\n");
87 return -EINVAL;
88 }
89 return 0;
90}
91
92static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
93{
94 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
95
96 /* Must be a valid range */
97 if (mr->rangesize != 1) {
98 pr_info("DNAT: multiple ranges no longer supported\n");
99 return -EINVAL;
100 }
101 return 0;
102}
103
104static unsigned int
105alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
106{
107 /* Force range to this IP; let proto decide mapping for
108 per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
109 */
110 struct nf_nat_ipv4_range range;
111
112 range.flags = 0;
113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
114 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
117
118 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
119}
120
121int nf_nat_rule_find(struct sk_buff *skb,
122 unsigned int hooknum,
123 const struct net_device *in,
124 const struct net_device *out,
125 struct nf_conn *ct)
126{
127 struct net *net = nf_ct_net(ct);
128 int ret;
129
130 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
131
132 if (ret == NF_ACCEPT) {
133 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
134 /* NUL mapping */
135 ret = alloc_null_binding(ct, hooknum);
136 }
137 return ret;
138}
139
140static struct xt_target ipt_snat_reg __read_mostly = {
141 .name = "SNAT",
142 .target = ipt_snat_target,
143 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
144 .table = "nat",
145 .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
146 .checkentry = ipt_snat_checkentry,
147 .family = AF_INET,
148};
149
150static struct xt_target ipt_dnat_reg __read_mostly = {
151 .name = "DNAT",
152 .target = ipt_dnat_target,
153 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
154 .table = "nat",
155 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
156 .checkentry = ipt_dnat_checkentry,
157 .family = AF_INET,
158};
159
160static int __net_init nf_nat_rule_net_init(struct net *net)
161{
162 struct ipt_replace *repl;
163
164 repl = ipt_alloc_initial_table(&nat_table);
165 if (repl == NULL)
166 return -ENOMEM;
167 net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
168 kfree(repl);
169 if (IS_ERR(net->ipv4.nat_table))
170 return PTR_ERR(net->ipv4.nat_table);
171 return 0;
172}
173
174static void __net_exit nf_nat_rule_net_exit(struct net *net)
175{
176 ipt_unregister_table(net, net->ipv4.nat_table);
177}
178
179static struct pernet_operations nf_nat_rule_net_ops = {
180 .init = nf_nat_rule_net_init,
181 .exit = nf_nat_rule_net_exit,
182};
183
184int __init nf_nat_rule_init(void)
185{
186 int ret;
187
188 ret = register_pernet_subsys(&nf_nat_rule_net_ops);
189 if (ret != 0)
190 goto out;
191 ret = xt_register_target(&ipt_snat_reg);
192 if (ret != 0)
193 goto unregister_table;
194
195 ret = xt_register_target(&ipt_dnat_reg);
196 if (ret != 0)
197 goto unregister_snat;
198
199 return ret;
200
201 unregister_snat:
202 xt_unregister_target(&ipt_snat_reg);
203 unregister_table:
204 unregister_pernet_subsys(&nf_nat_rule_net_ops);
205 out:
206 return ret;
207}
208
209void nf_nat_rule_cleanup(void)
210{
211 xt_unregister_target(&ipt_dnat_reg);
212 xt_unregister_target(&ipt_snat_reg);
213 unregister_pernet_subsys(&nf_nat_rule_net_ops);
214}
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
deleted file mode 100644
index ea4a23813d2..00000000000
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ /dev/null
@@ -1,568 +0,0 @@
1/* SIP extension for NAT alteration.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_nat_ftp.c and other modules.
5 * (C) 2007 United Security Providers
6 * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/ip.h>
16#include <net/ip.h>
17#include <linux/udp.h>
18#include <linux/tcp.h>
19
20#include <net/netfilter/nf_nat.h>
21#include <net/netfilter/nf_nat_helper.h>
22#include <net/netfilter/nf_nat_rule.h>
23#include <net/netfilter/nf_conntrack_helper.h>
24#include <net/netfilter/nf_conntrack_expect.h>
25#include <linux/netfilter/nf_conntrack_sip.h>
26
27MODULE_LICENSE("GPL");
28MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
29MODULE_DESCRIPTION("SIP NAT helper");
30MODULE_ALIAS("ip_nat_sip");
31
32
33static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
34 const char **dptr, unsigned int *datalen,
35 unsigned int matchoff, unsigned int matchlen,
36 const char *buffer, unsigned int buflen)
37{
38 enum ip_conntrack_info ctinfo;
39 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
40 struct tcphdr *th;
41 unsigned int baseoff;
42
43 if (nf_ct_protonum(ct) == IPPROTO_TCP) {
44 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
45 baseoff = ip_hdrlen(skb) + th->doff * 4;
46 matchoff += dataoff - baseoff;
47
48 if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
49 matchoff, matchlen,
50 buffer, buflen, false))
51 return 0;
52 } else {
53 baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
54 matchoff += dataoff - baseoff;
55
56 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
57 matchoff, matchlen,
58 buffer, buflen))
59 return 0;
60 }
61
62 /* Reload data pointer and adjust datalen value */
63 *dptr = skb->data + dataoff;
64 *datalen += buflen - matchlen;
65 return 1;
66}
67
68static int map_addr(struct sk_buff *skb, unsigned int dataoff,
69 const char **dptr, unsigned int *datalen,
70 unsigned int matchoff, unsigned int matchlen,
71 union nf_inet_addr *addr, __be16 port)
72{
73 enum ip_conntrack_info ctinfo;
74 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
75 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
76 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
77 unsigned int buflen;
78 __be32 newaddr;
79 __be16 newport;
80
81 if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip &&
82 ct->tuplehash[dir].tuple.src.u.udp.port == port) {
83 newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
84 newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
85 } else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip &&
86 ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
87 newaddr = ct->tuplehash[!dir].tuple.src.u3.ip;
88 newport = ct->tuplehash[!dir].tuple.src.u.udp.port;
89 } else
90 return 1;
91
92 if (newaddr == addr->ip && newport == port)
93 return 1;
94
95 buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
96
97 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
98 buffer, buflen);
99}
100
101static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
102 const char **dptr, unsigned int *datalen,
103 enum sip_header_types type)
104{
105 enum ip_conntrack_info ctinfo;
106 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
107 unsigned int matchlen, matchoff;
108 union nf_inet_addr addr;
109 __be16 port;
110
111 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
112 &matchoff, &matchlen, &addr, &port) <= 0)
113 return 1;
114 return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
115 &addr, port);
116}
117
118static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
119 const char **dptr, unsigned int *datalen)
120{
121 enum ip_conntrack_info ctinfo;
122 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
123 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
124 unsigned int coff, matchoff, matchlen;
125 enum sip_header_types hdr;
126 union nf_inet_addr addr;
127 __be16 port;
128 int request, in_header;
129
130 /* Basic rules: requests and responses. */
131 if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
132 if (ct_sip_parse_request(ct, *dptr, *datalen,
133 &matchoff, &matchlen,
134 &addr, &port) > 0 &&
135 !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
136 &addr, port))
137 return NF_DROP;
138 request = 1;
139 } else
140 request = 0;
141
142 if (nf_ct_protonum(ct) == IPPROTO_TCP)
143 hdr = SIP_HDR_VIA_TCP;
144 else
145 hdr = SIP_HDR_VIA_UDP;
146
147 /* Translate topmost Via header and parameters */
148 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
149 hdr, NULL, &matchoff, &matchlen,
150 &addr, &port) > 0) {
151 unsigned int matchend, poff, plen, buflen, n;
152 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
153
154 /* We're only interested in headers related to this
155 * connection */
156 if (request) {
157 if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip ||
158 port != ct->tuplehash[dir].tuple.src.u.udp.port)
159 goto next;
160 } else {
161 if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip ||
162 port != ct->tuplehash[dir].tuple.dst.u.udp.port)
163 goto next;
164 }
165
166 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
167 &addr, port))
168 return NF_DROP;
169
170 matchend = matchoff + matchlen;
171
172 /* The maddr= parameter (RFC 2361) specifies where to send
173 * the reply. */
174 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
175 "maddr=", &poff, &plen,
176 &addr) > 0 &&
177 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
178 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
179 buflen = sprintf(buffer, "%pI4",
180 &ct->tuplehash[!dir].tuple.dst.u3.ip);
181 if (!mangle_packet(skb, dataoff, dptr, datalen,
182 poff, plen, buffer, buflen))
183 return NF_DROP;
184 }
185
186 /* The received= parameter (RFC 2361) contains the address
187 * from which the server received the request. */
188 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
189 "received=", &poff, &plen,
190 &addr) > 0 &&
191 addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
192 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
193 buflen = sprintf(buffer, "%pI4",
194 &ct->tuplehash[!dir].tuple.src.u3.ip);
195 if (!mangle_packet(skb, dataoff, dptr, datalen,
196 poff, plen, buffer, buflen))
197 return NF_DROP;
198 }
199
200 /* The rport= parameter (RFC 3581) contains the port number
201 * from which the server received the request. */
202 if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
203 "rport=", &poff, &plen,
204 &n) > 0 &&
205 htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
206 htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
207 __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
208 buflen = sprintf(buffer, "%u", ntohs(p));
209 if (!mangle_packet(skb, dataoff, dptr, datalen,
210 poff, plen, buffer, buflen))
211 return NF_DROP;
212 }
213 }
214
215next:
216 /* Translate Contact headers */
217 coff = 0;
218 in_header = 0;
219 while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
220 SIP_HDR_CONTACT, &in_header,
221 &matchoff, &matchlen,
222 &addr, &port) > 0) {
223 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
224 &addr, port))
225 return NF_DROP;
226 }
227
228 if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
229 !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
230 return NF_DROP;
231
232 return NF_ACCEPT;
233}
234
235static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
236{
237 enum ip_conntrack_info ctinfo;
238 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
239 const struct tcphdr *th;
240
241 if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
242 return;
243
244 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
245 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
246}
247
248/* Handles expected signalling connections and media streams */
249static void ip_nat_sip_expected(struct nf_conn *ct,
250 struct nf_conntrack_expect *exp)
251{
252 struct nf_nat_ipv4_range range;
253
254 /* This must be a fresh one. */
255 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
256
257 /* For DST manip, map port here to where it's expected. */
258 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
259 range.min = range.max = exp->saved_proto;
260 range.min_ip = range.max_ip = exp->saved_ip;
261 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
262
263 /* Change src to where master sends to, but only if the connection
264 * actually came from the same source. */
265 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip ==
266 ct->master->tuplehash[exp->dir].tuple.src.u3.ip) {
267 range.flags = NF_NAT_RANGE_MAP_IPS;
268 range.min_ip = range.max_ip
269 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
270 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
271 }
272}
273
274static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
275 const char **dptr, unsigned int *datalen,
276 struct nf_conntrack_expect *exp,
277 unsigned int matchoff,
278 unsigned int matchlen)
279{
280 enum ip_conntrack_info ctinfo;
281 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
282 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
283 __be32 newip;
284 u_int16_t port;
285 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
286 unsigned int buflen;
287
288 /* Connection will come from reply */
289 if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
290 newip = exp->tuple.dst.u3.ip;
291 else
292 newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
293
294 /* If the signalling port matches the connection's source port in the
295 * original direction, try to use the destination port in the opposite
296 * direction. */
297 if (exp->tuple.dst.u.udp.port ==
298 ct->tuplehash[dir].tuple.src.u.udp.port)
299 port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
300 else
301 port = ntohs(exp->tuple.dst.u.udp.port);
302
303 exp->saved_ip = exp->tuple.dst.u3.ip;
304 exp->tuple.dst.u3.ip = newip;
305 exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
306 exp->dir = !dir;
307 exp->expectfn = ip_nat_sip_expected;
308
309 for (; port != 0; port++) {
310 int ret;
311
312 exp->tuple.dst.u.udp.port = htons(port);
313 ret = nf_ct_expect_related(exp);
314 if (ret == 0)
315 break;
316 else if (ret != -EBUSY) {
317 port = 0;
318 break;
319 }
320 }
321
322 if (port == 0)
323 return NF_DROP;
324
325 if (exp->tuple.dst.u3.ip != exp->saved_ip ||
326 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
327 buflen = sprintf(buffer, "%pI4:%u", &newip, port);
328 if (!mangle_packet(skb, dataoff, dptr, datalen,
329 matchoff, matchlen, buffer, buflen))
330 goto err;
331 }
332 return NF_ACCEPT;
333
334err:
335 nf_ct_unexpect_related(exp);
336 return NF_DROP;
337}
338
339static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
340 const char **dptr, unsigned int *datalen)
341{
342 enum ip_conntrack_info ctinfo;
343 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
344 unsigned int matchoff, matchlen;
345 char buffer[sizeof("65536")];
346 int buflen, c_len;
347
348 /* Get actual SDP length */
349 if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
350 SDP_HDR_VERSION, SDP_HDR_UNSPEC,
351 &matchoff, &matchlen) <= 0)
352 return 0;
353 c_len = *datalen - matchoff + strlen("v=");
354
355 /* Now, update SDP length */
356 if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
357 &matchoff, &matchlen) <= 0)
358 return 0;
359
360 buflen = sprintf(buffer, "%u", c_len);
361 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
362 buffer, buflen);
363}
364
365static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
366 const char **dptr, unsigned int *datalen,
367 unsigned int sdpoff,
368 enum sdp_header_types type,
369 enum sdp_header_types term,
370 char *buffer, int buflen)
371{
372 enum ip_conntrack_info ctinfo;
373 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
374 unsigned int matchlen, matchoff;
375
376 if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
377 &matchoff, &matchlen) <= 0)
378 return -ENOENT;
379 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
380 buffer, buflen) ? 0 : -EINVAL;
381}
382
383static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
384 const char **dptr, unsigned int *datalen,
385 unsigned int sdpoff,
386 enum sdp_header_types type,
387 enum sdp_header_types term,
388 const union nf_inet_addr *addr)
389{
390 char buffer[sizeof("nnn.nnn.nnn.nnn")];
391 unsigned int buflen;
392
393 buflen = sprintf(buffer, "%pI4", &addr->ip);
394 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
395 buffer, buflen))
396 return 0;
397
398 return mangle_content_len(skb, dataoff, dptr, datalen);
399}
400
401static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
402 const char **dptr, unsigned int *datalen,
403 unsigned int matchoff,
404 unsigned int matchlen,
405 u_int16_t port)
406{
407 char buffer[sizeof("nnnnn")];
408 unsigned int buflen;
409
410 buflen = sprintf(buffer, "%u", port);
411 if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
412 buffer, buflen))
413 return 0;
414
415 return mangle_content_len(skb, dataoff, dptr, datalen);
416}
417
418static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
419 const char **dptr, unsigned int *datalen,
420 unsigned int sdpoff,
421 const union nf_inet_addr *addr)
422{
423 char buffer[sizeof("nnn.nnn.nnn.nnn")];
424 unsigned int buflen;
425
426 /* Mangle session description owner and contact addresses */
427 buflen = sprintf(buffer, "%pI4", &addr->ip);
428 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
429 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
430 buffer, buflen))
431 return 0;
432
433 switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
434 SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
435 buffer, buflen)) {
436 case 0:
437 /*
438 * RFC 2327:
439 *
440 * Session description
441 *
442 * c=* (connection information - not required if included in all media)
443 */
444 case -ENOENT:
445 break;
446 default:
447 return 0;
448 }
449
450 return mangle_content_len(skb, dataoff, dptr, datalen);
451}
452
453/* So, this packet has hit the connection tracking matching code.
454 Mangle it, and change the expectation to match the new version. */
455static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
456 const char **dptr, unsigned int *datalen,
457 struct nf_conntrack_expect *rtp_exp,
458 struct nf_conntrack_expect *rtcp_exp,
459 unsigned int mediaoff,
460 unsigned int medialen,
461 union nf_inet_addr *rtp_addr)
462{
463 enum ip_conntrack_info ctinfo;
464 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
465 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
466 u_int16_t port;
467
468 /* Connection will come from reply */
469 if (ct->tuplehash[dir].tuple.src.u3.ip ==
470 ct->tuplehash[!dir].tuple.dst.u3.ip)
471 rtp_addr->ip = rtp_exp->tuple.dst.u3.ip;
472 else
473 rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
474
475 rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip;
476 rtp_exp->tuple.dst.u3.ip = rtp_addr->ip;
477 rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
478 rtp_exp->dir = !dir;
479 rtp_exp->expectfn = ip_nat_sip_expected;
480
481 rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip;
482 rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip;
483 rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
484 rtcp_exp->dir = !dir;
485 rtcp_exp->expectfn = ip_nat_sip_expected;
486
487 /* Try to get same pair of ports: if not, try to change them. */
488 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
489 port != 0; port += 2) {
490 int ret;
491
492 rtp_exp->tuple.dst.u.udp.port = htons(port);
493 ret = nf_ct_expect_related(rtp_exp);
494 if (ret == -EBUSY)
495 continue;
496 else if (ret < 0) {
497 port = 0;
498 break;
499 }
500 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
501 ret = nf_ct_expect_related(rtcp_exp);
502 if (ret == 0)
503 break;
504 else if (ret != -EBUSY) {
505 nf_ct_unexpect_related(rtp_exp);
506 port = 0;
507 break;
508 }
509 }
510
511 if (port == 0)
512 goto err1;
513
514 /* Update media port. */
515 if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
516 !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
517 mediaoff, medialen, port))
518 goto err2;
519
520 return NF_ACCEPT;
521
522err2:
523 nf_ct_unexpect_related(rtp_exp);
524 nf_ct_unexpect_related(rtcp_exp);
525err1:
526 return NF_DROP;
527}
528
529static struct nf_ct_helper_expectfn sip_nat = {
530 .name = "sip",
531 .expectfn = ip_nat_sip_expected,
532};
533
534static void __exit nf_nat_sip_fini(void)
535{
536 RCU_INIT_POINTER(nf_nat_sip_hook, NULL);
537 RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, NULL);
538 RCU_INIT_POINTER(nf_nat_sip_expect_hook, NULL);
539 RCU_INIT_POINTER(nf_nat_sdp_addr_hook, NULL);
540 RCU_INIT_POINTER(nf_nat_sdp_port_hook, NULL);
541 RCU_INIT_POINTER(nf_nat_sdp_session_hook, NULL);
542 RCU_INIT_POINTER(nf_nat_sdp_media_hook, NULL);
543 nf_ct_helper_expectfn_unregister(&sip_nat);
544 synchronize_rcu();
545}
546
547static int __init nf_nat_sip_init(void)
548{
549 BUG_ON(nf_nat_sip_hook != NULL);
550 BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
551 BUG_ON(nf_nat_sip_expect_hook != NULL);
552 BUG_ON(nf_nat_sdp_addr_hook != NULL);
553 BUG_ON(nf_nat_sdp_port_hook != NULL);
554 BUG_ON(nf_nat_sdp_session_hook != NULL);
555 BUG_ON(nf_nat_sdp_media_hook != NULL);
556 RCU_INIT_POINTER(nf_nat_sip_hook, ip_nat_sip);
557 RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
558 RCU_INIT_POINTER(nf_nat_sip_expect_hook, ip_nat_sip_expect);
559 RCU_INIT_POINTER(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
560 RCU_INIT_POINTER(nf_nat_sdp_port_hook, ip_nat_sdp_port);
561 RCU_INIT_POINTER(nf_nat_sdp_session_hook, ip_nat_sdp_session);
562 RCU_INIT_POINTER(nf_nat_sdp_media_hook, ip_nat_sdp_media);
563 nf_ct_helper_expectfn_register(&sip_nat);
564 return 0;
565}
566
567module_init(nf_nat_sip_init);
568module_exit(nf_nat_sip_fini);
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
deleted file mode 100644
index 9dbb8d284f9..00000000000
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ /dev/null
@@ -1,51 +0,0 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 */
7
8#include <linux/module.h>
9#include <linux/udp.h>
10
11#include <net/netfilter/nf_conntrack_helper.h>
12#include <net/netfilter/nf_conntrack_expect.h>
13#include <net/netfilter/nf_nat_helper.h>
14#include <net/netfilter/nf_nat_rule.h>
15#include <linux/netfilter/nf_conntrack_tftp.h>
16
17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
18MODULE_DESCRIPTION("TFTP NAT helper");
19MODULE_LICENSE("GPL");
20MODULE_ALIAS("ip_nat_tftp");
21
22static unsigned int help(struct sk_buff *skb,
23 enum ip_conntrack_info ctinfo,
24 struct nf_conntrack_expect *exp)
25{
26 const struct nf_conn *ct = exp->master;
27
28 exp->saved_proto.udp.port
29 = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
30 exp->dir = IP_CT_DIR_REPLY;
31 exp->expectfn = nf_nat_follow_master;
32 if (nf_ct_expect_related(exp) != 0)
33 return NF_DROP;
34 return NF_ACCEPT;
35}
36
37static void __exit nf_nat_tftp_fini(void)
38{
39 RCU_INIT_POINTER(nf_nat_tftp_hook, NULL);
40 synchronize_rcu();
41}
42
43static int __init nf_nat_tftp_init(void)
44{
45 BUG_ON(nf_nat_tftp_hook != NULL);
46 RCU_INIT_POINTER(nf_nat_tftp_hook, help);
47 return 0;
48}
49
50module_init(nf_nat_tftp_init);
51module_exit(nf_nat_tftp_fini);
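
The deletions above (the SCTP/TCP/UDP/UDP-Lite/unknown protocol helpers, the nat table with its SNAT/DNAT targets, and the SIP and TFTP helpers) track the same rework: the NAT core stops being IPv4-only, so the family-independent replacements move out of net/ipv4, and the nf_nat_ipv4_range type used throughout these files gives way to a generic range. A sketch of the field mapping, consistent with the min_proto/max_proto hunks earlier in this diff; the min_addr/max_addr names in the generic struct are an assumption, since only min_proto/max_proto are visible here:

    /* Hypothetical conversion helper; min_addr/max_addr are assumed
     * field names of the generic struct nf_nat_range. */
    static void example_convert_range(const struct nf_nat_ipv4_range *old,
    				      struct nf_nat_range *new)
    {
    	memset(new, 0, sizeof(*new));
    	new->flags       = old->flags;
    	new->min_addr.ip = old->min_ip;	/* address range, IPv4 leg only */
    	new->max_addr.ip = old->max_ip;
    	new->min_proto   = old->min;	/* per-proto ports/ids carry over */
    	new->max_proto   = old->max;
    }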
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 6232d476f37..8f3d05424a3 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -185,10 +185,10 @@ exit:
 	return sk;
 }
 
-static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
-					  gid_t *high)
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+					  kgid_t *high)
 {
-	gid_t *data = net->ipv4.sysctl_ping_group_range;
+	kgid_t *data = net->ipv4.sysctl_ping_group_range;
 	unsigned int seq;
 
 	do {
@@ -203,19 +203,13 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
 static int ping_init_sock(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
-	gid_t group = current_egid();
-	gid_t range[2];
+	kgid_t group = current_egid();
 	struct group_info *group_info = get_current_groups();
 	int i, j, count = group_info->ngroups;
 	kgid_t low, high;
 
-	inet_get_ping_group_range_net(net, range, range+1);
-	low = make_kgid(&init_user_ns, range[0]);
-	high = make_kgid(&init_user_ns, range[1]);
-	if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low))
-		return -EACCES;
-
-	if (range[0] <= group && group <= range[1])
+	inet_get_ping_group_range_net(net, &low, &high);
+	if (gid_lte(low, group) && gid_lte(group, high))
 		return 0;
 
 	for (i = 0; i < group_info->nblocks; i++) {
@@ -845,7 +839,9 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f,
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
 		atomic_read(&sp->sk_drops), len);
 }
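
The ping hunks convert the group-range sysctl data and the check in ping_init_sock() from raw gid_t to namespace-aware kgid_t, so the make_kgid()/gid_valid() dance at socket init disappears and the range test becomes two gid_lte() calls. A small sketch of that comparison pattern; converting via init_user_ns is shown only for illustration, which namespace applies depends on the caller:

    /* Sketch: range check on kgid_t values, per the gid_lte() pattern
     * adopted above; the init_user_ns conversion is an assumption. */
    static bool example_gid_in_range(gid_t raw, kgid_t low, kgid_t high)
    {
    	kgid_t kgid = make_kgid(&init_user_ns, raw);

    	if (!gid_valid(kgid))
    		return false;
    	return gid_lte(low, kgid) && gid_lte(kgid, high);
    }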
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 957acd12250..8de53e1ddd5 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 	SNMP_MIB_SENTINEL
 };
 
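
The four new entries export the server-side TCP Fast Open counters through /proc/net/netstat. A one-line sketch of how such a LINUX_MIB_* counter is typically bumped from the TCP receive path; NET_INC_STATS_BH is the usual helper for softirq context and is assumed here rather than shown in this diff:

    /* assumed usage site, for illustration only */
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);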
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ff0f071969e..73d1e4df4bf 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,18 +131,20 @@ found:
  *	0 - deliver
  *	1 - block
  */
-static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
 {
-	int type;
+	struct icmphdr _hdr;
+	const struct icmphdr *hdr;
 
-	if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+				 sizeof(_hdr), &_hdr);
+	if (!hdr)
 		return 1;
 
-	type = icmp_hdr(skb)->type;
-	if (type < 32) {
+	if (hdr->type < 32) {
 		__u32 data = raw_sk(sk)->filter.data;
 
-		return ((1 << type) & data) != 0;
+		return ((1U << hdr->type) & data) != 0;
 	}
 
 	/* Do not block unknown ICMP types */
@@ -992,7 +994,9 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 		i, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
 }
 
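
icmp_filter() now reads the ICMP header through skb_header_pointer() instead of pskb_may_pull(), so it works on non-linear skbs without modifying them and both arguments can become const. The general idiom, mirroring the hunk above; the wrapper name is illustrative:

    /* Sketch of the skb_header_pointer() idiom: copy the header into a
     * stack buffer when it is not in the linear area; the skb is left
     * untouched. example_peek_icmp_type() is a hypothetical helper. */
    static int example_peek_icmp_type(const struct sk_buff *skb)
    {
    	struct icmphdr _hdr;
    	const struct icmphdr *hdr;

    	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
    				 sizeof(_hdr), &_hdr);
    	return hdr ? hdr->type : -1;
    }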
998 1002
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e4ba974f143..ff622069fce 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -202,11 +202,6 @@ EXPORT_SYMBOL(ip_tos2prio);
202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204 204
205static inline int rt_genid(struct net *net)
206{
207 return atomic_read(&net->ipv4.rt_genid);
208}
209
210#ifdef CONFIG_PROC_FS 205#ifdef CONFIG_PROC_FS
211static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
212{ 207{
@@ -447,27 +442,9 @@ static inline bool rt_is_expired(const struct rtable *rth)
447 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 442 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
448} 443}
449 444
450/* 445void rt_cache_flush(struct net *net)
451 * Perturbation of rt_genid by a small quantity [1..256]
452 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
453 * many times (2^24) without giving recent rt_genid.
454 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
455 */
456static void rt_cache_invalidate(struct net *net)
457{
458 unsigned char shuffle;
459
460 get_random_bytes(&shuffle, sizeof(shuffle));
461 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
462}
463
464/*
465 * delay < 0 : invalidate cache (fast : entries will be deleted later)
466 * delay >= 0 : invalidate & flush cache (can be long)
467 */
468void rt_cache_flush(struct net *net, int delay)
469{ 446{
470 rt_cache_invalidate(net); 447 rt_genid_bump(net);
471} 448}
472 449
473static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -934,12 +911,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
934 if (mtu < ip_rt_min_pmtu) 911 if (mtu < ip_rt_min_pmtu)
935 mtu = ip_rt_min_pmtu; 912 mtu = ip_rt_min_pmtu;
936 913
914 rcu_read_lock();
937 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { 915 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
938 struct fib_nh *nh = &FIB_RES_NH(res); 916 struct fib_nh *nh = &FIB_RES_NH(res);
939 917
940 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, 918 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
941 jiffies + ip_rt_mtu_expires); 919 jiffies + ip_rt_mtu_expires);
942 } 920 }
921 rcu_read_unlock();
943 return mtu; 922 return mtu;
944} 923}
945 924
@@ -956,7 +935,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
956 dst->obsolete = DST_OBSOLETE_KILL; 935 dst->obsolete = DST_OBSOLETE_KILL;
957 } else { 936 } else {
958 rt->rt_pmtu = mtu; 937 rt->rt_pmtu = mtu;
959 dst_set_expires(&rt->dst, ip_rt_mtu_expires); 938 rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
960 } 939 }
961} 940}
962 941
@@ -1132,10 +1111,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1132 const struct rtable *rt = (const struct rtable *) dst; 1111 const struct rtable *rt = (const struct rtable *) dst;
1133 unsigned int mtu = rt->rt_pmtu; 1112 unsigned int mtu = rt->rt_pmtu;
1134 1113
1135 if (mtu && time_after_eq(jiffies, rt->dst.expires)) 1114 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1136 mtu = 0;
1137
1138 if (!mtu)
1139 mtu = dst_metric_raw(dst, RTAX_MTU); 1115 mtu = dst_metric_raw(dst, RTAX_MTU);
1140 1116
1141 if (mtu && rt_is_output_route(rt)) 1117 if (mtu && rt_is_output_route(rt))
@@ -1263,7 +1239,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1263{ 1239{
1264 struct rtable *rt = (struct rtable *) dst; 1240 struct rtable *rt = (struct rtable *) dst;
1265 1241
1266 if (dst->flags & DST_NOCACHE) { 1242 if (!list_empty(&rt->rt_uncached)) {
1267 spin_lock_bh(&rt_uncached_lock); 1243 spin_lock_bh(&rt_uncached_lock);
1268 list_del(&rt->rt_uncached); 1244 list_del(&rt->rt_uncached);
1269 spin_unlock_bh(&rt_uncached_lock); 1245 spin_unlock_bh(&rt_uncached_lock);
@@ -1587,11 +1563,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1587 if (ipv4_is_zeronet(daddr)) 1563 if (ipv4_is_zeronet(daddr))
1588 goto martian_destination; 1564 goto martian_destination;
1589 1565
1590 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) { 1566 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1591 if (ipv4_is_loopback(daddr)) 1567 * and call it once if daddr or/and saddr are loopback addresses
1568 */
1569 if (ipv4_is_loopback(daddr)) {
1570 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1592 goto martian_destination; 1571 goto martian_destination;
1593 1572 } else if (ipv4_is_loopback(saddr)) {
1594 if (ipv4_is_loopback(saddr)) 1573 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1595 goto martian_source; 1574 goto martian_source;
1596 } 1575 }
1597 1576
@@ -1616,7 +1595,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1616 1595
1617 if (res.type == RTN_LOCAL) { 1596 if (res.type == RTN_LOCAL) {
1618 err = fib_validate_source(skb, saddr, daddr, tos, 1597 err = fib_validate_source(skb, saddr, daddr, tos,
1619 net->loopback_dev->ifindex, 1598 LOOPBACK_IFINDEX,
1620 dev, in_dev, &itag); 1599 dev, in_dev, &itag);
1621 if (err < 0) 1600 if (err < 0)
1622 goto martian_source_keep_err; 1601 goto martian_source_keep_err;
@@ -1892,7 +1871,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1892 1871
1893 orig_oif = fl4->flowi4_oif; 1872 orig_oif = fl4->flowi4_oif;
1894 1873
1895 fl4->flowi4_iif = net->loopback_dev->ifindex; 1874 fl4->flowi4_iif = LOOPBACK_IFINDEX;
1896 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 1875 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1897 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 1876 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1898 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 1877 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
@@ -1981,7 +1960,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1981 if (!fl4->daddr) 1960 if (!fl4->daddr)
1982 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 1961 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1983 dev_out = net->loopback_dev; 1962 dev_out = net->loopback_dev;
1984 fl4->flowi4_oif = net->loopback_dev->ifindex; 1963 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1985 res.type = RTN_LOCAL; 1964 res.type = RTN_LOCAL;
1986 flags |= RTCF_LOCAL; 1965 flags |= RTCF_LOCAL;
1987 goto make_route; 1966 goto make_route;
@@ -2028,7 +2007,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2028 } 2007 }
2029 dev_out = net->loopback_dev; 2008 dev_out = net->loopback_dev;
2030 fl4->flowi4_oif = dev_out->ifindex; 2009 fl4->flowi4_oif = dev_out->ifindex;
2031 res.fi = NULL;
2032 flags |= RTCF_LOCAL; 2010 flags |= RTCF_LOCAL;
2033 goto make_route; 2011 goto make_route;
2034 } 2012 }
@@ -2153,7 +2131,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2153EXPORT_SYMBOL_GPL(ip_route_output_flow); 2131EXPORT_SYMBOL_GPL(ip_route_output_flow);
2154 2132
2155static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2133static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2156 struct flowi4 *fl4, struct sk_buff *skb, u32 pid, 2134 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2157 u32 seq, int event, int nowait, unsigned int flags) 2135 u32 seq, int event, int nowait, unsigned int flags)
2158{ 2136{
2159 struct rtable *rt = skb_rtable(skb); 2137 struct rtable *rt = skb_rtable(skb);
@@ -2163,7 +2141,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2163 u32 error; 2141 u32 error;
2164 u32 metrics[RTAX_MAX]; 2142 u32 metrics[RTAX_MAX];
2165 2143
2166 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2144 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2167 if (nlh == NULL) 2145 if (nlh == NULL)
2168 return -EMSGSIZE; 2146 return -EMSGSIZE;
2169 2147
@@ -2323,12 +2301,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2323 rt->rt_flags |= RTCF_NOTIFY; 2301 rt->rt_flags |= RTCF_NOTIFY;
2324 2302
2325 err = rt_fill_info(net, dst, src, &fl4, skb, 2303 err = rt_fill_info(net, dst, src, &fl4, skb,
2326 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2304 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2327 RTM_NEWROUTE, 0, 0); 2305 RTM_NEWROUTE, 0, 0);
2328 if (err <= 0) 2306 if (err <= 0)
2329 goto errout_free; 2307 goto errout_free;
2330 2308
2331 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); 2309 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2332errout: 2310errout:
2333 return err; 2311 return err;
2334 2312
@@ -2344,7 +2322,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2344 2322
2345void ip_rt_multicast_event(struct in_device *in_dev) 2323void ip_rt_multicast_event(struct in_device *in_dev)
2346{ 2324{
2347 rt_cache_flush(dev_net(in_dev->dev), 0); 2325 rt_cache_flush(dev_net(in_dev->dev));
2348} 2326}
2349 2327
2350#ifdef CONFIG_SYSCTL 2328#ifdef CONFIG_SYSCTL
@@ -2353,16 +2331,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2353 size_t *lenp, loff_t *ppos) 2331 size_t *lenp, loff_t *ppos)
2354{ 2332{
2355 if (write) { 2333 if (write) {
2356 int flush_delay; 2334 rt_cache_flush((struct net *)__ctl->extra1);
2357 ctl_table ctl;
2358 struct net *net;
2359
2360 memcpy(&ctl, __ctl, sizeof(ctl));
2361 ctl.data = &flush_delay;
2362 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2363
2364 net = (struct net *)__ctl->extra1;
2365 rt_cache_flush(net, flush_delay);
2366 return 0; 2335 return 0;
2367 } 2336 }
2368 2337
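
With the delay argument gone, any write to the flush file now triggers an immediate flush and the written value is ignored. A minimal user-space sketch of the new contract (the proc path is the one wired up by sysctl_route_ops; root privileges are assumed):

#include <fcntl.h>
#include <unistd.h>

/* Flush the IPv4 routing cache. Post-patch, the kernel no longer
 * parses a delay out of the written bytes, so "1" is arbitrary. */
int flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}
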
@@ -2532,8 +2501,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
2532 2501
2533static __net_init int rt_genid_init(struct net *net) 2502static __net_init int rt_genid_init(struct net *net)
2534{ 2503{
2535 get_random_bytes(&net->ipv4.rt_genid, 2504 atomic_set(&net->rt_genid, 0);
2536 sizeof(net->ipv4.rt_genid));
2537 get_random_bytes(&net->ipv4.dev_addr_genid, 2505 get_random_bytes(&net->ipv4.dev_addr_genid,
2538 sizeof(net->ipv4.dev_addr_genid)); 2506 sizeof(net->ipv4.dev_addr_genid));
2539 return 0; 2507 return 0;
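
The per-netns rt_genid also stops being randomized at boot: it becomes a plain generation counter starting at zero, and a cache flush is presumably just a bump of that counter, roughly as below (helper name as used elsewhere in this series; treat as a sketch):

/* Sketch: invalidation by generation number. Cached dsts record the
 * genid they were created under; bumping it makes them all stale. */
static inline void rt_genid_bump(struct net *net)
{
	atomic_inc(&net->rt_genid);
}
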
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e..ba48e799b03 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
319 ireq->tstamp_ok = tcp_opt.saw_tstamp; 319 ireq->tstamp_ok = tcp_opt.saw_tstamp;
320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; 320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; 321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
322 treq->listener = NULL;
322 323
323 /* We threw the options of the initial SYN away, so we hope 324 /* We threw the options of the initial SYN away, so we hope
324 * the ACK carries the same options again (see RFC1122 4.2.3.8) 325 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1b5ce96707a..9205e492dc9 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -76,9 +76,9 @@ static int ipv4_local_port_range(ctl_table *table, int write,
76} 76}
77 77
78 78
79static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) 79static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
80{ 80{
81 gid_t *data = table->data; 81 kgid_t *data = table->data;
82 unsigned int seq; 82 unsigned int seq;
83 do { 83 do {
84 seq = read_seqbegin(&sysctl_local_ports.lock); 84 seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -89,12 +89,12 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low,
89} 89}
90 90
91/* Update system visible IP port range */ 91/* Update system visible IP port range */
92static void set_ping_group_range(struct ctl_table *table, gid_t range[2]) 92static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
93{ 93{
94 gid_t *data = table->data; 94 kgid_t *data = table->data;
95 write_seqlock(&sysctl_local_ports.lock); 95 write_seqlock(&sysctl_local_ports.lock);
96 data[0] = range[0]; 96 data[0] = low;
97 data[1] = range[1]; 97 data[1] = high;
98 write_sequnlock(&sysctl_local_ports.lock); 98 write_sequnlock(&sysctl_local_ports.lock);
99} 99}
100 100
@@ -103,21 +103,33 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
103 void __user *buffer, 103 void __user *buffer,
104 size_t *lenp, loff_t *ppos) 104 size_t *lenp, loff_t *ppos)
105{ 105{
106 struct user_namespace *user_ns = current_user_ns();
106 int ret; 107 int ret;
107 gid_t range[2]; 108 gid_t urange[2];
109 kgid_t low, high;
108 ctl_table tmp = { 110 ctl_table tmp = {
109 .data = &range, 111 .data = &urange,
110 .maxlen = sizeof(range), 112 .maxlen = sizeof(urange),
111 .mode = table->mode, 113 .mode = table->mode,
112 .extra1 = &ip_ping_group_range_min, 114 .extra1 = &ip_ping_group_range_min,
113 .extra2 = &ip_ping_group_range_max, 115 .extra2 = &ip_ping_group_range_max,
114 }; 116 };
115 117
116 inet_get_ping_group_range_table(table, range, range + 1); 118 inet_get_ping_group_range_table(table, &low, &high);
119 urange[0] = from_kgid_munged(user_ns, low);
120 urange[1] = from_kgid_munged(user_ns, high);
117 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 121 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
118 122
119 if (write && ret == 0) 123 if (write && ret == 0) {
120 set_ping_group_range(table, range); 124 low = make_kgid(user_ns, urange[0]);
125 high = make_kgid(user_ns, urange[1]);
126 if (!gid_valid(low) || !gid_valid(high) ||
127 (urange[1] < urange[0]) || gid_lt(high, low)) {
128 low = make_kgid(&init_user_ns, 1);
129 high = make_kgid(&init_user_ns, 0);
130 }
131 set_ping_group_range(table, low, high);
132 }
121 133
122 return ret; 134 return ret;
123} 135}
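
The rewritten handler round-trips the range through the writer's user namespace: kgid_t values are shown with from_kgid_munged(), and input is mapped back with make_kgid(). Anything unmappable or inverted collapses to (1, 0), a range no gid can fall inside, i.e. "nobody may ping". A condensed kernel-side sketch of that fallback, assuming only the uidgid helpers used above:

#include <linux/uidgid.h>

/* Map a user-supplied gid range into kernel gids; an invalid or
 * inverted range degrades to (1, 0), which matches no gid at all. */
static void ping_range_from_user(struct user_namespace *user_ns,
				 const gid_t urange[2],
				 kgid_t *low, kgid_t *high)
{
	*low  = make_kgid(user_ns, urange[0]);
	*high = make_kgid(user_ns, urange[1]);
	if (!gid_valid(*low) || !gid_valid(*high) ||
	    urange[1] < urange[0] || gid_lt(*high, *low)) {
		*low  = make_kgid(&init_user_ns, 1);	/* closed range */
		*high = make_kgid(&init_user_ns, 0);
	}
}
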
@@ -220,6 +232,45 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
220 return 0; 232 return 0;
221} 233}
222 234
235int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
236 size_t *lenp, loff_t *ppos)
237{
238 ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
239 struct tcp_fastopen_context *ctxt;
240 int ret;
241 u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
242
243 tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
244 if (!tbl.data)
245 return -ENOMEM;
246
247 rcu_read_lock();
248 ctxt = rcu_dereference(tcp_fastopen_ctx);
249 if (ctxt)
250 memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
251 rcu_read_unlock();
252
253 snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
254 user_key[0], user_key[1], user_key[2], user_key[3]);
255 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
256
257 if (write && ret == 0) {
258 if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
259 user_key + 2, user_key + 3) != 4) {
260 ret = -EINVAL;
261 goto bad_key;
262 }
263 tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
264 }
265
266bad_key:
267 pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
268 user_key[0], user_key[1], user_key[2], user_key[3],
269 (char *)tbl.data, ret);
270 kfree(tbl.data);
271 return ret;
272}
273
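
The key crosses procfs as four dash-separated 32-bit hex words, exactly the "%08x-%08x-%08x-%08x" format the handler prints and sscanf()s back. A user-space sketch (the proc path is the one registered in the table below; the key value is a made-up example, and the file is mode 0600 so root is required):

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/net/ipv4/tcp_fastopen_key";
	char buf[64];
	FILE *f;

	/* Install a new 16-byte key, spelled as 4 x 32-bit hex words. */
	f = fopen(path, "w");
	if (!f)
		return 1;
	fputs("00112233-44556677-8899aabb-ccddeeff", f);
	fclose(f);

	/* Read it back in the same format. */
	f = fopen(path, "r");
	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("tcp_fastopen_key = %s", buf);
	fclose(f);
	return 0;
}
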
223static struct ctl_table ipv4_table[] = { 274static struct ctl_table ipv4_table[] = {
224 { 275 {
225 .procname = "tcp_timestamps", 276 .procname = "tcp_timestamps",
@@ -374,6 +425,12 @@ static struct ctl_table ipv4_table[] = {
374 .proc_handler = proc_dointvec, 425 .proc_handler = proc_dointvec,
375 }, 426 },
376 { 427 {
428 .procname = "tcp_fastopen_key",
429 .mode = 0600,
430 .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
431 .proc_handler = proc_tcp_fastopen_key,
432 },
433 {
377 .procname = "tcp_tw_recycle", 434 .procname = "tcp_tw_recycle",
378 .data = &tcp_death_row.sysctl_tw_recycle, 435 .data = &tcp_death_row.sysctl_tw_recycle,
379 .maxlen = sizeof(int), 436 .maxlen = sizeof(int),
@@ -786,7 +843,7 @@ static struct ctl_table ipv4_net_table[] = {
786 { 843 {
787 .procname = "ping_group_range", 844 .procname = "ping_group_range",
788 .data = &init_net.ipv4.sysctl_ping_group_range, 845 .data = &init_net.ipv4.sysctl_ping_group_range,
789 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), 846 .maxlen = sizeof(gid_t)*2,
790 .mode = 0644, 847 .mode = 0644,
791 .proc_handler = ipv4_ping_group_range, 848 .proc_handler = ipv4_ping_group_range,
792 }, 849 },
@@ -830,8 +887,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
830 * Sane defaults - nobody may create ping sockets. 887 * Sane defaults - nobody may create ping sockets.
831 * Boot scripts should set this to distro-specific group. 888 * Boot scripts should set this to distro-specific group.
832 */ 889 */
833 net->ipv4.sysctl_ping_group_range[0] = 1; 890 net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
834 net->ipv4.sysctl_ping_group_range[1] = 0; 891 net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
835 892
836 tcp_init_mem(net); 893 tcp_init_mem(net);
837 894
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4a1da..f32c02e2a54 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
486 if (sk->sk_shutdown & RCV_SHUTDOWN) 486 if (sk->sk_shutdown & RCV_SHUTDOWN)
487 mask |= POLLIN | POLLRDNORM | POLLRDHUP; 487 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
488 488
489 /* Connected? */ 489 /* Connected or passive Fast Open socket? */
490 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { 490 if (sk->sk_state != TCP_SYN_SENT &&
491 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
491 int target = sock_rcvlowat(sk, 0, INT_MAX); 492 int target = sock_rcvlowat(sk, 0, INT_MAX);
492 493
493 if (tp->urg_seq == tp->copied_seq && 494 if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
840 ssize_t copied; 841 ssize_t copied;
841 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 842 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
842 843
843 /* Wait for a connection to finish. */ 844 /* Wait for a connection to finish. One exception is TCP Fast Open
844 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 845 * (passive side) where data is allowed to be sent before a connection
846 * is fully established.
847 */
848 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
849 !tcp_passive_fastopen(sk)) {
845 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 850 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
846 goto out_err; 851 goto out_err;
852 }
847 853
848 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 854 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
849 855
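
Both write paths gate the wait on tcp_passive_fastopen(). That helper lives in include/net/tcp.h in this series; as assumed here, it reduces to checking for a SYN_RECV socket that carries a Fast Open request:

/* Sketch of the helper both hunks call: a passive Fast Open socket is
 * one still in SYN_RECV but created from a TFO SYN, so it may send. */
static inline bool tcp_passive_fastopen(const struct sock *sk)
{
	return (sk->sk_state == TCP_SYN_RECV &&
		tcp_sk(sk)->fastopen_rsk != NULL);
}
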
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1042 1048
1043 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1049 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1044 1050
1045 /* Wait for a connection to finish. */ 1051 /* Wait for a connection to finish. One exception is TCP Fast Open
1046 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 1052 * (passive side) where data is allowed to be sent before a connection
1053 * is fully established.
1054 */
1055 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1056 !tcp_passive_fastopen(sk)) {
1047 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 1057 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1048 goto do_error; 1058 goto do_error;
1059 }
1049 1060
1050 if (unlikely(tp->repair)) { 1061 if (unlikely(tp->repair)) {
1051 if (tp->repair_queue == TCP_RECV_QUEUE) { 1062 if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1139,78 +1150,43 @@ new_segment:
1139 if (err) 1150 if (err)
1140 goto do_fault; 1151 goto do_fault;
1141 } else { 1152 } else {
1142 bool merge = false; 1153 bool merge = true;
1143 int i = skb_shinfo(skb)->nr_frags; 1154 int i = skb_shinfo(skb)->nr_frags;
1144 struct page *page = sk->sk_sndmsg_page; 1155 struct page_frag *pfrag = sk_page_frag(sk);
1145 int off; 1156
1146 1157 if (!sk_page_frag_refill(sk, pfrag))
1147 if (page && page_count(page) == 1) 1158 goto wait_for_memory;
1148 sk->sk_sndmsg_off = 0; 1159
1149 1160 if (!skb_can_coalesce(skb, i, pfrag->page,
1150 off = sk->sk_sndmsg_off; 1161 pfrag->offset)) {
1151 1162 if (i == MAX_SKB_FRAGS || !sg) {
1152 if (skb_can_coalesce(skb, i, page, off) && 1163 tcp_mark_push(tp, skb);
1153 off != PAGE_SIZE) { 1164 goto new_segment;
1154 /* We can extend the last page
1155 * fragment. */
1156 merge = true;
1157 } else if (i == MAX_SKB_FRAGS || !sg) {
1158 /* Need to add new fragment and cannot
1159 * do this because interface is non-SG,
1160 * or because all the page slots are
1161 * busy. */
1162 tcp_mark_push(tp, skb);
1163 goto new_segment;
1164 } else if (page) {
1165 if (off == PAGE_SIZE) {
1166 put_page(page);
1167 sk->sk_sndmsg_page = page = NULL;
1168 off = 0;
1169 } 1165 }
1170 } else 1166 merge = false;
1171 off = 0; 1167 }
1172 1168
1173 if (copy > PAGE_SIZE - off) 1169 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1174 copy = PAGE_SIZE - off;
1175 1170
1176 if (!sk_wmem_schedule(sk, copy)) 1171 if (!sk_wmem_schedule(sk, copy))
1177 goto wait_for_memory; 1172 goto wait_for_memory;
1178 1173
1179 if (!page) {
1180 /* Allocate new cache page. */
1181 if (!(page = sk_stream_alloc_page(sk)))
1182 goto wait_for_memory;
1183 }
1184
1185 /* Time to copy data. We are close to
1186 * the end! */
1187 err = skb_copy_to_page_nocache(sk, from, skb, 1174 err = skb_copy_to_page_nocache(sk, from, skb,
1188 page, off, copy); 1175 pfrag->page,
1189 if (err) { 1176 pfrag->offset,
1190 /* If this page was new, give it to the 1177 copy);
1191 * socket so it does not get leaked. 1178 if (err)
1192 */
1193 if (!sk->sk_sndmsg_page) {
1194 sk->sk_sndmsg_page = page;
1195 sk->sk_sndmsg_off = 0;
1196 }
1197 goto do_error; 1179 goto do_error;
1198 }
1199 1180
1200 /* Update the skb. */ 1181 /* Update the skb. */
1201 if (merge) { 1182 if (merge) {
1202 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1183 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1203 } else { 1184 } else {
1204 skb_fill_page_desc(skb, i, page, off, copy); 1185 skb_fill_page_desc(skb, i, pfrag->page,
1205 if (sk->sk_sndmsg_page) { 1186 pfrag->offset, copy);
1206 get_page(page); 1187 get_page(pfrag->page);
1207 } else if (off + copy < PAGE_SIZE) {
1208 get_page(page);
1209 sk->sk_sndmsg_page = page;
1210 }
1211 } 1188 }
1212 1189 pfrag->offset += copy;
1213 sk->sk_sndmsg_off = off + copy;
1214 } 1190 }
1215 1191
1216 if (!copied) 1192 if (!copied)
@@ -1762,8 +1738,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1762 } 1738 }
1763 1739
1764#ifdef CONFIG_NET_DMA 1740#ifdef CONFIG_NET_DMA
1765 if (tp->ucopy.dma_chan) 1741 if (tp->ucopy.dma_chan) {
1766 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); 1742 if (tp->rcv_wnd == 0 &&
1743 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1744 tcp_service_net_dma(sk, true);
1745 tcp_cleanup_rbuf(sk, copied);
1746 } else
1747 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1748 }
1767#endif 1749#endif
1768 if (copied >= target) { 1750 if (copied >= target) {
1769 /* Do not sleep, just process backlog. */ 1751 /* Do not sleep, just process backlog. */
@@ -2144,6 +2126,10 @@ void tcp_close(struct sock *sk, long timeout)
2144 * they look as CLOSING or LAST_ACK for Linux) 2126 * they look as CLOSING or LAST_ACK for Linux)
2145 * Probably, I missed some more holelets. 2127 * Probably, I missed some more holelets.
2146 * --ANK 2128 * --ANK
2129 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2130 * in a single packet! (May consider it later but will
2131 * probably need API support or TCP_CORK SYN-ACK until
2132 * data is written and socket is closed.)
2147 */ 2133 */
2148 tcp_send_fin(sk); 2134 tcp_send_fin(sk);
2149 } 2135 }
@@ -2215,8 +2201,16 @@ adjudge_to_death:
2215 } 2201 }
2216 } 2202 }
2217 2203
2218 if (sk->sk_state == TCP_CLOSE) 2204 if (sk->sk_state == TCP_CLOSE) {
2205 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2206 /* We could get here with a non-NULL req if the socket is
2207 * aborted (e.g., closed with unread data) before 3WHS
2208 * finishes.
2209 */
2210 if (req != NULL)
2211 reqsk_fastopen_remove(sk, req, false);
2219 inet_csk_destroy_sock(sk); 2212 inet_csk_destroy_sock(sk);
2213 }
2220 /* Otherwise, socket is reprieved until protocol close. */ 2214 /* Otherwise, socket is reprieved until protocol close. */
2221 2215
2222out: 2216out:
@@ -2302,6 +2296,13 @@ int tcp_disconnect(struct sock *sk, int flags)
2302} 2296}
2303EXPORT_SYMBOL(tcp_disconnect); 2297EXPORT_SYMBOL(tcp_disconnect);
2304 2298
2299void tcp_sock_destruct(struct sock *sk)
2300{
2301 inet_sock_destruct(sk);
2302
2303 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2304}
2305
2305static inline bool tcp_can_repair_sock(const struct sock *sk) 2306static inline bool tcp_can_repair_sock(const struct sock *sk)
2306{ 2307{
2307 return capable(CAP_NET_ADMIN) && 2308 return capable(CAP_NET_ADMIN) &&
@@ -2325,10 +2326,17 @@ static int tcp_repair_options_est(struct tcp_sock *tp,
2325 tp->rx_opt.mss_clamp = opt.opt_val; 2326 tp->rx_opt.mss_clamp = opt.opt_val;
2326 break; 2327 break;
2327 case TCPOPT_WINDOW: 2328 case TCPOPT_WINDOW:
2328 if (opt.opt_val > 14) 2329 {
2329 return -EFBIG; 2330 u16 snd_wscale = opt.opt_val & 0xFFFF;
2331 u16 rcv_wscale = opt.opt_val >> 16;
2332
2333 if (snd_wscale > 14 || rcv_wscale > 14)
2334 return -EFBIG;
2330 2335
2331 tp->rx_opt.snd_wscale = opt.opt_val; 2336 tp->rx_opt.snd_wscale = snd_wscale;
2337 tp->rx_opt.rcv_wscale = rcv_wscale;
2338 tp->rx_opt.wscale_ok = 1;
2339 }
2332 break; 2340 break;
2333 case TCPOPT_SACK_PERM: 2341 case TCPOPT_SACK_PERM:
2334 if (opt.opt_val != 0) 2342 if (opt.opt_val != 0)
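
For TCP_REPAIR_OPTIONS users (e.g. checkpoint/restore tools) this widens TCPOPT_WINDOW from a single scale to both: send scale in the low 16 bits, receive scale in the high 16 bits, each capped at 14. A hedged user-space sketch (struct tcp_repair_opt is UAPI; TCPOPT_WINDOW is kernel-internal, so it is defined locally here):

#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>	/* struct tcp_repair_opt, TCP_REPAIR_OPTIONS */

#ifndef TCPOPT_WINDOW
#define TCPOPT_WINDOW 3	/* window scale option kind */
#endif

/* Restore both window scales on a socket already in repair mode. */
int repair_set_wscale(int fd, __u16 snd_wscale, __u16 rcv_wscale)
{
	struct tcp_repair_opt opt = {
		.opt_code = TCPOPT_WINDOW,
		.opt_val  = snd_wscale | ((__u32)rcv_wscale << 16),
	};

	return setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_OPTIONS,
			  &opt, sizeof(opt));
}
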
@@ -2688,6 +2696,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2688 else 2696 else
2689 icsk->icsk_user_timeout = msecs_to_jiffies(val); 2697 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2690 break; 2698 break;
2699
2700 case TCP_FASTOPEN:
2701 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2702 TCPF_LISTEN)))
2703 err = fastopen_init_queue(sk, val);
2704 else
2705 err = -EINVAL;
2706 break;
2691 default: 2707 default:
2692 err = -ENOPROTOOPT; 2708 err = -ENOPROTOOPT;
2693 break; 2709 break;
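
fastopen_init_queue() sizes the queue of pending Fast Open requests, which is why the option is only accepted on a CLOSE or LISTEN socket. Server-side enablement then looks roughly like this from user space (TCP_FASTOPEN is the option value from this series' UAPI headers, guarded here in case libc headers predate it):

#include <netinet/in.h>
#include <sys/socket.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN 23	/* from this series' uapi tcp.h */
#endif

/* Enable passive TFO on a listener; qlen bounds pending TFO requests. */
int enable_tfo(int listen_fd, int qlen)
{
	return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN,
			  &qlen, sizeof(qlen));
}
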
@@ -3501,11 +3517,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
3501 3517
3502void tcp_done(struct sock *sk) 3518void tcp_done(struct sock *sk)
3503{ 3519{
3520 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3521
3504 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3522 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3505 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 3523 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3506 3524
3507 tcp_set_state(sk, TCP_CLOSE); 3525 tcp_set_state(sk, TCP_CLOSE);
3508 tcp_clear_xmit_timers(sk); 3526 tcp_clear_xmit_timers(sk);
3527 if (req != NULL)
3528 reqsk_fastopen_remove(sk, req, false);
3509 3529
3510 sk->sk_shutdown = SHUTDOWN_MASK; 3530 sk->sk_shutdown = SHUTDOWN_MASK;
3511 3531
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c409d..8f7ef0ad80e 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,91 @@
1#include <linux/err.h>
1#include <linux/init.h> 2#include <linux/init.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/list.h>
5#include <linux/tcp.h>
6#include <linux/rcupdate.h>
7#include <linux/rculist.h>
8#include <net/inetpeer.h>
9#include <net/tcp.h>
3 10
4int sysctl_tcp_fastopen; 11int sysctl_tcp_fastopen __read_mostly;
12
13struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
14
15static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
16
17static void tcp_fastopen_ctx_free(struct rcu_head *head)
18{
19 struct tcp_fastopen_context *ctx =
20 container_of(head, struct tcp_fastopen_context, rcu);
21 crypto_free_cipher(ctx->tfm);
22 kfree(ctx);
23}
24
25int tcp_fastopen_reset_cipher(void *key, unsigned int len)
26{
27 int err;
28 struct tcp_fastopen_context *ctx, *octx;
29
30 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
31 if (!ctx)
32 return -ENOMEM;
33 ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
34
35 if (IS_ERR(ctx->tfm)) {
36 err = PTR_ERR(ctx->tfm);
37error: kfree(ctx);
38 pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
39 return err;
40 }
41 err = crypto_cipher_setkey(ctx->tfm, key, len);
42 if (err) {
43 pr_err("TCP: TFO cipher key error: %d\n", err);
44 crypto_free_cipher(ctx->tfm);
45 goto error;
46 }
47 memcpy(ctx->key, key, len);
48
49 spin_lock(&tcp_fastopen_ctx_lock);
50
51 octx = rcu_dereference_protected(tcp_fastopen_ctx,
52 lockdep_is_held(&tcp_fastopen_ctx_lock));
53 rcu_assign_pointer(tcp_fastopen_ctx, ctx);
54 spin_unlock(&tcp_fastopen_ctx_lock);
55
56 if (octx)
57 call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
58 return err;
59}
60
61/* Computes the fastopen cookie for the peer.
62 * The peer address is 128 bits long (padded with zeros for IPv4).
63 *
64 * The caller must check foc->len to determine if a valid cookie
65 * has been generated successfully.
66 */
67void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
68{
69 __be32 peer_addr[4] = { addr, 0, 0, 0 };
70 struct tcp_fastopen_context *ctx;
71
72 rcu_read_lock();
73 ctx = rcu_dereference(tcp_fastopen_ctx);
74 if (ctx) {
75 crypto_cipher_encrypt_one(ctx->tfm,
76 foc->val,
77 (__u8 *)peer_addr);
78 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
79 }
80 rcu_read_unlock();
81}
5 82
6static int __init tcp_fastopen_init(void) 83static int __init tcp_fastopen_init(void)
7{ 84{
85 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
86
87 get_random_bytes(key, sizeof(key));
88 tcp_fastopen_reset_cipher(key, sizeof(key));
8 return 0; 89 return 0;
9} 90}
10 91
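
Since the cookie is just AES(key, peer-address-padded-to-128-bits), the server can validate one statelessly by recomputing it. A sketch of that check, mirroring how tcp_fastopen_cookie_gen() is used later in this patch (in tcp_ipv4.c's tcp_fastopen_check); names are those from the code above:

#include <linux/string.h>
#include <net/tcp.h>

/* Recompute the expected cookie for this peer and compare it with the
 * one the client echoed back; no per-client server state is needed. */
static bool fastopen_cookie_ok(__be32 peer,
			       const struct tcp_fastopen_cookie *foc)
{
	struct tcp_fastopen_cookie expected = { .len = -1 };

	tcp_fastopen_cookie_gen(peer, &expected);
	return expected.len == TCP_FASTOPEN_COOKIE_SIZE &&
	       foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
	       !memcmp(expected.val, foc->val, TCP_FASTOPEN_COOKIE_SIZE);
}
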
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 85308b90df8..432c36649db 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,7 +237,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
237 tcp_enter_quickack_mode((struct sock *)tp); 237 tcp_enter_quickack_mode((struct sock *)tp);
238 break; 238 break;
239 case INET_ECN_CE: 239 case INET_ECN_CE:
240 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 240 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
241 /* Better not delay acks, sender can have a very low cwnd */
242 tcp_enter_quickack_mode((struct sock *)tp);
243 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
244 }
241 /* fall through */ 245 /* fall through */
242 default: 246 default:
243 tp->ecn_flags |= TCP_ECN_SEEN; 247 tp->ecn_flags |= TCP_ECN_SEEN;
@@ -374,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
374/* 4. Try to fixup all. It is made immediately after connection enters 378/* 4. Try to fixup all. It is made immediately after connection enters
375 * established state. 379 * established state.
376 */ 380 */
377static void tcp_init_buffer_space(struct sock *sk) 381void tcp_init_buffer_space(struct sock *sk)
378{ 382{
379 struct tcp_sock *tp = tcp_sk(sk); 383 struct tcp_sock *tp = tcp_sk(sk);
380 int maxwin; 384 int maxwin;
@@ -739,29 +743,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
739 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 743 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
740} 744}
741 745
742/* Set slow start threshold and cwnd not falling to slow start */
743void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
744{
745 struct tcp_sock *tp = tcp_sk(sk);
746 const struct inet_connection_sock *icsk = inet_csk(sk);
747
748 tp->prior_ssthresh = 0;
749 tp->bytes_acked = 0;
750 if (icsk->icsk_ca_state < TCP_CA_CWR) {
751 tp->undo_marker = 0;
752 if (set_ssthresh)
753 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
754 tp->snd_cwnd = min(tp->snd_cwnd,
755 tcp_packets_in_flight(tp) + 1U);
756 tp->snd_cwnd_cnt = 0;
757 tp->high_seq = tp->snd_nxt;
758 tp->snd_cwnd_stamp = tcp_time_stamp;
759 TCP_ECN_queue_cwr(tp);
760
761 tcp_set_ca_state(sk, TCP_CA_CWR);
762 }
763}
764
765/* 746/*
766 * Packet counting of FACK is based on in-order assumptions, therefore TCP 747 * Packet counting of FACK is based on in-order assumptions, therefore TCP
767 * disables it when reordering is detected 748 * disables it when reordering is detected
@@ -2489,35 +2470,6 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
2489 tp->snd_cwnd_stamp = tcp_time_stamp; 2470 tp->snd_cwnd_stamp = tcp_time_stamp;
2490} 2471}
2491 2472
2492/* Lower bound on congestion window is slow start threshold
2493 * unless congestion avoidance choice decides to overide it.
2494 */
2495static inline u32 tcp_cwnd_min(const struct sock *sk)
2496{
2497 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
2498
2499 return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
2500}
2501
2502/* Decrease cwnd each second ack. */
2503static void tcp_cwnd_down(struct sock *sk, int flag)
2504{
2505 struct tcp_sock *tp = tcp_sk(sk);
2506 int decr = tp->snd_cwnd_cnt + 1;
2507
2508 if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
2509 (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
2510 tp->snd_cwnd_cnt = decr & 1;
2511 decr >>= 1;
2512
2513 if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
2514 tp->snd_cwnd -= decr;
2515
2516 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
2517 tp->snd_cwnd_stamp = tcp_time_stamp;
2518 }
2519}
2520
2521/* Nothing was retransmitted or returned timestamp is less 2473/* Nothing was retransmitted or returned timestamp is less
2522 * than timestamp of the first retransmission. 2474 * than timestamp of the first retransmission.
2523 */ 2475 */
@@ -2719,24 +2671,80 @@ static bool tcp_try_undo_loss(struct sock *sk)
2719 return false; 2671 return false;
2720} 2672}
2721 2673
2722static inline void tcp_complete_cwr(struct sock *sk) 2674/* The cwnd reduction in CWR and Recovery uses the PRR algorithm
2675 * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
2676 * It computes the number of packets to send (sndcnt) based on packets newly
2677 * delivered:
2678 * 1) If the packets in flight are larger than ssthresh, PRR spreads the
2679 * cwnd reductions across a full RTT.
2680 * 2) If packets in flight are lower than ssthresh (such as due to excess
2681 * losses and/or application stalls), do not perform any further cwnd
2682 * reductions, but instead slow start up to ssthresh.
2683 */
2684static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2723{ 2685{
2724 struct tcp_sock *tp = tcp_sk(sk); 2686 struct tcp_sock *tp = tcp_sk(sk);
2725 2687
2726 /* Do not moderate cwnd if it's already undone in cwr or recovery. */ 2688 tp->high_seq = tp->snd_nxt;
2727 if (tp->undo_marker) { 2689 tp->bytes_acked = 0;
2728 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { 2690 tp->snd_cwnd_cnt = 0;
2729 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 2691 tp->prior_cwnd = tp->snd_cwnd;
2730 tp->snd_cwnd_stamp = tcp_time_stamp; 2692 tp->prr_delivered = 0;
2731 } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { 2693 tp->prr_out = 0;
2732 /* PRR algorithm. */ 2694 if (set_ssthresh)
2733 tp->snd_cwnd = tp->snd_ssthresh; 2695 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2734 tp->snd_cwnd_stamp = tcp_time_stamp; 2696 TCP_ECN_queue_cwr(tp);
2735 } 2697}
2698
2699static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
2700 int fast_rexmit)
2701{
2702 struct tcp_sock *tp = tcp_sk(sk);
2703 int sndcnt = 0;
2704 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2705
2706 tp->prr_delivered += newly_acked_sacked;
2707 if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2708 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2709 tp->prior_cwnd - 1;
2710 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2711 } else {
2712 sndcnt = min_t(int, delta,
2713 max_t(int, tp->prr_delivered - tp->prr_out,
2714 newly_acked_sacked) + 1);
2715 }
2716
2717 sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2718 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2719}
2720
2721static inline void tcp_end_cwnd_reduction(struct sock *sk)
2722{
2723 struct tcp_sock *tp = tcp_sk(sk);
2724
2725 /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
2726 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
2727 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
2728 tp->snd_cwnd = tp->snd_ssthresh;
2729 tp->snd_cwnd_stamp = tcp_time_stamp;
2736 } 2730 }
2737 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2731 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2738} 2732}
2739 2733
2734/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2735void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
2736{
2737 struct tcp_sock *tp = tcp_sk(sk);
2738
2739 tp->prior_ssthresh = 0;
2740 tp->bytes_acked = 0;
2741 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2742 tp->undo_marker = 0;
2743 tcp_init_cwnd_reduction(sk, set_ssthresh);
2744 tcp_set_ca_state(sk, TCP_CA_CWR);
2745 }
2746}
2747
2740static void tcp_try_keep_open(struct sock *sk) 2748static void tcp_try_keep_open(struct sock *sk)
2741{ 2749{
2742 struct tcp_sock *tp = tcp_sk(sk); 2750 struct tcp_sock *tp = tcp_sk(sk);
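
A worked pass through tcp_cwnd_reduction(): take prior_cwnd = 20, ssthresh = 10, prr_delivered = 4, prr_out = 1, and 12 packets still in flight. Being above ssthresh we take the proportional branch: sndcnt = floor((10*4 + 20 - 1) / 20) - 1 = 1, so cwnd becomes 12 + 1 = 13 and the reduction is spread across the RTT. The proportional step in isolation, with the same field meanings:

#include <stdint.h>

/* PRR proportional branch: packets permitted out on this ACK while
 * in_flight > ssthresh, so cwnd glides down to ssthresh in ~1 RTT. */
int prr_sndcnt(uint32_t ssthresh, uint32_t prr_delivered,
	       uint32_t prr_out, uint32_t prior_cwnd)
{
	uint64_t dividend = (uint64_t)ssthresh * prr_delivered +
			    prior_cwnd - 1;

	return (int)(dividend / prior_cwnd) - (int)prr_out;
}
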
@@ -2751,7 +2759,7 @@ static void tcp_try_keep_open(struct sock *sk)
2751 } 2759 }
2752} 2760}
2753 2761
2754static void tcp_try_to_open(struct sock *sk, int flag) 2762static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
2755{ 2763{
2756 struct tcp_sock *tp = tcp_sk(sk); 2764 struct tcp_sock *tp = tcp_sk(sk);
2757 2765
@@ -2768,7 +2776,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
2768 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2776 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2769 tcp_moderate_cwnd(tp); 2777 tcp_moderate_cwnd(tp);
2770 } else { 2778 } else {
2771 tcp_cwnd_down(sk, flag); 2779 tcp_cwnd_reduction(sk, newly_acked_sacked, 0);
2772 } 2780 }
2773} 2781}
2774 2782
@@ -2850,38 +2858,6 @@ void tcp_simple_retransmit(struct sock *sk)
2850} 2858}
2851EXPORT_SYMBOL(tcp_simple_retransmit); 2859EXPORT_SYMBOL(tcp_simple_retransmit);
2852 2860
2853/* This function implements the PRR algorithm, specifically the PRR-SSRB
2854 * (proportional rate reduction with slow start reduction bound) as described in
2855 * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
2856 * It computes the number of packets to send (sndcnt) based on packets newly
2857 * delivered:
2858 * 1) If the packets in flight is larger than ssthresh, PRR spreads the
2859 * cwnd reductions across a full RTT.
2860 * 2) If packets in flight is lower than ssthresh (such as due to excess
2861 * losses and/or application stalls), do not perform any further cwnd
2862 * reductions, but instead slow start up to ssthresh.
2863 */
2864static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
2865 int fast_rexmit, int flag)
2866{
2867 struct tcp_sock *tp = tcp_sk(sk);
2868 int sndcnt = 0;
2869 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2870
2871 if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2872 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2873 tp->prior_cwnd - 1;
2874 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2875 } else {
2876 sndcnt = min_t(int, delta,
2877 max_t(int, tp->prr_delivered - tp->prr_out,
2878 newly_acked_sacked) + 1);
2879 }
2880
2881 sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2882 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2883}
2884
2885static void tcp_enter_recovery(struct sock *sk, bool ece_ack) 2861static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2886{ 2862{
2887 struct tcp_sock *tp = tcp_sk(sk); 2863 struct tcp_sock *tp = tcp_sk(sk);
@@ -2894,7 +2870,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2894 2870
2895 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2871 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2896 2872
2897 tp->high_seq = tp->snd_nxt;
2898 tp->prior_ssthresh = 0; 2873 tp->prior_ssthresh = 0;
2899 tp->undo_marker = tp->snd_una; 2874 tp->undo_marker = tp->snd_una;
2900 tp->undo_retrans = tp->retrans_out; 2875 tp->undo_retrans = tp->retrans_out;
@@ -2902,15 +2877,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2902 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2877 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2903 if (!ece_ack) 2878 if (!ece_ack)
2904 tp->prior_ssthresh = tcp_current_ssthresh(sk); 2879 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2905 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); 2880 tcp_init_cwnd_reduction(sk, true);
2906 TCP_ECN_queue_cwr(tp);
2907 } 2881 }
2908
2909 tp->bytes_acked = 0;
2910 tp->snd_cwnd_cnt = 0;
2911 tp->prior_cwnd = tp->snd_cwnd;
2912 tp->prr_delivered = 0;
2913 tp->prr_out = 0;
2914 tcp_set_ca_state(sk, TCP_CA_Recovery); 2882 tcp_set_ca_state(sk, TCP_CA_Recovery);
2915} 2883}
2916 2884
@@ -2926,13 +2894,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2926 * tcp_xmit_retransmit_queue(). 2894 * tcp_xmit_retransmit_queue().
2927 */ 2895 */
2928static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, 2896static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2929 int newly_acked_sacked, bool is_dupack, 2897 int prior_sacked, bool is_dupack,
2930 int flag) 2898 int flag)
2931{ 2899{
2932 struct inet_connection_sock *icsk = inet_csk(sk); 2900 struct inet_connection_sock *icsk = inet_csk(sk);
2933 struct tcp_sock *tp = tcp_sk(sk); 2901 struct tcp_sock *tp = tcp_sk(sk);
2934 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && 2902 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2935 (tcp_fackets_out(tp) > tp->reordering)); 2903 (tcp_fackets_out(tp) > tp->reordering));
2904 int newly_acked_sacked = 0;
2936 int fast_rexmit = 0; 2905 int fast_rexmit = 0;
2937 2906
2938 if (WARN_ON(!tp->packets_out && tp->sacked_out)) 2907 if (WARN_ON(!tp->packets_out && tp->sacked_out))
@@ -2969,7 +2938,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2969 /* CWR is to be held something *above* high_seq 2938 /* CWR is to be held something *above* high_seq
2970 * is ACKed for CWR bit to reach receiver. */ 2939 * is ACKed for CWR bit to reach receiver. */
2971 if (tp->snd_una != tp->high_seq) { 2940 if (tp->snd_una != tp->high_seq) {
2972 tcp_complete_cwr(sk); 2941 tcp_end_cwnd_reduction(sk);
2973 tcp_set_ca_state(sk, TCP_CA_Open); 2942 tcp_set_ca_state(sk, TCP_CA_Open);
2974 } 2943 }
2975 break; 2944 break;
@@ -2979,7 +2948,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2979 tcp_reset_reno_sack(tp); 2948 tcp_reset_reno_sack(tp);
2980 if (tcp_try_undo_recovery(sk)) 2949 if (tcp_try_undo_recovery(sk))
2981 return; 2950 return;
2982 tcp_complete_cwr(sk); 2951 tcp_end_cwnd_reduction(sk);
2983 break; 2952 break;
2984 } 2953 }
2985 } 2954 }
@@ -2992,6 +2961,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2992 tcp_add_reno_sack(sk); 2961 tcp_add_reno_sack(sk);
2993 } else 2962 } else
2994 do_lost = tcp_try_undo_partial(sk, pkts_acked); 2963 do_lost = tcp_try_undo_partial(sk, pkts_acked);
2964 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
2995 break; 2965 break;
2996 case TCP_CA_Loss: 2966 case TCP_CA_Loss:
2997 if (flag & FLAG_DATA_ACKED) 2967 if (flag & FLAG_DATA_ACKED)
@@ -3013,12 +2983,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3013 if (is_dupack) 2983 if (is_dupack)
3014 tcp_add_reno_sack(sk); 2984 tcp_add_reno_sack(sk);
3015 } 2985 }
2986 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
3016 2987
3017 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 2988 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
3018 tcp_try_undo_dsack(sk); 2989 tcp_try_undo_dsack(sk);
3019 2990
3020 if (!tcp_time_to_recover(sk, flag)) { 2991 if (!tcp_time_to_recover(sk, flag)) {
3021 tcp_try_to_open(sk, flag); 2992 tcp_try_to_open(sk, flag, newly_acked_sacked);
3022 return; 2993 return;
3023 } 2994 }
3024 2995
@@ -3040,8 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3040 3011
3041 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) 3012 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
3042 tcp_update_scoreboard(sk, fast_rexmit); 3013 tcp_update_scoreboard(sk, fast_rexmit);
3043 tp->prr_delivered += newly_acked_sacked; 3014 tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
3044 tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
3045 tcp_xmit_retransmit_queue(sk); 3015 tcp_xmit_retransmit_queue(sk);
3046} 3016}
3047 3017
@@ -3120,6 +3090,12 @@ void tcp_rearm_rto(struct sock *sk)
3120{ 3090{
3121 struct tcp_sock *tp = tcp_sk(sk); 3091 struct tcp_sock *tp = tcp_sk(sk);
3122 3092
3093 /* If the retrans timer is currently being used by Fast Open
3094 * for SYN-ACK retrans purpose, stay put.
3095 */
3096 if (tp->fastopen_rsk)
3097 return;
3098
3123 if (!tp->packets_out) { 3099 if (!tp->packets_out) {
3124 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 3100 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3125 } else { 3101 } else {
@@ -3381,7 +3357,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3381{ 3357{
3382 const struct tcp_sock *tp = tcp_sk(sk); 3358 const struct tcp_sock *tp = tcp_sk(sk);
3383 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3359 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
3384 !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); 3360 !tcp_in_cwnd_reduction(sk);
3385} 3361}
3386 3362
3387/* Check that window update is acceptable. 3363/* Check that window update is acceptable.
@@ -3449,9 +3425,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
3449} 3425}
3450 3426
3451/* A conservative spurious RTO response algorithm: reduce cwnd using 3427/* A conservative spurious RTO response algorithm: reduce cwnd using
3452 * rate halving and continue in congestion avoidance. 3428 * PRR and continue in congestion avoidance.
3453 */ 3429 */
3454static void tcp_ratehalving_spur_to_response(struct sock *sk) 3430static void tcp_cwr_spur_to_response(struct sock *sk)
3455{ 3431{
3456 tcp_enter_cwr(sk, 0); 3432 tcp_enter_cwr(sk, 0);
3457} 3433}
@@ -3459,7 +3435,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk)
3459static void tcp_undo_spur_to_response(struct sock *sk, int flag) 3435static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3460{ 3436{
3461 if (flag & FLAG_ECE) 3437 if (flag & FLAG_ECE)
3462 tcp_ratehalving_spur_to_response(sk); 3438 tcp_cwr_spur_to_response(sk);
3463 else 3439 else
3464 tcp_undo_cwr(sk, true); 3440 tcp_undo_cwr(sk, true);
3465} 3441}
@@ -3566,7 +3542,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3566 tcp_conservative_spur_to_response(tp); 3542 tcp_conservative_spur_to_response(tp);
3567 break; 3543 break;
3568 default: 3544 default:
3569 tcp_ratehalving_spur_to_response(sk); 3545 tcp_cwr_spur_to_response(sk);
3570 break; 3546 break;
3571 } 3547 }
3572 tp->frto_counter = 0; 3548 tp->frto_counter = 0;
@@ -3590,7 +3566,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3590 int prior_packets; 3566 int prior_packets;
3591 int prior_sacked = tp->sacked_out; 3567 int prior_sacked = tp->sacked_out;
3592 int pkts_acked = 0; 3568 int pkts_acked = 0;
3593 int newly_acked_sacked = 0;
3594 bool frto_cwnd = false; 3569 bool frto_cwnd = false;
3595 3570
3596 /* If the ack is older than previous acks 3571 /* If the ack is older than previous acks
@@ -3666,8 +3641,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3666 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3641 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3667 3642
3668 pkts_acked = prior_packets - tp->packets_out; 3643 pkts_acked = prior_packets - tp->packets_out;
3669 newly_acked_sacked = (prior_packets - prior_sacked) -
3670 (tp->packets_out - tp->sacked_out);
3671 3644
3672 if (tp->frto_counter) 3645 if (tp->frto_counter)
3673 frto_cwnd = tcp_process_frto(sk, flag); 3646 frto_cwnd = tcp_process_frto(sk, flag);
@@ -3681,7 +3654,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3681 tcp_may_raise_cwnd(sk, flag)) 3654 tcp_may_raise_cwnd(sk, flag))
3682 tcp_cong_avoid(sk, ack, prior_in_flight); 3655 tcp_cong_avoid(sk, ack, prior_in_flight);
3683 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3656 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3684 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, 3657 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3685 is_dupack, flag); 3658 is_dupack, flag);
3686 } else { 3659 } else {
3687 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) 3660 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
@@ -3698,7 +3671,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3698no_queue: 3671no_queue:
3699 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3672 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3700 if (flag & FLAG_DSACKING_ACK) 3673 if (flag & FLAG_DSACKING_ACK)
3701 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, 3674 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3702 is_dupack, flag); 3675 is_dupack, flag);
3703 /* If this ack opens up a zero window, clear backoff. It was 3676 /* If this ack opens up a zero window, clear backoff. It was
3704 * being used to time the probes, and is probably far higher than 3677 * being used to time the probes, and is probably far higher than
@@ -3718,8 +3691,7 @@ old_ack:
3718 */ 3691 */
3719 if (TCP_SKB_CB(skb)->sacked) { 3692 if (TCP_SKB_CB(skb)->sacked) {
3720 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3693 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
3721 newly_acked_sacked = tp->sacked_out - prior_sacked; 3694 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3722 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
3723 is_dupack, flag); 3695 is_dupack, flag);
3724 } 3696 }
3725 3697
@@ -4035,7 +4007,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
4035} 4007}
4036 4008
4037/* When we get a reset we do this. */ 4009/* When we get a reset we do this. */
4038static void tcp_reset(struct sock *sk) 4010void tcp_reset(struct sock *sk)
4039{ 4011{
4040 /* We want the right error as BSD sees it (and indeed as we do). */ 4012 /* We want the right error as BSD sees it (and indeed as we do). */
4041 switch (sk->sk_state) { 4013 switch (sk->sk_state) {
@@ -4662,7 +4634,7 @@ queue_and_out:
4662 4634
4663 if (eaten > 0) 4635 if (eaten > 0)
4664 kfree_skb_partial(skb, fragstolen); 4636 kfree_skb_partial(skb, fragstolen);
4665 else if (!sock_flag(sk, SOCK_DEAD)) 4637 if (!sock_flag(sk, SOCK_DEAD))
4666 sk->sk_data_ready(sk, 0); 4638 sk->sk_data_ready(sk, 0);
4667 return; 4639 return;
4668 } 4640 }
@@ -5557,8 +5529,7 @@ no_ack:
5557#endif 5529#endif
5558 if (eaten) 5530 if (eaten)
5559 kfree_skb_partial(skb, fragstolen); 5531 kfree_skb_partial(skb, fragstolen);
5560 else 5532 sk->sk_data_ready(sk, 0);
5561 sk->sk_data_ready(sk, 0);
5562 return 0; 5533 return 0;
5563 } 5534 }
5564 } 5535 }
@@ -5742,7 +5713,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5742 5713
5743 TCP_ECN_rcv_synack(tp, th); 5714 TCP_ECN_rcv_synack(tp, th);
5744 5715
5745 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 5716 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5746 tcp_ack(sk, skb, FLAG_SLOWPATH); 5717 tcp_ack(sk, skb, FLAG_SLOWPATH);
5747 5718
5748 /* Ok.. it's good. Set up sequence numbers and 5719 /* Ok.. it's good. Set up sequence numbers and
@@ -5755,7 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5755 * never scaled. 5726 * never scaled.
5756 */ 5727 */
5757 tp->snd_wnd = ntohs(th->window); 5728 tp->snd_wnd = ntohs(th->window);
5758 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5759 5729
5760 if (!tp->rx_opt.wscale_ok) { 5730 if (!tp->rx_opt.wscale_ok) {
5761 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5731 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5893,7 +5863,9 @@ discard:
5893 tcp_send_synack(sk); 5863 tcp_send_synack(sk);
5894#if 0 5864#if 0
5895 /* Note, we could accept data and URG from this segment. 5865 /* Note, we could accept data and URG from this segment.
5896 * There are no obstacles to make this. 5866 * There are no obstacles to make this (except that we must
5867 * either change tcp_recvmsg() to prevent it from returning data
5868 * before 3WHS completes per RFC793, or employ TCP Fast Open).
5897 * 5869 *
5898 * However, if we ignore data in ACKless segments sometimes, 5870 * However, if we ignore data in ACKless segments sometimes,
5899 * we have no reasons to accept it sometimes. 5871 * we have no reasons to accept it sometimes.
@@ -5933,6 +5905,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5933{ 5905{
5934 struct tcp_sock *tp = tcp_sk(sk); 5906 struct tcp_sock *tp = tcp_sk(sk);
5935 struct inet_connection_sock *icsk = inet_csk(sk); 5907 struct inet_connection_sock *icsk = inet_csk(sk);
5908 struct request_sock *req;
5936 int queued = 0; 5909 int queued = 0;
5937 5910
5938 tp->rx_opt.saw_tstamp = 0; 5911 tp->rx_opt.saw_tstamp = 0;
@@ -5988,6 +5961,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5988 return 0; 5961 return 0;
5989 } 5962 }
5990 5963
5964 req = tp->fastopen_rsk;
5965 if (req != NULL) {
5966 BUG_ON(sk->sk_state != TCP_SYN_RECV &&
5967 sk->sk_state != TCP_FIN_WAIT1);
5968
5969 if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
5970 goto discard;
5971 }
5991 if (!tcp_validate_incoming(sk, skb, th, 0)) 5972 if (!tcp_validate_incoming(sk, skb, th, 0))
5992 return 0; 5973 return 0;
5993 5974
@@ -5998,7 +5979,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5998 switch (sk->sk_state) { 5979 switch (sk->sk_state) {
5999 case TCP_SYN_RECV: 5980 case TCP_SYN_RECV:
6000 if (acceptable) { 5981 if (acceptable) {
6001 tp->copied_seq = tp->rcv_nxt; 5982 /* Once we leave TCP_SYN_RECV, we no longer
5983 * need req so release it.
5984 */
5985 if (req) {
5986 tcp_synack_rtt_meas(sk, req);
5987 tp->total_retrans = req->retrans;
5988
5989 reqsk_fastopen_remove(sk, req, false);
5990 } else {
5991 /* Make sure socket is routed, for
5992 * correct metrics.
5993 */
5994 icsk->icsk_af_ops->rebuild_header(sk);
5995 tcp_init_congestion_control(sk);
5996
5997 tcp_mtup_init(sk);
5998 tcp_init_buffer_space(sk);
5999 tp->copied_seq = tp->rcv_nxt;
6000 }
6002 smp_mb(); 6001 smp_mb();
6003 tcp_set_state(sk, TCP_ESTABLISHED); 6002 tcp_set_state(sk, TCP_ESTABLISHED);
6004 sk->sk_state_change(sk); 6003 sk->sk_state_change(sk);
@@ -6020,23 +6019,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6020 if (tp->rx_opt.tstamp_ok) 6019 if (tp->rx_opt.tstamp_ok)
6021 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 6020 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6022 6021
6023 /* Make sure socket is routed, for 6022 if (req) {
6024 * correct metrics. 6023 /* Re-arm the timer because data may
6025 */ 6024 * have been sent out. This is similar
6026 icsk->icsk_af_ops->rebuild_header(sk); 6025 * to the regular data transmission case
6027 6026 * when new data has just been ack'ed.
6028 tcp_init_metrics(sk); 6027 *
6029 6028 * (TFO) - we could try to be more
6030 tcp_init_congestion_control(sk); 6029 * aggressive and retransmit any data
6030 * sooner based on when they were sent
6031 * out.
6032 */
6033 tcp_rearm_rto(sk);
6034 } else
6035 tcp_init_metrics(sk);
6031 6036
6032 /* Prevent spurious tcp_cwnd_restart() on 6037 /* Prevent spurious tcp_cwnd_restart() on
6033 * first data packet. 6038 * first data packet.
6034 */ 6039 */
6035 tp->lsndtime = tcp_time_stamp; 6040 tp->lsndtime = tcp_time_stamp;
6036 6041
6037 tcp_mtup_init(sk);
6038 tcp_initialize_rcv_mss(sk); 6042 tcp_initialize_rcv_mss(sk);
6039 tcp_init_buffer_space(sk);
6040 tcp_fast_path_on(tp); 6043 tcp_fast_path_on(tp);
6041 } else { 6044 } else {
6042 return 1; 6045 return 1;
@@ -6044,6 +6047,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6044 break; 6047 break;
6045 6048
6046 case TCP_FIN_WAIT1: 6049 case TCP_FIN_WAIT1:
6050 /* If we enter the TCP_FIN_WAIT1 state and we are a
6051 * Fast Open socket and this is the first acceptable
6052 * ACK we have received, this would have acknowledged
6053 * our SYNACK so stop the SYNACK timer.
6054 */
6055 if (acceptable && req != NULL) {
6056 /* We no longer need the request sock. */
6057 reqsk_fastopen_remove(sk, req, false);
6058 tcp_rearm_rto(sk);
6059 }
6047 if (tp->snd_una == tp->write_seq) { 6060 if (tp->snd_una == tp->write_seq) {
6048 struct dst_entry *dst; 6061 struct dst_entry *dst;
6049 6062
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 76782376401..75735c9a6a9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
352 const int code = icmp_hdr(icmp_skb)->code; 352 const int code = icmp_hdr(icmp_skb)->code;
353 struct sock *sk; 353 struct sock *sk;
354 struct sk_buff *skb; 354 struct sk_buff *skb;
355 struct request_sock *req;
355 __u32 seq; 356 __u32 seq;
356 __u32 remaining; 357 __u32 remaining;
357 int err; 358 int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
394 395
395 icsk = inet_csk(sk); 396 icsk = inet_csk(sk);
396 tp = tcp_sk(sk); 397 tp = tcp_sk(sk);
398 req = tp->fastopen_rsk;
397 seq = ntohl(th->seq); 399 seq = ntohl(th->seq);
398 if (sk->sk_state != TCP_LISTEN && 400 if (sk->sk_state != TCP_LISTEN &&
399 !between(seq, tp->snd_una, tp->snd_nxt)) { 401 !between(seq, tp->snd_una, tp->snd_nxt) &&
402 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
403 /* For a Fast Open socket, allow seq to be snt_isn. */
400 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 404 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
401 goto out; 405 goto out;
402 } 406 }
@@ -417,10 +421,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
417 421
418 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 422 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
419 tp->mtu_info = info; 423 tp->mtu_info = info;
420 if (!sock_owned_by_user(sk)) 424 if (!sock_owned_by_user(sk)) {
421 tcp_v4_mtu_reduced(sk); 425 tcp_v4_mtu_reduced(sk);
422 else 426 } else {
423 set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); 427 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
428 sock_hold(sk);
429 }
424 goto out; 430 goto out;
425 } 431 }
426 432
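
The test_and_set_bit() guard ensures only the first deferral takes the socket reference, keeping hold/put balanced no matter how many ICMPs arrive while the socket is user-owned. The pattern in spirit (the release side lives in tcp_release_cb(); both halves simplified to a sketch):

/* Defer side: the first setter of the flag pins the socket. */
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
	sock_hold(sk);		/* keep sk alive until release_cb runs */

/* Release side (sketch of tcp_release_cb()): clear, act, unpin. */
if (test_and_clear_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) {
	tcp_v4_mtu_reduced(sk);
	sock_put(sk);		/* drop the reference taken above */
}
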
@@ -433,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
433 !icsk->icsk_backoff) 439 !icsk->icsk_backoff)
434 break; 440 break;
435 441
442 /* XXX (TFO) - revisit the following logic for TFO */
443
436 if (sock_owned_by_user(sk)) 444 if (sock_owned_by_user(sk))
437 break; 445 break;
438 446
@@ -464,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
464 goto out; 472 goto out;
465 } 473 }
466 474
475 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
476 * than following the TCP_SYN_RECV case and closing the socket,
477 * we ignore the ICMP error and keep trying like a fully established
478 * socket. Is this the right thing to do?
479 */
480 if (req && req->sk == NULL)
481 goto out;
482
467 switch (sk->sk_state) { 483 switch (sk->sk_state) {
468 struct request_sock *req, **prev; 484 struct request_sock *req, **prev;
469 case TCP_LISTEN: 485 case TCP_LISTEN:
@@ -496,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
496 512
497 case TCP_SYN_SENT: 513 case TCP_SYN_SENT:
498 case TCP_SYN_RECV: /* Cannot happen. 514 case TCP_SYN_RECV: /* Cannot happen.
499 It can f.e. if SYNs crossed. 515 It can f.e. if SYNs crossed,
516 or Fast Open.
500 */ 517 */
501 if (!sock_owned_by_user(sk)) { 518 if (!sock_owned_by_user(sk)) {
502 sk->sk_err = err; 519 sk->sk_err = err;
@@ -807,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
807static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, 824static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
808 struct request_sock *req) 825 struct request_sock *req)
809{ 826{
810 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, 827 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
811 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, 828 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
829 */
830 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
831 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
832 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
812 req->ts_recent, 833 req->ts_recent,
813 0, 834 0,
814 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 835 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -837,7 +858,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
837 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 858 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
838 return -1; 859 return -1;
839 860
840 skb = tcp_make_synack(sk, dst, req, rvp); 861 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
841 862
842 if (skb) { 863 if (skb) {
843 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 864 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -847,6 +868,8 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
847 ireq->rmt_addr, 868 ireq->rmt_addr,
848 ireq->opt); 869 ireq->opt);
849 err = net_xmit_eval(err); 870 err = net_xmit_eval(err);
871 if (!tcp_rsk(req)->snt_synack && !err)
872 tcp_rsk(req)->snt_synack = tcp_time_stamp;
850 } 873 }
851 874
852 return err; 875 return err;
@@ -902,8 +925,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
902/* 925/*
903 * Save and compile IPv4 options into the request_sock if needed. 926 * Save and compile IPv4 options into the request_sock if needed.
904 */ 927 */
905static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, 928static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
906 struct sk_buff *skb)
907{ 929{
908 const struct ip_options *opt = &(IPCB(skb)->opt); 930 const struct ip_options *opt = &(IPCB(skb)->opt);
909 struct ip_options_rcu *dopt = NULL; 931 struct ip_options_rcu *dopt = NULL;
@@ -1270,6 +1292,182 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1270}; 1292};
1271#endif 1293#endif
1272 1294
1295static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1296 struct request_sock *req,
1297 struct tcp_fastopen_cookie *foc,
1298 struct tcp_fastopen_cookie *valid_foc)
1299{
1300 bool skip_cookie = false;
1301 struct fastopen_queue *fastopenq;
1302
1303 if (likely(!fastopen_cookie_present(foc))) {
1304 /* See include/net/tcp.h for the meaning of these knobs */
1305 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1306 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1307 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1308 skip_cookie = true; /* no cookie to validate */
1309 else
1310 return false;
1311 }
1312 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1313 /* A FO option is present; bump the counter. */
1314 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1315
1316 /* Make sure the listener has enabled fastopen, and we don't
1317 * exceed the max # of pending TFO requests allowed before trying
1318 * to validate the cookie in order to avoid burning CPU cycles
1319 * unnecessarily.
1320 *
1321 * XXX (TFO) - The implication of checking the max_qlen before
1322 * processing a cookie request is that clients can't differentiate
1323 * between qlen overflow causing Fast Open to be disabled
1324 * temporarily vs a server not supporting Fast Open at all.
1325 */
1326 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1327 fastopenq == NULL || fastopenq->max_qlen == 0)
1328 return false;
1329
1330 if (fastopenq->qlen >= fastopenq->max_qlen) {
1331 struct request_sock *req1;
1332 spin_lock(&fastopenq->lock);
1333 req1 = fastopenq->rskq_rst_head;
1334 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1335 spin_unlock(&fastopenq->lock);
1336 NET_INC_STATS_BH(sock_net(sk),
1337 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1338 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1339 foc->len = -1;
1340 return false;
1341 }
1342 fastopenq->rskq_rst_head = req1->dl_next;
1343 fastopenq->qlen--;
1344 spin_unlock(&fastopenq->lock);
1345 reqsk_free(req1);
1346 }
1347 if (skip_cookie) {
1348 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1349 return true;
1350 }
1351 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1352 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1353 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1354 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1355 memcmp(&foc->val[0], &valid_foc->val[0],
1356 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1357 return false;
1358 valid_foc->len = -1;
1359 }
1360 /* Acknowledge the data received from the peer. */
1361 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1362 return true;
1363 } else if (foc->len == 0) { /* Client requesting a cookie */
1364 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1365 NET_INC_STATS_BH(sock_net(sk),
1366 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1367 } else {
1368 /* Client sent a cookie with wrong size. Treat it
1369 * the same as invalid and return a valid one.
1370 */
1371 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1372 }
1373 return false;
1374}
1375
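
Compressing the branches above: tcp_fastopen_check() accepts data-in-SYN when the listener is in always-on test mode, when no cookie is present but the not-required knob is set and the SYN actually carries data, or when a present cookie either skips validation or matches the recomputed one. A boiled-down decision sketch using the same TFO_* bits (compile against this series' include/net/tcp.h):

/* Decision core of tcp_fastopen_check(), stripped of queue accounting. */
static bool tfo_accept_data_in_syn(int knobs, bool cookie_present,
				   bool cookie_valid, bool syn_has_data)
{
	if (knobs & TFO_SERVER_ALWAYS)
		return true;		/* testing mode: always accept */
	if (!cookie_present)
		return (knobs & TFO_SERVER_COOKIE_NOT_REQD) && syn_has_data;
	if (knobs & TFO_SERVER_COOKIE_NOT_CHKED)
		return true;		/* present but not validated */
	return cookie_valid;		/* normal stateless check */
}
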
1376static int tcp_v4_conn_req_fastopen(struct sock *sk,
1377 struct sk_buff *skb,
1378 struct sk_buff *skb_synack,
1379 struct request_sock *req,
1380 struct request_values *rvp)
1381{
1382 struct tcp_sock *tp = tcp_sk(sk);
1383 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1384 const struct inet_request_sock *ireq = inet_rsk(req);
1385 struct sock *child;
1386 int err;
1387
1388 req->retrans = 0;
1389 req->sk = NULL;
1390
1391 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1392 if (child == NULL) {
1393 NET_INC_STATS_BH(sock_net(sk),
1394 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1395 kfree_skb(skb_synack);
1396 return -1;
1397 }
1398 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1399 ireq->rmt_addr, ireq->opt);
1400 err = net_xmit_eval(err);
1401 if (!err)
1402 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1403 /* XXX (TFO) - is it ok to ignore error and continue? */
1404
1405 spin_lock(&queue->fastopenq->lock);
1406 queue->fastopenq->qlen++;
1407 spin_unlock(&queue->fastopenq->lock);
1408
1409 /* Initialize the child socket. Have to fix some values to take
1410	 * into account that the child is a Fast Open socket and is created
1411 * only out of the bits carried in the SYN packet.
1412 */
1413 tp = tcp_sk(child);
1414
1415 tp->fastopen_rsk = req;
1416	 * Do a hold on the listener sk so that if the listener is being
1417 * closed, the child that has been accepted can live on and still
1418 * access listen_lock.
1419 */
1420 sock_hold(sk);
1421 tcp_rsk(req)->listener = sk;
1422
1423 /* RFC1323: The window in SYN & SYN/ACK segments is never
1424 * scaled. So correct it appropriately.
1425 */
1426 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1427
1428 /* Activate the retrans timer so that SYNACK can be retransmitted.
1429 * The request socket is not added to the SYN table of the parent
1430 * because it's been added to the accept queue directly.
1431 */
1432 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1433 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1434
1435 /* Add the child socket directly into the accept queue */
1436 inet_csk_reqsk_queue_add(sk, req, child);
1437
1438 /* Now finish processing the fastopen child socket. */
1439 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1440 tcp_init_congestion_control(child);
1441 tcp_mtup_init(child);
1442 tcp_init_buffer_space(child);
1443 tcp_init_metrics(child);
1444
1445 /* Queue the data carried in the SYN packet. We need to first
1446 * bump skb's refcnt because the caller will attempt to free it.
1447 *
1448 * XXX (TFO) - we honor a zero-payload TFO request for now.
1449 * (Any reason not to?)
1450 */
1451 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1452 /* Don't queue the skb if there is no payload in SYN.
1453 * XXX (TFO) - How about SYN+FIN?
1454 */
1455 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1456 } else {
1457 skb = skb_get(skb);
1458 skb_dst_drop(skb);
1459 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1460 skb_set_owner_r(skb, child);
1461 __skb_queue_tail(&child->sk_receive_queue, skb);
1462 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1463 }
1464 sk->sk_data_ready(sk, 0);
1465 bh_unlock_sock(child);
1466 sock_put(child);
1467 WARN_ON(req->sk == NULL);
1468 return 0;
1469}
1470
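
The client counterpart that lands in this function is a SYN carrying data,
sent with the MSG_FASTOPEN flag from the earlier client-side TFO series. A
hedged sketch (hypothetical helper name; the flag value is defined locally in
case the libc headers lack it):

#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000	/* client-side TFO flag (Linux 3.6) */
#endif

ssize_t tfo_send(const struct sockaddr_in *dst, const void *buf, size_t len)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	/* Implicit connect: without a cached cookie the kernel falls back
	 * to a plain handshake plus a cookie request; with one, the data
	 * rides in the SYN and is queued by tcp_v4_conn_req_fastopen()
	 * on the server. fd is leaked here for brevity; a real client
	 * keeps it for the rest of the connection.
	 */
	return sendto(fd, buf, len, MSG_FASTOPEN,
		      (const struct sockaddr *)dst, sizeof(*dst));
}
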
1273int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1471int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1274{ 1472{
1275 struct tcp_extend_values tmp_ext; 1473 struct tcp_extend_values tmp_ext;
@@ -1283,6 +1481,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1283 __be32 daddr = ip_hdr(skb)->daddr; 1481 __be32 daddr = ip_hdr(skb)->daddr;
1284 __u32 isn = TCP_SKB_CB(skb)->when; 1482 __u32 isn = TCP_SKB_CB(skb)->when;
1285 bool want_cookie = false; 1483 bool want_cookie = false;
1484 struct flowi4 fl4;
1485 struct tcp_fastopen_cookie foc = { .len = -1 };
1486 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1487 struct sk_buff *skb_synack;
1488 int do_fastopen;
1286 1489
1287 /* Never answer to SYNs send to broadcast or multicast */ 1490 /* Never answer to SYNs send to broadcast or multicast */
1288 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1491 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1317,7 +1520,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1317 tcp_clear_options(&tmp_opt); 1520 tcp_clear_options(&tmp_opt);
1318 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1521 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1319 tmp_opt.user_mss = tp->rx_opt.user_mss; 1522 tmp_opt.user_mss = tp->rx_opt.user_mss;
1320 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 1523 tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1524 want_cookie ? NULL : &foc);
1321 1525
1322 if (tmp_opt.cookie_plus > 0 && 1526 if (tmp_opt.cookie_plus > 0 &&
1323 tmp_opt.saw_tstamp && 1527 tmp_opt.saw_tstamp &&
@@ -1363,7 +1567,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1363 ireq->loc_addr = daddr; 1567 ireq->loc_addr = daddr;
1364 ireq->rmt_addr = saddr; 1568 ireq->rmt_addr = saddr;
1365 ireq->no_srccheck = inet_sk(sk)->transparent; 1569 ireq->no_srccheck = inet_sk(sk)->transparent;
1366 ireq->opt = tcp_v4_save_options(sk, skb); 1570 ireq->opt = tcp_v4_save_options(skb);
1367 1571
1368 if (security_inet_conn_request(sk, skb, req)) 1572 if (security_inet_conn_request(sk, skb, req))
1369 goto drop_and_free; 1573 goto drop_and_free;
@@ -1375,8 +1579,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1579 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1376 req->cookie_ts = tmp_opt.tstamp_ok; 1580 req->cookie_ts = tmp_opt.tstamp_ok;
1377 } else if (!isn) { 1581 } else if (!isn) {
1378 struct flowi4 fl4;
1379
1380 /* VJ's idea. We save last timestamp seen 1582 /* VJ's idea. We save last timestamp seen
1381 * from the destination in peer table, when entering 1583 * from the destination in peer table, when entering
1382 * state TIME-WAIT, and check against it before 1584 * state TIME-WAIT, and check against it before
@@ -1415,16 +1617,54 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1415 isn = tcp_v4_init_sequence(skb); 1617 isn = tcp_v4_init_sequence(skb);
1416 } 1618 }
1417 tcp_rsk(req)->snt_isn = isn; 1619 tcp_rsk(req)->snt_isn = isn;
1418 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1419 1620
1420 if (tcp_v4_send_synack(sk, dst, req, 1621 if (dst == NULL) {
1421 (struct request_values *)&tmp_ext, 1622 dst = inet_csk_route_req(sk, &fl4, req);
1422 skb_get_queue_mapping(skb), 1623 if (dst == NULL)
1423 want_cookie) || 1624 goto drop_and_free;
1424 want_cookie) 1625 }
1626 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1627
1628 /* We don't call tcp_v4_send_synack() directly because we need
1629 * to make sure a child socket can be created successfully before
1630 * sending back synack!
1631 *
1632 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1633 * (or better yet, call tcp_send_synack() in the child context
1634	 * directly, but will have to fix a bunch of other code first)
1635	 * after syn_recv_sock(), except that one will need to first fix the
1636 * latter to remove its dependency on the current implementation
1637 * of tcp_v4_send_synack()->tcp_select_initial_window().
1638 */
1639 skb_synack = tcp_make_synack(sk, dst, req,
1640 (struct request_values *)&tmp_ext,
1641 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1642
1643 if (skb_synack) {
1644 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1645 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1646 } else
1647 goto drop_and_free;
1648
1649 if (likely(!do_fastopen)) {
1650 int err;
1651 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1652 ireq->rmt_addr, ireq->opt);
1653 err = net_xmit_eval(err);
1654 if (err || want_cookie)
1655 goto drop_and_free;
1656
1657 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1658 tcp_rsk(req)->listener = NULL;
1659 /* Add the request_sock to the SYN table */
1660 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1661 if (fastopen_cookie_present(&foc) && foc.len != 0)
1662 NET_INC_STATS_BH(sock_net(sk),
1663 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1664 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1665 (struct request_values *)&tmp_ext))
1425 goto drop_and_free; 1666 goto drop_and_free;
1426 1667
1427 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1428 return 0; 1668 return 0;
1429 1669
1430drop_and_release: 1670drop_and_release:
@@ -1462,6 +1702,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1462 goto exit_nonewsk; 1702 goto exit_nonewsk;
1463 1703
1464 newsk->sk_gso_type = SKB_GSO_TCPV4; 1704 newsk->sk_gso_type = SKB_GSO_TCPV4;
1705 inet_sk_rx_dst_set(newsk, skb);
1465 1706
1466 newtp = tcp_sk(newsk); 1707 newtp = tcp_sk(newsk);
1467 newinet = inet_sk(newsk); 1708 newinet = inet_sk(newsk);
@@ -1497,9 +1738,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1497 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1738 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1498 1739
1499 tcp_initialize_rcv_mss(newsk); 1740 tcp_initialize_rcv_mss(newsk);
1500 if (tcp_rsk(req)->snt_synack) 1741 tcp_synack_rtt_meas(newsk, req);
1501 tcp_valid_rtt_meas(newsk,
1502 tcp_time_stamp - tcp_rsk(req)->snt_synack);
1503 newtp->total_retrans = req->retrans; 1742 newtp->total_retrans = req->retrans;
1504 1743
1505#ifdef CONFIG_TCP_MD5SIG 1744#ifdef CONFIG_TCP_MD5SIG
@@ -1551,7 +1790,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1551 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1790 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1552 iph->saddr, iph->daddr); 1791 iph->saddr, iph->daddr);
1553 if (req) 1792 if (req)
1554 return tcp_check_req(sk, skb, req, prev); 1793 return tcp_check_req(sk, skb, req, prev, false);
1555 1794
1556 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1795 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1557 th->source, iph->daddr, th->dest, inet_iif(skb)); 1796 th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1960,20 +2199,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
1960 if (inet_csk(sk)->icsk_bind_hash) 2199 if (inet_csk(sk)->icsk_bind_hash)
1961 inet_put_port(sk); 2200 inet_put_port(sk);
1962 2201
1963 /*
1964 * If sendmsg cached page exists, toss it.
1965 */
1966 if (sk->sk_sndmsg_page) {
1967 __free_page(sk->sk_sndmsg_page);
1968 sk->sk_sndmsg_page = NULL;
1969 }
1970
1971 /* TCP Cookie Transactions */ 2202 /* TCP Cookie Transactions */
1972 if (tp->cookie_values != NULL) { 2203 if (tp->cookie_values != NULL) {
1973 kref_put(&tp->cookie_values->kref, 2204 kref_put(&tp->cookie_values->kref,
1974 tcp_cookie_values_release); 2205 tcp_cookie_values_release);
1975 tp->cookie_values = NULL; 2206 tp->cookie_values = NULL;
1976 } 2207 }
2208 BUG_ON(tp->fastopen_rsk != NULL);
1977 2209
1978 /* If socket is aborted during connect operation */ 2210 /* If socket is aborted during connect operation */
1979 tcp_free_fastopen_req(tp); 2211 tcp_free_fastopen_req(tp);
@@ -2390,10 +2622,10 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2390EXPORT_SYMBOL(tcp_proc_unregister); 2622EXPORT_SYMBOL(tcp_proc_unregister);
2391 2623
2392static void get_openreq4(const struct sock *sk, const struct request_sock *req, 2624static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2393 struct seq_file *f, int i, int uid, int *len) 2625 struct seq_file *f, int i, kuid_t uid, int *len)
2394{ 2626{
2395 const struct inet_request_sock *ireq = inet_rsk(req); 2627 const struct inet_request_sock *ireq = inet_rsk(req);
2396 int ttd = req->expires - jiffies; 2628 long delta = req->expires - jiffies;
2397 2629
2398 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2630 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2399 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", 2631 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
@@ -2405,9 +2637,9 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2405 TCP_SYN_RECV, 2637 TCP_SYN_RECV,
2406 0, 0, /* could print option size, but that is af dependent. */ 2638 0, 0, /* could print option size, but that is af dependent. */
2407 1, /* timers active (only the expire timer) */ 2639 1, /* timers active (only the expire timer) */
2408 jiffies_to_clock_t(ttd), 2640 jiffies_delta_to_clock_t(delta),
2409 req->retrans, 2641 req->retrans,
2410 uid, 2642 from_kuid_munged(seq_user_ns(f), uid),
2411 0, /* non standard timer */ 2643 0, /* non standard timer */
2412 0, /* open_requests have no inode */ 2644 0, /* open_requests have no inode */
2413 atomic_read(&sk->sk_refcnt), 2645 atomic_read(&sk->sk_refcnt),
@@ -2422,6 +2654,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2422 const struct tcp_sock *tp = tcp_sk(sk); 2654 const struct tcp_sock *tp = tcp_sk(sk);
2423 const struct inet_connection_sock *icsk = inet_csk(sk); 2655 const struct inet_connection_sock *icsk = inet_csk(sk);
2424 const struct inet_sock *inet = inet_sk(sk); 2656 const struct inet_sock *inet = inet_sk(sk);
2657 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2425 __be32 dest = inet->inet_daddr; 2658 __be32 dest = inet->inet_daddr;
2426 __be32 src = inet->inet_rcv_saddr; 2659 __be32 src = inet->inet_rcv_saddr;
2427 __u16 destp = ntohs(inet->inet_dport); 2660 __u16 destp = ntohs(inet->inet_dport);
@@ -2456,9 +2689,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2456 tp->write_seq - tp->snd_una, 2689 tp->write_seq - tp->snd_una,
2457 rx_queue, 2690 rx_queue,
2458 timer_active, 2691 timer_active,
2459 jiffies_to_clock_t(timer_expires - jiffies), 2692 jiffies_delta_to_clock_t(timer_expires - jiffies),
2460 icsk->icsk_retransmits, 2693 icsk->icsk_retransmits,
2461 sock_i_uid(sk), 2694 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2462 icsk->icsk_probes_out, 2695 icsk->icsk_probes_out,
2463 sock_i_ino(sk), 2696 sock_i_ino(sk),
2464 atomic_read(&sk->sk_refcnt), sk, 2697 atomic_read(&sk->sk_refcnt), sk,
@@ -2466,7 +2699,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2466 jiffies_to_clock_t(icsk->icsk_ack.ato), 2699 jiffies_to_clock_t(icsk->icsk_ack.ato),
2467 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2700 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2468 tp->snd_cwnd, 2701 tp->snd_cwnd,
2469 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, 2702 sk->sk_state == TCP_LISTEN ?
2703 (fastopenq ? fastopenq->max_qlen : 0) :
2704 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2470 len); 2705 len);
2471} 2706}
2472 2707
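
Since get_tcp4_sock() feeds /proc/net/tcp, the effect of the hunk above is
visible to any reader of that file: for LISTEN (state 0x0A) sockets the last
numeric column now reports fastopenq->max_qlen instead of the ssthresh slot.
A small sketch of such a reader, assuming the standard /proc/net/tcp layout:

#include <stdio.h>

int main(void)
{
	char line[512], loc[64], rem[64];
	unsigned sl, st;
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* Header line fails the scan and is skipped. */
		if (sscanf(line, "%u: %63s %63s %x",
			   &sl, loc, rem, &st) == 4 &&
		    st == 0x0a)	/* TCP_LISTEN */
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
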
@@ -2475,10 +2710,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2475{ 2710{
2476 __be32 dest, src; 2711 __be32 dest, src;
2477 __u16 destp, srcp; 2712 __u16 destp, srcp;
2478 int ttd = tw->tw_ttd - jiffies; 2713 long delta = tw->tw_ttd - jiffies;
2479
2480 if (ttd < 0)
2481 ttd = 0;
2482 2714
2483 dest = tw->tw_daddr; 2715 dest = tw->tw_daddr;
2484 src = tw->tw_rcv_saddr; 2716 src = tw->tw_rcv_saddr;
@@ -2488,7 +2720,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2488 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2720 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2489 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", 2721 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2490 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2722 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2491 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, 2723 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2492 atomic_read(&tw->tw_refcnt), tw, len); 2724 atomic_read(&tw->tw_refcnt), tw, len);
2493} 2725}
2494 2726
@@ -2571,6 +2803,8 @@ void tcp4_proc_exit(void)
2571struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2803struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2572{ 2804{
2573 const struct iphdr *iph = skb_gro_network_header(skb); 2805 const struct iphdr *iph = skb_gro_network_header(skb);
2806 __wsum wsum;
2807 __sum16 sum;
2574 2808
2575 switch (skb->ip_summed) { 2809 switch (skb->ip_summed) {
2576 case CHECKSUM_COMPLETE: 2810 case CHECKSUM_COMPLETE:
@@ -2579,11 +2813,22 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2579 skb->ip_summed = CHECKSUM_UNNECESSARY; 2813 skb->ip_summed = CHECKSUM_UNNECESSARY;
2580 break; 2814 break;
2581 } 2815 }
2582 2816flush:
2583 /* fall through */
2584 case CHECKSUM_NONE:
2585 NAPI_GRO_CB(skb)->flush = 1; 2817 NAPI_GRO_CB(skb)->flush = 1;
2586 return NULL; 2818 return NULL;
2819
2820 case CHECKSUM_NONE:
2821 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2822 skb_gro_len(skb), IPPROTO_TCP, 0);
2823 sum = csum_fold(skb_checksum(skb,
2824 skb_gro_offset(skb),
2825 skb_gro_len(skb),
2826 wsum));
2827 if (sum)
2828 goto flush;
2829
2830 skb->ip_summed = CHECKSUM_UNNECESSARY;
2831 break;
2587 } 2832 }
2588 2833
2589 return tcp_gro_receive(head, skb); 2834 return tcp_gro_receive(head, skb);
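
The CHECKSUM_NONE branch above verifies the TCP checksum over pseudo-header
plus segment and flushes on mismatch. The same ones'-complement arithmetic,
reduced to a standalone sketch (host-order inputs for brevity; the kernel's
csum_tcpudp_nofold()/csum_fold() operate on network-order fields):

#include <stdint.h>
#include <stddef.h>

static uint16_t csum_fold32(uint32_t sum)
{
	/* Fold carries back in twice; the result fits in 16 bits. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int tcp4_csum_ok(uint32_t saddr, uint32_t daddr,
		 const uint8_t *seg, size_t len)
{
	/* Pseudo-header: src, dst, zero+protocol (6 = TCP), TCP length. */
	uint32_t sum = (saddr >> 16) + (saddr & 0xffff) +
		       (daddr >> 16) + (daddr & 0xffff) +
		       6 + (uint32_t)len;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += (uint32_t)seg[len - 1] << 8;
	/* A valid segment (checksum field included) folds to zero,
	 * mirroring "if (sum) goto flush" in the hunk above.
	 */
	return csum_fold32(sum) == 0;
}
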
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0abe67bb4d3..4c752a6e0bc 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/tcp.h> 9#include <linux/tcp.h>
10#include <linux/hash.h> 10#include <linux/hash.h>
11#include <linux/tcp_metrics.h>
11 12
12#include <net/inet_connection_sock.h> 13#include <net/inet_connection_sock.h>
13#include <net/net_namespace.h> 14#include <net/net_namespace.h>
@@ -17,20 +18,10 @@
17#include <net/ipv6.h> 18#include <net/ipv6.h>
18#include <net/dst.h> 19#include <net/dst.h>
19#include <net/tcp.h> 20#include <net/tcp.h>
21#include <net/genetlink.h>
20 22
21int sysctl_tcp_nometrics_save __read_mostly; 23int sysctl_tcp_nometrics_save __read_mostly;
22 24
23enum tcp_metric_index {
24 TCP_METRIC_RTT,
25 TCP_METRIC_RTTVAR,
26 TCP_METRIC_SSTHRESH,
27 TCP_METRIC_CWND,
28 TCP_METRIC_REORDERING,
29
30 /* Always last. */
31 TCP_METRIC_MAX,
32};
33
34struct tcp_fastopen_metrics { 25struct tcp_fastopen_metrics {
35 u16 mss; 26 u16 mss;
36 u16 syn_loss:10; /* Recurring Fast Open SYN losses */ 27 u16 syn_loss:10; /* Recurring Fast Open SYN losses */
@@ -45,8 +36,10 @@ struct tcp_metrics_block {
45 u32 tcpm_ts; 36 u32 tcpm_ts;
46 u32 tcpm_ts_stamp; 37 u32 tcpm_ts_stamp;
47 u32 tcpm_lock; 38 u32 tcpm_lock;
48 u32 tcpm_vals[TCP_METRIC_MAX]; 39 u32 tcpm_vals[TCP_METRIC_MAX + 1];
49 struct tcp_fastopen_metrics tcpm_fastopen; 40 struct tcp_fastopen_metrics tcpm_fastopen;
41
42 struct rcu_head rcu_head;
50}; 43};
51 44
52static bool tcp_metric_locked(struct tcp_metrics_block *tm, 45static bool tcp_metric_locked(struct tcp_metrics_block *tm,
@@ -690,6 +683,325 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
690 rcu_read_unlock(); 683 rcu_read_unlock();
691} 684}
692 685
686static struct genl_family tcp_metrics_nl_family = {
687 .id = GENL_ID_GENERATE,
688 .hdrsize = 0,
689 .name = TCP_METRICS_GENL_NAME,
690 .version = TCP_METRICS_GENL_VERSION,
691 .maxattr = TCP_METRICS_ATTR_MAX,
692 .netnsok = true,
693};
694
695static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
696 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
697 [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
698 .len = sizeof(struct in6_addr), },
 699	/* The following attributes are not received for GET/DEL;
 700	 * we keep them for reference.
 701	 */
702#if 0
703 [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
704 [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
705 [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
706 [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
707 [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
708 [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
709 [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
710 [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
711 .len = TCP_FASTOPEN_COOKIE_MAX, },
712#endif
713};
714
715/* Add attributes, caller cancels its header on failure */
716static int tcp_metrics_fill_info(struct sk_buff *msg,
717 struct tcp_metrics_block *tm)
718{
719 struct nlattr *nest;
720 int i;
721
722 switch (tm->tcpm_addr.family) {
723 case AF_INET:
724 if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
725 tm->tcpm_addr.addr.a4) < 0)
726 goto nla_put_failure;
727 break;
728 case AF_INET6:
729 if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
730 tm->tcpm_addr.addr.a6) < 0)
731 goto nla_put_failure;
732 break;
733 default:
734 return -EAFNOSUPPORT;
735 }
736
737 if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
738 jiffies - tm->tcpm_stamp) < 0)
739 goto nla_put_failure;
740 if (tm->tcpm_ts_stamp) {
741 if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
742 (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
743 goto nla_put_failure;
744 if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
745 tm->tcpm_ts) < 0)
746 goto nla_put_failure;
747 }
748
749 {
750 int n = 0;
751
752 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
753 if (!nest)
754 goto nla_put_failure;
755 for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
756 if (!tm->tcpm_vals[i])
757 continue;
758 if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
759 goto nla_put_failure;
760 n++;
761 }
762 if (n)
763 nla_nest_end(msg, nest);
764 else
765 nla_nest_cancel(msg, nest);
766 }
767
768 {
769 struct tcp_fastopen_metrics tfom_copy[1], *tfom;
770 unsigned int seq;
771
772 do {
773 seq = read_seqbegin(&fastopen_seqlock);
774 tfom_copy[0] = tm->tcpm_fastopen;
775 } while (read_seqretry(&fastopen_seqlock, seq));
776
777 tfom = tfom_copy;
778 if (tfom->mss &&
779 nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
780 tfom->mss) < 0)
781 goto nla_put_failure;
782 if (tfom->syn_loss &&
783 (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
784 tfom->syn_loss) < 0 ||
785 nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
786 jiffies - tfom->last_syn_loss) < 0))
787 goto nla_put_failure;
788 if (tfom->cookie.len > 0 &&
789 nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
790 tfom->cookie.len, tfom->cookie.val) < 0)
791 goto nla_put_failure;
792 }
793
794 return 0;
795
796nla_put_failure:
797 return -EMSGSIZE;
798}
799
800static int tcp_metrics_dump_info(struct sk_buff *skb,
801 struct netlink_callback *cb,
802 struct tcp_metrics_block *tm)
803{
804 void *hdr;
805
806 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
807 &tcp_metrics_nl_family, NLM_F_MULTI,
808 TCP_METRICS_CMD_GET);
809 if (!hdr)
810 return -EMSGSIZE;
811
812 if (tcp_metrics_fill_info(skb, tm) < 0)
813 goto nla_put_failure;
814
815 return genlmsg_end(skb, hdr);
816
817nla_put_failure:
818 genlmsg_cancel(skb, hdr);
819 return -EMSGSIZE;
820}
821
822static int tcp_metrics_nl_dump(struct sk_buff *skb,
823 struct netlink_callback *cb)
824{
825 struct net *net = sock_net(skb->sk);
826 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
827 unsigned int row, s_row = cb->args[0];
828 int s_col = cb->args[1], col = s_col;
829
830 for (row = s_row; row < max_rows; row++, s_col = 0) {
831 struct tcp_metrics_block *tm;
832 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
833
834 rcu_read_lock();
835 for (col = 0, tm = rcu_dereference(hb->chain); tm;
836 tm = rcu_dereference(tm->tcpm_next), col++) {
837 if (col < s_col)
838 continue;
839 if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
840 rcu_read_unlock();
841 goto done;
842 }
843 }
844 rcu_read_unlock();
845 }
846
847done:
848 cb->args[0] = row;
849 cb->args[1] = col;
850 return skb->len;
851}
852
853static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
854 unsigned int *hash, int optional)
855{
856 struct nlattr *a;
857
858 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
859 if (a) {
860 addr->family = AF_INET;
861 addr->addr.a4 = nla_get_be32(a);
862 *hash = (__force unsigned int) addr->addr.a4;
863 return 0;
864 }
865 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
866 if (a) {
 867		if (nla_len(a) != sizeof(struct in6_addr))
868 return -EINVAL;
869 addr->family = AF_INET6;
870 memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
871 *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
872 return 0;
873 }
874 return optional ? 1 : -EAFNOSUPPORT;
875}
876
877static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
878{
879 struct tcp_metrics_block *tm;
880 struct inetpeer_addr addr;
881 unsigned int hash;
882 struct sk_buff *msg;
883 struct net *net = genl_info_net(info);
884 void *reply;
885 int ret;
886
887 ret = parse_nl_addr(info, &addr, &hash, 0);
888 if (ret < 0)
889 return ret;
890
891 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
892 if (!msg)
893 return -ENOMEM;
894
895 reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
896 info->genlhdr->cmd);
897 if (!reply)
898 goto nla_put_failure;
899
900 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
901 ret = -ESRCH;
902 rcu_read_lock();
903 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
904 tm = rcu_dereference(tm->tcpm_next)) {
905 if (addr_same(&tm->tcpm_addr, &addr)) {
906 ret = tcp_metrics_fill_info(msg, tm);
907 break;
908 }
909 }
910 rcu_read_unlock();
911 if (ret < 0)
912 goto out_free;
913
914 genlmsg_end(msg, reply);
915 return genlmsg_reply(msg, info);
916
917nla_put_failure:
918 ret = -EMSGSIZE;
919
920out_free:
921 nlmsg_free(msg);
922 return ret;
923}
924
925#define deref_locked_genl(p) \
926 rcu_dereference_protected(p, lockdep_genl_is_held() && \
927 lockdep_is_held(&tcp_metrics_lock))
928
929#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
930
931static int tcp_metrics_flush_all(struct net *net)
932{
933 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
934 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
935 struct tcp_metrics_block *tm;
936 unsigned int row;
937
938 for (row = 0; row < max_rows; row++, hb++) {
939 spin_lock_bh(&tcp_metrics_lock);
940 tm = deref_locked_genl(hb->chain);
941 if (tm)
942 hb->chain = NULL;
943 spin_unlock_bh(&tcp_metrics_lock);
944 while (tm) {
945 struct tcp_metrics_block *next;
946
947 next = deref_genl(tm->tcpm_next);
948 kfree_rcu(tm, rcu_head);
949 tm = next;
950 }
951 }
952 return 0;
953}
954
955static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
956{
957 struct tcpm_hash_bucket *hb;
958 struct tcp_metrics_block *tm;
959 struct tcp_metrics_block __rcu **pp;
960 struct inetpeer_addr addr;
961 unsigned int hash;
962 struct net *net = genl_info_net(info);
963 int ret;
964
965 ret = parse_nl_addr(info, &addr, &hash, 1);
966 if (ret < 0)
967 return ret;
968 if (ret > 0)
969 return tcp_metrics_flush_all(net);
970
971 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
972 hb = net->ipv4.tcp_metrics_hash + hash;
973 pp = &hb->chain;
974 spin_lock_bh(&tcp_metrics_lock);
975 for (tm = deref_locked_genl(*pp); tm;
976 pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
977 if (addr_same(&tm->tcpm_addr, &addr)) {
978 *pp = tm->tcpm_next;
979 break;
980 }
981 }
982 spin_unlock_bh(&tcp_metrics_lock);
983 if (!tm)
984 return -ESRCH;
985 kfree_rcu(tm, rcu_head);
986 return 0;
987}
988
989static struct genl_ops tcp_metrics_nl_ops[] = {
990 {
991 .cmd = TCP_METRICS_CMD_GET,
992 .doit = tcp_metrics_nl_cmd_get,
993 .dumpit = tcp_metrics_nl_dump,
994 .policy = tcp_metrics_nl_policy,
995 .flags = GENL_ADMIN_PERM,
996 },
997 {
998 .cmd = TCP_METRICS_CMD_DEL,
999 .doit = tcp_metrics_nl_cmd_del,
1000 .policy = tcp_metrics_nl_policy,
1001 .flags = GENL_ADMIN_PERM,
1002 },
1003};
1004
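
A userspace peer of this family would resolve "tcp_metrics" over generic
netlink and issue a dumped TCP_METRICS_CMD_GET. A sketch assuming libnl-genl-3
and the new <linux/tcp_metrics.h> UAPI header; attribute parsing and error
unwinding are trimmed, and replies carry the attributes emitted by
tcp_metrics_fill_info() above:

#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/tcp_metrics.h>

int dump_tcp_metrics(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int fam, err;

	if (!sk || genl_connect(sk) < 0)
		return -1;
	fam = genl_ctrl_resolve(sk, TCP_METRICS_GENL_NAME);
	if (fam < 0)
		return fam;
	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam, 0,
		    NLM_F_DUMP, TCP_METRICS_CMD_GET,
		    TCP_METRICS_GENL_VERSION);
	/* GENL_ADMIN_PERM on the ops: needs CAP_NET_ADMIN. */
	err = nl_send_auto(sk, msg);
	if (err >= 0)
		err = nl_recvmsgs_default(sk);	/* default handler discards */
	nlmsg_free(msg);
	nl_socket_free(sk);
	return err;
}
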
693static unsigned int tcpmhash_entries; 1005static unsigned int tcpmhash_entries;
694static int __init set_tcpmhash_entries(char *str) 1006static int __init set_tcpmhash_entries(char *str)
695{ 1007{
@@ -753,5 +1065,21 @@ static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
753 1065
754void __init tcp_metrics_init(void) 1066void __init tcp_metrics_init(void)
755{ 1067{
756 register_pernet_subsys(&tcp_net_metrics_ops); 1068 int ret;
1069
1070 ret = register_pernet_subsys(&tcp_net_metrics_ops);
1071 if (ret < 0)
1072 goto cleanup;
1073 ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
1074 tcp_metrics_nl_ops,
1075 ARRAY_SIZE(tcp_metrics_nl_ops));
1076 if (ret < 0)
1077 goto cleanup_subsys;
1078 return;
1079
1080cleanup_subsys:
1081 unregister_pernet_subsys(&tcp_net_metrics_ops);
1082
1083cleanup:
1084 return;
757} 1085}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d9c9dcef2de..27536ba16c9 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -85,6 +85,8 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
85 * spinlock it. I do not want! Well, probability of misbehaviour 85 * spinlock it. I do not want! Well, probability of misbehaviour
86 * is ridiculously low and, seems, we could use some mb() tricks 86 * is ridiculously low and, seems, we could use some mb() tricks
87 * to avoid misread sequence numbers, states etc. --ANK 87 * to avoid misread sequence numbers, states etc. --ANK
88 *
 89 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
88 */ 90 */
89enum tcp_tw_status 91enum tcp_tw_status
90tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 92tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
@@ -387,8 +389,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
387 struct tcp_sock *oldtp = tcp_sk(sk); 389 struct tcp_sock *oldtp = tcp_sk(sk);
388 struct tcp_cookie_values *oldcvp = oldtp->cookie_values; 390 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
389 391
390 newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb);
391
392 /* TCP Cookie Transactions require space for the cookie pair, 392 /* TCP Cookie Transactions require space for the cookie pair,
393 * as it differs for each connection. There is no need to 393 * as it differs for each connection. There is no need to
394 * copy any s_data_payload stored at the original socket. 394 * copy any s_data_payload stored at the original socket.
@@ -509,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
509 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 509 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
510 newtp->rx_opt.mss_clamp = req->mss; 510 newtp->rx_opt.mss_clamp = req->mss;
511 TCP_ECN_openreq_child(newtp, req); 511 TCP_ECN_openreq_child(newtp, req);
512 newtp->fastopen_rsk = NULL;
512 513
513 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 514 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
514 } 515 }
@@ -517,13 +518,20 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
517EXPORT_SYMBOL(tcp_create_openreq_child); 518EXPORT_SYMBOL(tcp_create_openreq_child);
518 519
519/* 520/*
520 * Process an incoming packet for SYN_RECV sockets represented 521 * Process an incoming packet for SYN_RECV sockets represented as a
521 * as a request_sock. 522 * request_sock. Normally sk is the listener socket but for TFO it
523 * points to the child socket.
524 *
525 * XXX (TFO) - The current impl contains a special check for ack
526 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
527 *
528 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
522 */ 529 */
523 530
524struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 531struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
525 struct request_sock *req, 532 struct request_sock *req,
526 struct request_sock **prev) 533 struct request_sock **prev,
534 bool fastopen)
527{ 535{
528 struct tcp_options_received tmp_opt; 536 struct tcp_options_received tmp_opt;
529 const u8 *hash_location; 537 const u8 *hash_location;
@@ -532,6 +540,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
532 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 540 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
533 bool paws_reject = false; 541 bool paws_reject = false;
534 542
543 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
544
535 tmp_opt.saw_tstamp = 0; 545 tmp_opt.saw_tstamp = 0;
536 if (th->doff > (sizeof(struct tcphdr)>>2)) { 546 if (th->doff > (sizeof(struct tcphdr)>>2)) {
537 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 547 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -567,6 +577,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
567 * 577 *
568 * Enforce "SYN-ACK" according to figure 8, figure 6 578 * Enforce "SYN-ACK" according to figure 8, figure 6
569 * of RFC793, fixed by RFC1122. 579 * of RFC793, fixed by RFC1122.
580 *
581 * Note that even if there is new data in the SYN packet
 582	 * it will be thrown away too.
570 */ 583 */
571 req->rsk_ops->rtx_syn_ack(sk, req, NULL); 584 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
572 return NULL; 585 return NULL;
@@ -624,9 +637,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
624 * sent (the segment carries an unacceptable ACK) ... 637 * sent (the segment carries an unacceptable ACK) ...
625 * a reset is sent." 638 * a reset is sent."
626 * 639 *
627 * Invalid ACK: reset will be sent by listening socket 640 * Invalid ACK: reset will be sent by listening socket.
641 * Note that the ACK validity check for a Fast Open socket is done
642 * elsewhere and is checked directly against the child socket rather
643 * than req because user data may have been sent out.
628 */ 644 */
629 if ((flg & TCP_FLAG_ACK) && 645 if ((flg & TCP_FLAG_ACK) && !fastopen &&
630 (TCP_SKB_CB(skb)->ack_seq != 646 (TCP_SKB_CB(skb)->ack_seq !=
631 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 647 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
632 return sk; 648 return sk;
@@ -639,7 +655,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
639 /* RFC793: "first check sequence number". */ 655 /* RFC793: "first check sequence number". */
640 656
641 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 657 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
642 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 658 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
643 /* Out of window: send ACK and drop. */ 659 /* Out of window: send ACK and drop. */
644 if (!(flg & TCP_FLAG_RST)) 660 if (!(flg & TCP_FLAG_RST))
645 req->rsk_ops->send_ack(sk, skb, req); 661 req->rsk_ops->send_ack(sk, skb, req);
@@ -650,7 +666,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
650 666
651 /* In sequence, PAWS is OK. */ 667 /* In sequence, PAWS is OK. */
652 668
653 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 669 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
654 req->ts_recent = tmp_opt.rcv_tsval; 670 req->ts_recent = tmp_opt.rcv_tsval;
655 671
656 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 672 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -669,10 +685,25 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
669 685
670 /* ACK sequence verified above, just make sure ACK is 686 /* ACK sequence verified above, just make sure ACK is
671 * set. If ACK not set, just silently drop the packet. 687 * set. If ACK not set, just silently drop the packet.
688 *
689 * XXX (TFO) - if we ever allow "data after SYN", the
690 * following check needs to be removed.
672 */ 691 */
673 if (!(flg & TCP_FLAG_ACK)) 692 if (!(flg & TCP_FLAG_ACK))
674 return NULL; 693 return NULL;
675 694
695 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
696 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
697 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
698 else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
699 tcp_rsk(req)->snt_synack = 0;
700
701 /* For Fast Open no more processing is needed (sk is the
702 * child socket).
703 */
704 if (fastopen)
705 return sk;
706
676 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 707 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
677 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 708 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
678 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 709 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -680,10 +711,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
680 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); 711 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
681 return NULL; 712 return NULL;
682 } 713 }
683 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
684 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
685 else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
686 tcp_rsk(req)->snt_synack = 0;
687 714
688 /* OK, ACK is valid, create big socket and 715 /* OK, ACK is valid, create big socket and
689 * feed this segment to it. It will repeat all 716 * feed this segment to it. It will repeat all
@@ -708,11 +735,21 @@ listen_overflow:
708 } 735 }
709 736
710embryonic_reset: 737embryonic_reset:
711 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 738 if (!(flg & TCP_FLAG_RST)) {
712	if (!(flg & TCP_FLAG_RST))	738	if (!(flg & TCP_FLAG_RST)) { 739		/* Received a bad SYN pkt - for TFO we try not to reset
 740		 * the local connection unless it's really necessary, to
 741		 * avoid becoming vulnerable to an outside attack aiming at
742 * resetting legit local connections.
743 */
713 req->rsk_ops->send_reset(sk, skb); 744 req->rsk_ops->send_reset(sk, skb);
714 745 } else if (fastopen) { /* received a valid RST pkt */
715 inet_csk_reqsk_queue_drop(sk, req, prev); 746 reqsk_fastopen_remove(sk, req, true);
747 tcp_reset(sk);
748 }
749 if (!fastopen) {
750 inet_csk_reqsk_queue_drop(sk, req, prev);
751 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
752 }
716 return NULL; 753 return NULL;
717} 754}
718EXPORT_SYMBOL(tcp_check_req); 755EXPORT_SYMBOL(tcp_check_req);
@@ -721,6 +758,12 @@ EXPORT_SYMBOL(tcp_check_req);
721 * Queue segment on the new socket if the new socket is active, 758 * Queue segment on the new socket if the new socket is active,
722 * otherwise we just shortcircuit this and continue with 759 * otherwise we just shortcircuit this and continue with
723 * the new socket. 760 * the new socket.
761 *
762 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
763 * when entering. But other states are possible due to a race condition
764 * where after __inet_lookup_established() fails but before the listener
765 * locked is obtained, other packets cause the same connection to
766 * be created.
724 */ 767 */
725 768
726int tcp_child_process(struct sock *parent, struct sock *child, 769int tcp_child_process(struct sock *parent, struct sock *child,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20dfd892c86..cfe6ffe1c17 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
702 unsigned int mss, struct sk_buff *skb, 702 unsigned int mss, struct sk_buff *skb,
703 struct tcp_out_options *opts, 703 struct tcp_out_options *opts,
704 struct tcp_md5sig_key **md5, 704 struct tcp_md5sig_key **md5,
705 struct tcp_extend_values *xvp) 705 struct tcp_extend_values *xvp,
706 struct tcp_fastopen_cookie *foc)
706{ 707{
707 struct inet_request_sock *ireq = inet_rsk(req); 708 struct inet_request_sock *ireq = inet_rsk(req);
708 unsigned int remaining = MAX_TCP_OPTION_SPACE; 709 unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
747 if (unlikely(!ireq->tstamp_ok)) 748 if (unlikely(!ireq->tstamp_ok))
748 remaining -= TCPOLEN_SACKPERM_ALIGNED; 749 remaining -= TCPOLEN_SACKPERM_ALIGNED;
749 } 750 }
750 751 if (foc != NULL) {
752 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
753 need = (need + 3) & ~3U; /* Align to 32 bits */
754 if (remaining >= need) {
755 opts->options |= OPTION_FAST_OPEN_COOKIE;
756 opts->fastopen_cookie = foc;
757 remaining -= need;
758 }
759 }
751 /* Similar rationale to tcp_syn_options() applies here, too. 760 /* Similar rationale to tcp_syn_options() applies here, too.
752 * If the <SYN> options fit, the same options should fit now! 761 * If the <SYN> options fit, the same options should fit now!
753 */ 762 */
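
The "(need + 3) & ~3U" above pads the experimental Fast Open option to a
whole 32-bit word, since TCP option space is laid out in 4-byte units.
Checked in isolation (TCPOLEN_EXP_FASTOPEN_BASE is the 4-byte
kind/length/magic header):

#include <assert.h>

static unsigned round_up4(unsigned need)
{
	return (need + 3) & ~3u;
}

int main(void)
{
	assert(round_up4(4 + 8) == 12);	/* 8-byte cookie fits exactly */
	assert(round_up4(4 + 5) == 12);	/* odd lengths get padded */
	assert(round_up4(4 + 0) == 4);	/* bare cookie request */
	return 0;
}
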
@@ -910,14 +919,18 @@ void tcp_release_cb(struct sock *sk)
910 if (flags & (1UL << TCP_TSQ_DEFERRED)) 919 if (flags & (1UL << TCP_TSQ_DEFERRED))
911 tcp_tsq_handler(sk); 920 tcp_tsq_handler(sk);
912 921
913 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) 922 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
914 tcp_write_timer_handler(sk); 923 tcp_write_timer_handler(sk);
915 924 __sock_put(sk);
916 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) 925 }
926 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
917 tcp_delack_timer_handler(sk); 927 tcp_delack_timer_handler(sk);
918 928 __sock_put(sk);
919 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) 929 }
930 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
920 sk->sk_prot->mtu_reduced(sk); 931 sk->sk_prot->mtu_reduced(sk);
932 __sock_put(sk);
933 }
921} 934}
922EXPORT_SYMBOL(tcp_release_cb); 935EXPORT_SYMBOL(tcp_release_cb);
923 936
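
The rule these tcp_release_cb() hunks (and the matching timer hunks below)
enforce: the path that defers work by setting a tsq flag takes a socket
reference, and the deferred handler drops it, so the sock cannot be freed
with work pending. The same pairing in miniature, as a generic C11 sketch
(not kernel code):

#include <stdatomic.h>

struct obj {
	atomic_int refs;
	atomic_uint flags;
};

#define WORK_DEFERRED (1u << 0)

static void defer_work(struct obj *o)
{
	/* Only the first deferral takes the reference, mirroring
	 * test_and_set_bit() + sock_hold() in tcp_write_timer().
	 */
	if (!(atomic_fetch_or(&o->flags, WORK_DEFERRED) & WORK_DEFERRED))
		atomic_fetch_add(&o->refs, 1);
}

static void run_deferred(struct obj *o, void (*handler)(struct obj *))
{
	unsigned flags = atomic_exchange(&o->flags, 0);

	if (flags & WORK_DEFERRED) {
		handler(o);
		atomic_fetch_sub(&o->refs, 1);	/* __sock_put() analogue */
	}
}
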
@@ -2024,10 +2037,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2024 if (push_one) 2037 if (push_one)
2025 break; 2038 break;
2026 } 2039 }
2027 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
2028 tp->prr_out += sent_pkts;
2029 2040
2030 if (likely(sent_pkts)) { 2041 if (likely(sent_pkts)) {
2042 if (tcp_in_cwnd_reduction(sk))
2043 tp->prr_out += sent_pkts;
2031 tcp_cwnd_validate(sk); 2044 tcp_cwnd_validate(sk);
2032 return false; 2045 return false;
2033 } 2046 }
@@ -2529,7 +2542,7 @@ begin_fwd:
2529 } 2542 }
2530 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2543 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2531 2544
2532 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery) 2545 if (tcp_in_cwnd_reduction(sk))
2533 tp->prr_out += tcp_skb_pcount(skb); 2546 tp->prr_out += tcp_skb_pcount(skb);
2534 2547
2535 if (skb == tcp_write_queue_head(sk)) 2548 if (skb == tcp_write_queue_head(sk))
@@ -2654,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
2654 */ 2667 */
2655struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2668struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2656 struct request_sock *req, 2669 struct request_sock *req,
2657 struct request_values *rvp) 2670 struct request_values *rvp,
2671 struct tcp_fastopen_cookie *foc)
2658{ 2672{
2659 struct tcp_out_options opts; 2673 struct tcp_out_options opts;
2660 struct tcp_extend_values *xvp = tcp_xv(rvp); 2674 struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2714,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2714#endif 2728#endif
2715 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2729 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2716 tcp_header_size = tcp_synack_options(sk, req, mss, 2730 tcp_header_size = tcp_synack_options(sk, req, mss,
2717 skb, &opts, &md5, xvp) 2731 skb, &opts, &md5, xvp, foc)
2718 + sizeof(*th); 2732 + sizeof(*th);
2719 2733
2720 skb_push(skb, tcp_header_size); 2734 skb_push(skb, tcp_header_size);
@@ -2768,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2768 } 2782 }
2769 2783
2770 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2784 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2771 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); 2785 /* XXX data is queued and acked as is. No buffer/window check */
2786 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2772 2787
2773 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2788 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2774 th->window = htons(min(req->rcv_wnd, 65535U)); 2789 th->window = htons(min(req->rcv_wnd, 65535U));
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6df36ad55a3..fc04711e80c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -252,7 +252,8 @@ static void tcp_delack_timer(unsigned long data)
252 inet_csk(sk)->icsk_ack.blocked = 1; 252 inet_csk(sk)->icsk_ack.blocked = 1;
253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); 253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
254		/* delegate our work to tcp_release_cb() */	254		/* delegate our work to tcp_release_cb() */
255 set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); 255 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
256 sock_hold(sk);
256 } 257 }
257 bh_unlock_sock(sk); 258 bh_unlock_sock(sk);
258 sock_put(sk); 259 sock_put(sk);
@@ -304,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk)
304} 305}
305 306
306/* 307/*
308 * Timer for Fast Open socket to retransmit SYNACK. Note that the
309 * sk here is the child socket, not the parent (listener) socket.
310 */
311static void tcp_fastopen_synack_timer(struct sock *sk)
312{
313 struct inet_connection_sock *icsk = inet_csk(sk);
314 int max_retries = icsk->icsk_syn_retries ? :
315 sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
316 struct request_sock *req;
317
318 req = tcp_sk(sk)->fastopen_rsk;
319 req->rsk_ops->syn_ack_timeout(sk, req);
320
321 if (req->retrans >= max_retries) {
322 tcp_write_err(sk);
323 return;
324 }
325 /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
 326	 * returned from rtx_syn_ack() to make it more persistent, like
 327	 * a regular retransmit, because if the child socket has been accepted
328 * it's not good to give up too easily.
329 */
330 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
331 req->retrans++;
332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
333 TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
334}
335
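
The retransmit schedule this timer produces is plain exponential backoff from
TCP_TIMEOUT_INIT, with inet_csk_reset_xmit_timer() clamping at TCP_RTO_MAX.
A standalone sketch of the arithmetic (jiffy values assume HZ=1000, purely
for illustration):

#include <stdio.h>

#define TCP_TIMEOUT_INIT 1000u		/* 1 s in jiffies at HZ=1000 */
#define TCP_RTO_MAX	 120000u	/* 120 s */

int main(void)
{
	unsigned retrans, when;

	for (retrans = 0; retrans <= 6; retrans++) {
		when = TCP_TIMEOUT_INIT << retrans;
		if (when > TCP_RTO_MAX)	/* clamp applied by the timer core */
			when = TCP_RTO_MAX;
		printf("retrans=%u timeout=%ums\n", retrans, when);
	}
	return 0;
}
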
336/*
307 * The TCP retransmit timer. 337 * The TCP retransmit timer.
308 */ 338 */
309 339
@@ -316,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
316 tcp_resume_early_retransmit(sk); 346 tcp_resume_early_retransmit(sk);
317 return; 347 return;
318 } 348 }
319 349 if (tp->fastopen_rsk) {
350 BUG_ON(sk->sk_state != TCP_SYN_RECV &&
351 sk->sk_state != TCP_FIN_WAIT1);
352 tcp_fastopen_synack_timer(sk);
353 /* Before we receive ACK to our SYN-ACK don't retransmit
354 * anything else (e.g., data or FIN segments).
355 */
356 return;
357 }
320 if (!tp->packets_out) 358 if (!tp->packets_out)
321 goto out; 359 goto out;
322 360
@@ -481,7 +519,8 @@ static void tcp_write_timer(unsigned long data)
481 tcp_write_timer_handler(sk); 519 tcp_write_timer_handler(sk);
482 } else { 520 } else {
483		/* delegate our work to tcp_release_cb() */	521		/* delegate our work to tcp_release_cb() */
484 set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); 522 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
523 sock_hold(sk);
485 } 524 }
486 bh_unlock_sock(sk); 525 bh_unlock_sock(sk);
487 sock_put(sk); 526 sock_put(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6f6d1aca3c3..79c8dbe59b5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1226,6 +1226,11 @@ try_again:
1226 1226
1227 if (unlikely(err)) { 1227 if (unlikely(err)) {
1228 trace_kfree_skb(skb, udp_recvmsg); 1228 trace_kfree_skb(skb, udp_recvmsg);
1229 if (!peeked) {
1230 atomic_inc(&sk->sk_drops);
1231 UDP_INC_STATS_USER(sock_net(sk),
1232 UDP_MIB_INERRORS, is_udplite);
1233 }
1229 goto out_free; 1234 goto out_free;
1230 } 1235 }
1231 1236
@@ -2110,7 +2115,9 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2110 bucket, src, srcp, dest, destp, sp->sk_state, 2115 bucket, src, srcp, dest, destp, sp->sk_state,
2111 sk_wmem_alloc_get(sp), 2116 sk_wmem_alloc_get(sp),
2112 sk_rmem_alloc_get(sp), 2117 sk_rmem_alloc_get(sp),
2113 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), 2118 0, 0L, 0,
2119 from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
2120 0, sock_i_ino(sp),
2114 atomic_read(&sp->sk_refcnt), sp, 2121 atomic_read(&sp->sk_refcnt), sp,
2115 atomic_read(&sp->sk_drops), len); 2122 atomic_read(&sp->sk_drops), len);
2116} 2123}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 16d0960062b..505b30ad918 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -24,7 +24,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
24 if (!inet_diag_bc_sk(bc, sk)) 24 if (!inet_diag_bc_sk(bc, sk))
25 return 0; 25 return 0;
26 26
27 return inet_sk_diag_fill(sk, NULL, skb, req, NETLINK_CB(cb->skb).pid, 27 return inet_sk_diag_fill(sk, NULL, skb, req,
28 sk_user_ns(NETLINK_CB(cb->skb).ssk),
29 NETLINK_CB(cb->skb).portid,
28 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 30 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
29} 31}
30 32
@@ -69,14 +71,15 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
69 goto out; 71 goto out;
70 72
71 err = inet_sk_diag_fill(sk, NULL, rep, req, 73 err = inet_sk_diag_fill(sk, NULL, rep, req,
72 NETLINK_CB(in_skb).pid, 74 sk_user_ns(NETLINK_CB(in_skb).ssk),
75 NETLINK_CB(in_skb).portid,
73 nlh->nlmsg_seq, 0, nlh); 76 nlh->nlmsg_seq, 0, nlh);
74 if (err < 0) { 77 if (err < 0) {
75 WARN_ON(err == -EMSGSIZE); 78 WARN_ON(err == -EMSGSIZE);
76 kfree_skb(rep); 79 kfree_skb(rep);
77 goto out; 80 goto out;
78 } 81 }
79 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, 82 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
80 MSG_DONTWAIT); 83 MSG_DONTWAIT);
81 if (err > 0) 84 if (err > 0)
82 err = 0; 85 err = 0;