Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c                             |  27
-rw-r--r--  net/ipv4/devinet.c                             |  67
-rw-r--r--  net/ipv4/fib_frontend.c                        |  25
-rw-r--r--  net/ipv4/fib_semantics.c                       |   8
-rw-r--r--  net/ipv4/fib_trie.c                            |  15
-rw-r--r--  net/ipv4/igmp.c                                |  38
-rw-r--r--  net/ipv4/inet_connection_sock.c                |  57
-rw-r--r--  net/ipv4/inet_diag.c                           |  32
-rw-r--r--  net/ipv4/inet_fragment.c                       |   9
-rw-r--r--  net/ipv4/ip_fragment.c                         |  13
-rw-r--r--  net/ipv4/ip_gre.c                              | 128
-rw-r--r--  net/ipv4/ip_output.c                           |  74
-rw-r--r--  net/ipv4/ip_vti.c                              |   5
-rw-r--r--  net/ipv4/ipconfig.c                            |  43
-rw-r--r--  net/ipv4/ipip.c                                |  51
-rw-r--r--  net/ipv4/ipmr.c                                |  12
-rw-r--r--  net/ipv4/netfilter.c                           |  41
-rw-r--r--  net/ipv4/netfilter/Kconfig                     |  90
-rw-r--r--  net/ipv4/netfilter/Makefile                    |  18
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c            |  18
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c                |  98
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c              | 110
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c                  |   3
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c              |   2
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c            |  10
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c            |  10
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c (renamed from net/ipv4/netfilter/nf_nat_standalone.c) | 264
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c               |  10
-rw-r--r--  net/ipv4/netfilter/iptable_security.c          |   5
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |   8
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c             |  85
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c               | 763
-rw-r--r--  net/ipv4/netfilter/nf_nat_ftp.c                | 137
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c               |  71
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c             | 458
-rw-r--r--  net/ipv4/netfilter/nf_nat_irc.c                |  99
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c       | 281
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c               |  21
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_common.c       | 114
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_dccp.c         | 106
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c          |  30
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c         |  24
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_sctp.c         |  96
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_tcp.c          |  91
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udp.c          |  82
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udplite.c      |  98
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_unknown.c      |  52
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c               | 214
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c                | 572
-rw-r--r--  net/ipv4/netfilter/nf_nat_tftp.c               |  51
-rw-r--r--  net/ipv4/proc.c                                |   4
-rw-r--r--  net/ipv4/route.c                               |  30
-rw-r--r--  net/ipv4/syncookies.c                          |   1
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                     |  45
-rw-r--r--  net/ipv4/tcp.c                                 | 135
-rw-r--r--  net/ipv4/tcp_fastopen.c                        |  83
-rw-r--r--  net/ipv4/tcp_input.c                           | 281
-rw-r--r--  net/ipv4/tcp_ipv4.c                            | 326
-rw-r--r--  net/ipv4/tcp_metrics.c                         | 354
-rw-r--r--  net/ipv4/tcp_minisocks.c                       |  75
-rw-r--r--  net/ipv4/tcp_output.c                          |  27
-rw-r--r--  net/ipv4/tcp_timer.c                           |  39
-rw-r--r--  net/ipv4/udp_diag.c                            |   6
63 files changed, 2021 insertions(+), 4121 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index fe4582ca969a..766c59658563 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -212,6 +212,26 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
+		/* Check special setups for testing purpose to enable TFO w/o
+		 * requiring TCP_FASTOPEN sockopt.
+		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+		 * Also fastopenq may already been allocated because this
+		 * socket was in TCP_LISTEN state previously but was
+		 * shutdown() (rather than close()).
+		 */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+				err = fastopen_init_queue(sk, backlog);
+			else if ((sysctl_tcp_fastopen &
+				  TFO_SERVER_WO_SOCKOPT2) != 0)
+				err = fastopen_init_queue(sk,
+				    ((uint)sysctl_tcp_fastopen) >> 16);
+			else
+				err = 0;
+			if (err)
+				goto out;
+		}
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -701,7 +721,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
-		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
 	sock_graft(sk2, newsock);
 
@@ -1364,7 +1385,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	if (*(u8 *)iph != 0x45)
 		goto out_unlock;
 
-	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
 		goto out_unlock;
 
 	id = ntohl(*(__be32 *)&iph->id);
@@ -1380,7 +1401,6 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	iph2 = ip_hdr(p);
 
 	if ((iph->protocol ^ iph2->protocol) |
-	    (iph->tos ^ iph2->tos) |
 	    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
 	    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
 		NAPI_GRO_CB(p)->same_flow = 0;
@@ -1390,6 +1410,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	/* All fields must match except length and checksum. */
 	NAPI_GRO_CB(p)->flush |=
 		(iph->ttl ^ iph2->ttl) |
+		(iph->tos ^ iph2->tos) |
 		((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 
 	NAPI_GRO_CB(p)->flush |= flush;
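A note on the inet_listen() hunk above: it lets a listener enable TCP Fast Open without the TCP_FASTOPEN sockopt, purely from bits in the tcp_fastopen sysctl, and with TFO_SERVER_WO_SOCKOPT2 the fastopen queue length is packed into the sysctl's upper 16 bits. A minimal user-space sketch of that decoding — the bit values below are illustrative assumptions, not copied from include/net/tcp.h:

#include <stdio.h>

/* Assumed bit layout; the kernel's real values live in include/net/tcp.h. */
#define TFO_SERVER_ENABLE	0x2
#define TFO_SERVER_WO_SOCKOPT1	0x400
#define TFO_SERVER_WO_SOCKOPT2	0x800

static void decode_tfo(unsigned int sysctl_tcp_fastopen, int backlog)
{
	if (!(sysctl_tcp_fastopen & TFO_SERVER_ENABLE))
		return;
	if (sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1)
		printf("fastopen queue length = %d (listen backlog)\n", backlog);
	else if (sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2)
		printf("fastopen queue length = %u (sysctl bits 16..31)\n",
		       sysctl_tcp_fastopen >> 16);
}

int main(void)
{
	decode_tfo(0x802 | (256u << 16), 128);	/* WO_SOCKOPT2, queue = 256 */
	return 0;
}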
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e12fad773852..2a6abc163ed2 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -94,25 +94,22 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]		= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
-/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
- * value.  So if you change this define, make appropriate changes to
- * inet_addr_hash as well.
- */
-#define IN4_ADDR_HSIZE	256
+#define IN4_ADDR_HSIZE_SHIFT	8
+#define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)
+
 static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
 static DEFINE_SPINLOCK(inet_addr_hash_lock);
 
-static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+static u32 inet_addr_hash(struct net *net, __be32 addr)
 {
-	u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+	u32 val = (__force u32) addr ^ net_hash_mix(net);
 
-	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
-		(IN4_ADDR_HSIZE - 1));
+	return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
 }
 
 static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
 {
-	unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
+	u32 hash = inet_addr_hash(net, ifa->ifa_local);
 
 	spin_lock(&inet_addr_hash_lock);
 	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
@@ -136,18 +133,18 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
  */
 struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
-	unsigned int hash = inet_addr_hash(net, addr);
+	u32 hash = inet_addr_hash(net, addr);
 	struct net_device *result = NULL;
 	struct in_ifaddr *ifa;
 	struct hlist_node *node;
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
-		struct net_device *dev = ifa->ifa_dev->dev;
-
-		if (!net_eq(dev_net(dev), net))
-			continue;
 		if (ifa->ifa_local == addr) {
+			struct net_device *dev = ifa->ifa_dev->dev;
+
+			if (!net_eq(dev_net(dev), net))
+				continue;
 			result = dev;
 			break;
 		}
@@ -182,10 +179,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 static void devinet_sysctl_register(struct in_device *idev);
 static void devinet_sysctl_unregister(struct in_device *idev);
 #else
-static inline void devinet_sysctl_register(struct in_device *idev)
+static void devinet_sysctl_register(struct in_device *idev)
 {
 }
-static inline void devinet_sysctl_unregister(struct in_device *idev)
+static void devinet_sysctl_unregister(struct in_device *idev)
 {
 }
 #endif
@@ -205,7 +202,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)
 	kfree(ifa);
 }
 
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct in_ifaddr *ifa)
 {
 	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
 }
@@ -314,7 +311,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
 }
 
 static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
-			   int destroy, struct nlmsghdr *nlh, u32 pid)
+			   int destroy, struct nlmsghdr *nlh, u32 portid)
 {
 	struct in_ifaddr *promote = NULL;
 	struct in_ifaddr *ifa, *ifa1 = *ifap;
@@ -348,7 +345,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			inet_hash_remove(ifa);
 			*ifap1 = ifa->ifa_next;
 
-			rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+			rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
 			blocking_notifier_call_chain(&inetaddr_chain,
 					NETDEV_DOWN, ifa);
 			inet_free_ifa(ifa);
@@ -385,7 +382,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	   is valid, it will try to restore deleted routes... Grr.
 	   So that, this order is correct.
 	 */
-	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
@@ -398,7 +395,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		}
 
 		promote->ifa_flags &= ~IFA_F_SECONDARY;
-		rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+		rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
 		blocking_notifier_call_chain(&inetaddr_chain,
 				NETDEV_UP, promote);
 		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
@@ -420,7 +417,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 }
 
 static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
-			     u32 pid)
+			     u32 portid)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
 	struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -467,7 +464,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
-	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 
 	return 0;
@@ -566,7 +563,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 		    !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
 			continue;
 
-		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
+		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
 		return 0;
 	}
 
@@ -652,14 +649,14 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 	if (IS_ERR(ifa))
 		return PTR_ERR(ifa);
 
-	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
+	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
 }
 
 /*
  *	Determine a default network mask, based on the IP address.
  */
 
-static inline int inet_abc_len(__be32 addr)
+static int inet_abc_len(__be32 addr)
 {
 	int rc = -1;	/* Something else, probably a multicast. */
 
@@ -1124,7 +1121,7 @@ skip:
 	}
 }
 
-static inline bool inetdev_valid_mtu(unsigned int mtu)
+static bool inetdev_valid_mtu(unsigned int mtu)
 {
 	return mtu >= 68;
 }
@@ -1239,7 +1236,7 @@ static struct notifier_block ip_netdev_notifier = {
 	.notifier_call = inetdev_event,
 };
 
-static inline size_t inet_nlmsg_size(void)
+static size_t inet_nlmsg_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
 	       + nla_total_size(4) /* IFA_ADDRESS */
@@ -1249,12 +1246,12 @@ static inline size_t inet_nlmsg_size(void)
 }
 
 static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
-			    u32 pid, u32 seq, int event, unsigned int flags)
+			    u32 portid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr *nlh;
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
@@ -1316,7 +1313,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				if (ip_idx < s_ip_idx)
 					continue;
 				if (inet_fill_ifaddr(skb, ifa,
-					     NETLINK_CB(cb->skb).pid,
+					     NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq,
 					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
 					rcu_read_unlock();
@@ -1338,7 +1335,7 @@ done:
 }
 
 static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
-		      u32 pid)
+		      u32 portid)
 {
 	struct sk_buff *skb;
 	u32 seq = nlh ? nlh->nlmsg_seq : 0;
@@ -1350,14 +1347,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	if (skb == NULL)
 		goto errout;
 
-	err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
+	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+	rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
 	return;
 errout:
 	if (err < 0)
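The devinet.c change above swaps a hand-rolled xor-fold for hash_32(), the kernel's multiplicative hash: multiply by a 32-bit golden-ratio constant and keep the top IN4_ADDR_HSIZE_SHIFT bits. A standalone sketch of the same computation — the constant is the one include/linux/hash.h used in this era, and net_hash_mix(net), which xors in per-namespace entropy before hashing, is omitted here:

#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_PRIME_32	0x9e370001U	/* from include/linux/hash.h */
#define IN4_ADDR_HSIZE_SHIFT	8

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	/* multiply, then take the highest 'bits' bits of the product */
	return (val * GOLDEN_RATIO_PRIME_32) >> (32 - bits);
}

int main(void)
{
	uint32_t addr = 0xc0a80001;	/* 192.168.0.1, host byte order */

	printf("bucket %u of %u\n",
	       hash_32(addr, IN4_ADDR_HSIZE_SHIFT), 1U << IN4_ADDR_HSIZE_SHIFT);
	return 0;
}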
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8e2b475da9fa..68c93d1bb03a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -218,7 +218,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	scope = RT_SCOPE_UNIVERSE;
 	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 		fl4.flowi4_oif = 0;
-		fl4.flowi4_iif = net->loopback_dev->ifindex;
+		fl4.flowi4_iif = LOOPBACK_IFINDEX;
 		fl4.daddr = ip_hdr(skb)->saddr;
 		fl4.saddr = 0;
 		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
@@ -557,7 +557,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 	cfg->fc_flags = rtm->rtm_flags;
 	cfg->fc_nlflags = nlh->nlmsg_flags;
 
-	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 	cfg->fc_nlinfo.nlh = nlh;
 	cfg->fc_nlinfo.nl_net = net;
 
@@ -955,7 +955,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	struct fib_result_nl *frn;
 	struct nlmsghdr *nlh;
 	struct fib_table *tb;
-	u32 pid;
+	u32 portid;
 
 	net = sock_net(skb->sk);
 	nlh = nlmsg_hdr(skb);
@@ -973,10 +973,10 @@ static void nl_fib_input(struct sk_buff *skb)
 
 	nl_fib_lookup(frn, tb);
 
-	pid = NETLINK_CB(skb).pid;	/* pid of sending process */
-	NETLINK_CB(skb).pid = 0;	/* from kernel */
+	portid = NETLINK_CB(skb).portid;	/* pid of sending process */
+	NETLINK_CB(skb).portid = 0;		/* from kernel */
 	NETLINK_CB(skb).dst_group = 0;	/* unicast */
-	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
+	netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
 }
 
 static int __net_init nl_fib_lookup_init(struct net *net)
@@ -986,7 +986,7 @@ static int __net_init nl_fib_lookup_init(struct net *net)
 		.input	= nl_fib_input,
 	};
 
-	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
+	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
 	if (sk == NULL)
 		return -EAFNOSUPPORT;
 	net->ipv4.fibnl = sk;
@@ -1041,7 +1041,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct in_device *in_dev;
 	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_UNREGISTER) {
@@ -1050,8 +1050,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		return NOTIFY_DONE;
 	}
 
-	if (!in_dev)
-		return NOTIFY_DONE;
+	in_dev = __in_dev_get_rtnl(dev);
 
 	switch (event) {
 	case NETDEV_UP:
@@ -1062,16 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev));
+		rt_cache_flush(net);
 		break;
 	case NETDEV_DOWN:
 		fib_disable_ip(dev, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-		rt_cache_flush(dev_net(dev));
-		break;
-	case NETDEV_UNREGISTER_BATCH:
+		rt_cache_flush(net);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da80dc14cc76..3509065e409a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -391,7 +391,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 	if (skb == NULL)
 		goto errout;
 
-	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
+	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
 			    fa->fa_type, key, dst_len,
 			    fa->fa_tos, fa->fa_info, nlm_flags);
 	if (err < 0) {
@@ -400,7 +400,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 		kfree_skb(skb);
 		goto errout;
 	}
-	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
 		    info->nlh, GFP_KERNEL);
 	return;
 errout:
@@ -989,14 +989,14 @@ failure:
 	return ERR_PTR(err);
 }
 
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
 		  struct fib_info *fi, unsigned int flags)
 {
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 
-	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index d1b93595b4a7..31d771ca9a70 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1550,7 +1550,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		 * state.directly.
 		 */
 		if (pref_mismatch) {
-			int mp = KEYLENGTH - fls(pref_mismatch);
+			/* fls(x) = __fls(x) + 1 */
+			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
 
 			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
 				goto backtrace;
@@ -1655,7 +1656,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	if (!l)
 		return -ESRCH;
 
-	fa_head = get_fa_head(l, plen);
+	li = find_leaf_info(l, plen);
+
+	if (!li)
+		return -ESRCH;
+
+	fa_head = &li->falh;
 	fa = fib_find_alias(fa_head, tos, 0);
 
 	if (!fa)
@@ -1691,9 +1697,6 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
-	l = fib_find_node(t, key);
-	li = find_leaf_info(l, plen);
-
 	list_del_rcu(&fa->fa_list);
 
 	if (!plen)
@@ -1870,7 +1873,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
 			continue;
 		}
 
-		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq,
 				  RTM_NEWROUTE,
 				  tb->tb_id,
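The fib_trie.c lookup hunk above leans on the identity its new comment spells out: for x != 0, fls(x) (1-based index of the most significant set bit) equals __fls(x) (0-based index) plus one, so KEYLENGTH - fls(x) rewrites to KEYLENGTH - __fls(x) - 1. A quick check of that identity using compiler builtins as stand-ins (assumes GCC/Clang and 64-bit unsigned long):

#include <assert.h>

static int fls_like(unsigned long x)	/* 1-based, 0 for x == 0 */
{
	return x ? 64 - __builtin_clzl(x) : 0;
}

static int fls_under(unsigned long x)	/* 0-based __fls(); caller ensures x != 0 */
{
	return 63 - __builtin_clzl(x);
}

int main(void)
{
	for (unsigned long x = 1; x < (1UL << 20); x++)
		assert(fls_like(x) == fls_under(x) + 1);
	return 0;
}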
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6699f23e6f55..736ab70fd179 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -815,14 +815,15 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
 	return 1;
 }
 
-static void igmp_heard_report(struct in_device *in_dev, __be32 group)
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
 {
 	struct ip_mc_list *im;
 
 	/* Timers are only set for non-local groups */
 
 	if (group == IGMP_ALL_HOSTS)
-		return;
+		return false;
 
 	rcu_read_lock();
 	for_each_pmc_rcu(in_dev, im) {
@@ -832,9 +833,11 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
 		}
 	}
 	rcu_read_unlock();
+	return false;
 }
 
-static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	int len)
 {
 	struct igmphdr *ih = igmp_hdr(skb);
@@ -866,7 +869,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		/* clear deleted report items */
 		igmpv3_clear_delrec(in_dev);
 	} else if (len < 12) {
-		return;	/* ignore bogus packet; freed by caller */
+		return true;	/* ignore bogus packet; freed by caller */
 	} else if (IGMP_V1_SEEN(in_dev)) {
 		/* This is a v3 query with v1 queriers present */
 		max_delay = IGMP_Query_Response_Interval;
@@ -883,13 +886,13 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			max_delay = 1;	/* can't mod w/ 0 */
 	} else { /* v3 */
 		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
-			return;
+			return true;
 
 		ih3 = igmpv3_query_hdr(skb);
 		if (ih3->nsrcs) {
 			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
 					   + ntohs(ih3->nsrcs)*sizeof(__be32)))
-				return;
+				return true;
 			ih3 = igmpv3_query_hdr(skb);
 		}
 
@@ -901,9 +904,9 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		in_dev->mr_qrv = ih3->qrv;
 	if (!group) { /* general query */
 		if (ih3->nsrcs)
-			return;	/* no sources allowed */
+			return false;	/* no sources allowed */
 		igmp_gq_start_timer(in_dev);
-		return;
+		return false;
 	}
 	/* mark sources to include, if group & source-specific */
 	mark = ih3->nsrcs != 0;
@@ -939,6 +942,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			igmp_mod_timer(im, max_delay);
 	}
 	rcu_read_unlock();
+	return false;
 }
 
 /* called in rcu_read_lock() section */
@@ -948,6 +952,7 @@ int igmp_rcv(struct sk_buff *skb)
 	struct igmphdr *ih;
 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 	int len = skb->len;
+	bool dropped = true;
 
 	if (in_dev == NULL)
 		goto drop;
@@ -969,7 +974,7 @@ int igmp_rcv(struct sk_buff *skb)
 	ih = igmp_hdr(skb);
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
-		igmp_heard_query(in_dev, skb, len);
+		dropped = igmp_heard_query(in_dev, skb, len);
 		break;
 	case IGMP_HOST_MEMBERSHIP_REPORT:
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
@@ -979,7 +984,7 @@ int igmp_rcv(struct sk_buff *skb)
 		/* don't rely on MC router hearing unicast reports */
 		if (skb->pkt_type == PACKET_MULTICAST ||
 		    skb->pkt_type == PACKET_BROADCAST)
-			igmp_heard_report(in_dev, ih->group);
+			dropped = igmp_heard_report(in_dev, ih->group);
 		break;
 	case IGMP_PIM:
 #ifdef CONFIG_IP_PIMSM_V1
@@ -997,7 +1002,10 @@ int igmp_rcv(struct sk_buff *skb)
 	}
 
 drop:
-	kfree_skb(skb);
+	if (dropped)
+		kfree_skb(skb);
+	else
+		consume_skb(skb);
 	return 0;
 }
 
@@ -1896,6 +1904,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 	rtnl_unlock();
 	return ret;
 }
+EXPORT_SYMBOL(ip_mc_leave_group);
 
 int ip_mc_source(int add, int omode, struct sock *sk, struct
 	ip_mreq_source *mreqs, int ifindex)
@@ -2435,6 +2444,8 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 	struct ip_mc_list *im = (struct ip_mc_list *)v;
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
 	char *querier;
+	long delta;
+
 #ifdef CONFIG_IP_MULTICAST
 	querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
 		  IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2448,11 +2459,12 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 			   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
 	}
 
+	delta = im->timer.expires - jiffies;
 	seq_printf(seq,
 		   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
 		   im->multiaddr, im->users,
-		   im->tm_running, im->tm_running ?
-		   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+		   im->tm_running,
+		   im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
 		   im->reporter);
 	}
 	return 0;
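The igmp.c changes above thread a dropped/consumed verdict from the query and report handlers back up to igmp_rcv(), so the final free can distinguish kfree_skb() (counted as a packet drop by tracepoints and drop monitors) from consume_skb() (a normal end of life). A toy user-space model of the pattern — the sk_buff and the two free functions here are stand-ins, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct sk_buff { bool bogus; };

static void kfree_skb(struct sk_buff *skb)   { (void)skb; printf("dropped\n"); }
static void consume_skb(struct sk_buff *skb) { (void)skb; printf("consumed\n"); }

/* return true if the packet should be accounted as dropped */
static bool handle(struct sk_buff *skb)
{
	return skb->bogus;
}

int main(void)
{
	struct sk_buff good = { false }, bad = { true };
	struct sk_buff *pkts[] = { &good, &bad };

	for (unsigned i = 0; i < 2; i++) {
		if (handle(pkts[i]))
			kfree_skb(pkts[i]);	/* error path, visible to drop monitors */
		else
			consume_skb(pkts[i]);	/* normal, non-error free */
	}
	return 0;
}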
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7f75f21d7b83..f0c5b9c1a957 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct sock *newsk;
+	struct request_sock *req;
 	int error;
 
 	lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		goto out_err;
 
 	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+	if (reqsk_queue_empty(queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+	req = reqsk_queue_remove(queue);
+	newsk = req->sk;
+
+	sk_acceptq_removed(sk);
+	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
+		spin_lock_bh(&queue->fastopenq->lock);
+		if (tcp_rsk(req)->listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq->lock);
+	}
 out:
 	release_sock(sk);
+	if (req)
+		__reqsk_free(req);
 	return newsk;
 out_err:
 	newsk = NULL;
+	req = NULL;
 	*err = error;
 	goto out;
 }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 
 	inet_csk_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(queue);
 
 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	reqsk_queue_destroy(queue);
 
 	while ((req = acc_req) != NULL) {
 		struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
+		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+			BUG_ON(sk != tcp_rsk(req)->listener);
+
+			/* Paranoid, to prevent race condition if
+			 * an inbound pkt destined for child is
+			 * blocked by sock lock in tcp_v4_rcv().
+			 * Also to satisfy an assertion in
+			 * tcp_v4_destroy_sock().
+			 */
+			tcp_sk(child)->fastopen_rsk = NULL;
+			sock_put(sk);
+		}
 		inet_csk_destroy_sock(child);
 
 		bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		__reqsk_free(req);
 	}
+	if (queue->fastopenq != NULL) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		spin_lock_bh(&queue->fastopenq->lock);
+		acc_req = queue->fastopenq->rskq_rst_head;
+		queue->fastopenq->rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq->lock);
+		while ((req = acc_req) != NULL) {
+			acc_req = req->dl_next;
+			__reqsk_free(req);
+		}
+	}
 	WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
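The inet_csk_accept() hunk above deals with a Fast Open wrinkle: the child socket exists before the three-way handshake completes, so accept() can win the race. Setting req->sk = NULL hands the child to the caller while leaving the request alive for whichever path finishes (or aborts) the handshake to free. A toy model of that ownership handoff, with plain structs standing in for the kernel objects:

#include <stdio.h>
#include <stdlib.h>

struct child   { int id; };
struct request { struct child *sk; };

/* Returns the child; frees the request only if the handshake already ended. */
static struct child *toy_accept(struct request *req, int handshake_done)
{
	struct child *newsk = req->sk;

	if (!handshake_done) {
		req->sk = NULL;	/* mark child as taken; 3WHS path frees req later */
		return newsk;
	}
	free(req);
	return newsk;
}

int main(void)
{
	struct request *req = calloc(1, sizeof(*req));
	static struct child c = { 42 };

	req->sk = &c;
	printf("accepted child %d\n", toy_accept(req, 0)->id);
	free(req);	/* here: roughly what reqsk_fastopen_remove() does in-kernel */
	return 0;
}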
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8bc005b1435f..535584c00f91 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -70,7 +70,7 @@ static inline void inet_diag_unlock_handler(
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
 		      struct user_namespace *user_ns,
-		      u32 pid, u32 seq, u16 nlmsg_flags,
+		      u32 portid, u32 seq, u16 nlmsg_flags,
 		      const struct nlmsghdr *unlh)
 {
 	const struct inet_sock *inet = inet_sk(sk);
@@ -84,7 +84,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	handler = inet_diag_table[req->sdiag_protocol];
 	BUG_ON(handler == NULL);
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			nlmsg_flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -201,23 +201,23 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
 static int inet_csk_diag_fill(struct sock *sk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
 			      struct user_namespace *user_ns,
-			      u32 pid, u32 seq, u16 nlmsg_flags,
+			      u32 portid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
 	return inet_sk_diag_fill(sk, inet_csk(sk),
-			skb, req, user_ns, pid, seq, nlmsg_flags, unlh);
+			skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 			       struct sk_buff *skb, struct inet_diag_req_v2 *req,
-			       u32 pid, u32 seq, u16 nlmsg_flags,
+			       u32 portid, u32 seq, u16 nlmsg_flags,
 			       const struct nlmsghdr *unlh)
 {
 	long tmo;
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			nlmsg_flags);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -260,14 +260,14 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			struct inet_diag_req_v2 *r,
 			struct user_namespace *user_ns,
-			u32 pid, u32 seq, u16 nlmsg_flags,
+			u32 portid, u32 seq, u16 nlmsg_flags,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
-					   skb, r, pid, seq, nlmsg_flags,
+					   skb, r, portid, seq, nlmsg_flags,
 					   unlh);
-	return inet_csk_diag_fill(sk, skb, r, user_ns, pid, seq, nlmsg_flags, unlh);
+	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -316,14 +316,14 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 
 	err = sk_diag_fill(sk, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).ssk),
-			   NETLINK_CB(in_skb).pid,
+			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		nlmsg_free(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -557,7 +557,7 @@ static int inet_csk_diag_dump(struct sock *sk,
 
 	return inet_csk_diag_fill(sk, skb, r,
 				  sk_user_ns(NETLINK_CB(cb->skb).ssk),
-				  NETLINK_CB(cb->skb).pid,
+				  NETLINK_CB(cb->skb).portid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
@@ -592,14 +592,14 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 	}
 
 	return inet_twsk_diag_fill(tw, skb, r,
-				   NETLINK_CB(cb->skb).pid,
+				   NETLINK_CB(cb->skb).portid,
 				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
 static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 			      struct request_sock *req,
 			      struct user_namespace *user_ns,
-			      u32 pid, u32 seq,
+			      u32 portid, u32 seq,
 			      const struct nlmsghdr *unlh)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -608,7 +608,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	struct nlmsghdr *nlh;
 	long tmo;
 
-	nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
 			NLM_F_MULTI);
 	if (!nlh)
 		return -EMSGSIZE;
@@ -711,7 +711,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 	err = inet_diag_fill_req(skb, sk, req,
 				 sk_user_ns(NETLINK_CB(cb->skb).ssk),
-				 NETLINK_CB(cb->skb).pid,
+				 NETLINK_CB(cb->skb).portid,
 				 cb->nlh->nlmsg_seq, cb->nlh);
 	if (err < 0) {
 		cb->args[3] = j + 1;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 85190e69297b..4750d2b74d79 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -89,7 +89,7 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 	nf->low_thresh = 0;
 
 	local_bh_disable();
-	inet_frag_evictor(nf, f);
+	inet_frag_evictor(nf, f, true);
 	local_bh_enable();
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
@@ -158,11 +158,16 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f)
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 {
 	struct inet_frag_queue *q;
 	int work, evicted = 0;
 
+	if (!force) {
+		if (atomic_read(&nf->mem) <= nf->high_thresh)
+			return 0;
+	}
+
 	work = atomic_read(&nf->mem) - nf->low_thresh;
 	while (work > 0) {
 		read_lock(&f->lock);
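With the inet_fragment.c change above, the high-watermark check moves out of the callers and into inet_frag_evictor() itself, behind a force flag; namespace teardown passes force=true (after zeroing low_thresh) so everything is flushed. A loose user-space model of that control flow — thresholds and the 64-unit queue size are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

static int mem = 300, high_thresh = 256, low_thresh = 192;

static int evictor(bool force)
{
	int evicted = 0;

	if (!force && mem <= high_thresh)
		return 0;	/* under the high watermark: nothing to do */
	while (mem > (force ? 0 : low_thresh)) {
		mem -= 64;	/* pretend each evicted queue frees 64 units */
		evicted++;
	}
	return evicted;
}

int main(void)
{
	printf("evicted %d, mem %d\n", evictor(false), mem);	/* down to low_thresh */
	printf("evicted %d, mem %d\n", evictor(true), mem);	/* teardown: drain all */
	return 0;
}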
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409c..448e68546827 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -219,7 +219,7 @@ static void ip_evictor(struct net *net)
 {
 	int evicted;
 
-	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
+	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
 	if (evicted)
 		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
 }
@@ -523,6 +523,10 @@ found:
 	if (offset == 0)
 		qp->q.last_in |= INET_FRAG_FIRST_IN;
 
+	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+	    skb->len + ihl > qp->q.max_size)
+		qp->q.max_size = skb->len + ihl;
+
 	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    qp->q.meat == qp->q.len)
 		return ip_frag_reasm(qp, prev, dev);
@@ -646,9 +650,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	head->next = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
+	IPCB(head)->frag_max_size = qp->q.max_size;
 
 	iph = ip_hdr(head);
-	iph->frag_off = 0;
+	/* max_size != 0 implies at least one fragment had IP_DF set */
+	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
 	iph->tot_len = htons(len);
 	iph->tos |= ecn;
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
@@ -678,8 +684,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
 
 	/* Start by cleaning up the memory. */
-	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
-		ip_evictor(net);
+	ip_evictor(net);
 
 	/* Lookup (or create) queue header */
 	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
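The ip_fragment.c hunks above make reassembly remember the largest received fragment (header included) whenever a fragment carried IP_DF, expose it as IPCB(head)->frag_max_size, and re-mark the reassembled datagram with IP_DF so later forwarding can avoid emitting fragments bigger than what actually arrived on the wire. A toy rendition of the bookkeeping, with made-up fragment sizes:

#include <stdio.h>

struct frag { int len, ihl, df; };	/* payload length, header length, DF bit */

int main(void)
{
	struct frag frags[] = { {1480, 20, 1}, {1480, 20, 1}, {520, 20, 1} };
	int max_size = 0;

	for (unsigned i = 0; i < sizeof(frags) / sizeof(frags[0]); i++) {
		int sz = frags[i].len + frags[i].ihl;

		if (frags[i].df && sz > max_size)
			max_size = sz;
	}
	printf("frag_max_size = %d, reassembled DF = %d\n", max_size, max_size != 0);
	return 0;
}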
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b062a98574f2..7240f8e2dd45 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -120,6 +120,10 @@
120 Alexey Kuznetsov. 120 Alexey Kuznetsov.
121 */ 121 */
122 122
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
123static struct rtnl_link_ops ipgre_link_ops __read_mostly; 127static struct rtnl_link_ops ipgre_link_ops __read_mostly;
124static int ipgre_tunnel_init(struct net_device *dev); 128static int ipgre_tunnel_init(struct net_device *dev);
125static void ipgre_tunnel_setup(struct net_device *dev); 129static void ipgre_tunnel_setup(struct net_device *dev);
@@ -204,7 +208,9 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
204 tot->rx_crc_errors = dev->stats.rx_crc_errors; 208 tot->rx_crc_errors = dev->stats.rx_crc_errors;
205 tot->rx_fifo_errors = dev->stats.rx_fifo_errors; 209 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
206 tot->rx_length_errors = dev->stats.rx_length_errors; 210 tot->rx_length_errors = dev->stats.rx_length_errors;
211 tot->rx_frame_errors = dev->stats.rx_frame_errors;
207 tot->rx_errors = dev->stats.rx_errors; 212 tot->rx_errors = dev->stats.rx_errors;
213
208 tot->tx_fifo_errors = dev->stats.tx_fifo_errors; 214 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209 tot->tx_carrier_errors = dev->stats.tx_carrier_errors; 215 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210 tot->tx_dropped = dev->stats.tx_dropped; 216 tot->tx_dropped = dev->stats.tx_dropped;
@@ -214,11 +220,25 @@ static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
214 return tot; 220 return tot;
215} 221}
216 222
223/* Does key in tunnel parameters match packet */
224static bool ipgre_key_match(const struct ip_tunnel_parm *p,
225 __be16 flags, __be32 key)
226{
227 if (p->i_flags & GRE_KEY) {
228 if (flags & GRE_KEY)
229 return key == p->i_key;
230 else
231 return false; /* key expected, none present */
232 } else
233 return !(flags & GRE_KEY);
234}
235
217/* Given src, dst and key, find appropriate for input tunnel. */ 236/* Given src, dst and key, find appropriate for input tunnel. */
218 237
219static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, 238static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
220 __be32 remote, __be32 local, 239 __be32 remote, __be32 local,
221 __be32 key, __be16 gre_proto) 240 __be16 flags, __be32 key,
241 __be16 gre_proto)
222{ 242{
223 struct net *net = dev_net(dev); 243 struct net *net = dev_net(dev);
224 int link = dev->ifindex; 244 int link = dev->ifindex;
@@ -233,10 +253,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
233 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { 253 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
234 if (local != t->parms.iph.saddr || 254 if (local != t->parms.iph.saddr ||
235 remote != t->parms.iph.daddr || 255 remote != t->parms.iph.daddr ||
236 key != t->parms.i_key ||
237 !(t->dev->flags & IFF_UP)) 256 !(t->dev->flags & IFF_UP))
238 continue; 257 continue;
239 258
259 if (!ipgre_key_match(&t->parms, flags, key))
260 continue;
261
240 if (t->dev->type != ARPHRD_IPGRE && 262 if (t->dev->type != ARPHRD_IPGRE &&
241 t->dev->type != dev_type) 263 t->dev->type != dev_type)
242 continue; 264 continue;
@@ -257,10 +279,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
257 279
258 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { 280 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
259 if (remote != t->parms.iph.daddr || 281 if (remote != t->parms.iph.daddr ||
260 key != t->parms.i_key ||
261 !(t->dev->flags & IFF_UP)) 282 !(t->dev->flags & IFF_UP))
262 continue; 283 continue;
263 284
285 if (!ipgre_key_match(&t->parms, flags, key))
286 continue;
287
264 if (t->dev->type != ARPHRD_IPGRE && 288 if (t->dev->type != ARPHRD_IPGRE &&
265 t->dev->type != dev_type) 289 t->dev->type != dev_type)
266 continue; 290 continue;
@@ -283,10 +307,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
283 if ((local != t->parms.iph.saddr && 307 if ((local != t->parms.iph.saddr &&
284 (local != t->parms.iph.daddr || 308 (local != t->parms.iph.daddr ||
285 !ipv4_is_multicast(local))) || 309 !ipv4_is_multicast(local))) ||
286 key != t->parms.i_key ||
287 !(t->dev->flags & IFF_UP)) 310 !(t->dev->flags & IFF_UP))
288 continue; 311 continue;
289 312
313 if (!ipgre_key_match(&t->parms, flags, key))
314 continue;
315
290 if (t->dev->type != ARPHRD_IPGRE && 316 if (t->dev->type != ARPHRD_IPGRE &&
291 t->dev->type != dev_type) 317 t->dev->type != dev_type)
292 continue; 318 continue;
@@ -489,6 +515,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
489 const int code = icmp_hdr(skb)->code; 515 const int code = icmp_hdr(skb)->code;
490 struct ip_tunnel *t; 516 struct ip_tunnel *t;
491 __be16 flags; 517 __be16 flags;
518 __be32 key = 0;
492 519
493 flags = p[0]; 520 flags = p[0];
494 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 521 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
@@ -505,6 +532,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
505 if (skb_headlen(skb) < grehlen) 532 if (skb_headlen(skb) < grehlen)
506 return; 533 return;
507 534
535 if (flags & GRE_KEY)
536 key = *(((__be32 *)p) + (grehlen / 4) - 1);
537
508 switch (type) { 538 switch (type) {
509 default: 539 default:
510 case ICMP_PARAMETERPROB: 540 case ICMP_PARAMETERPROB:
@@ -533,49 +563,34 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
533 break; 563 break;
534 } 564 }
535 565
536 rcu_read_lock();
537 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, 566 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
538 flags & GRE_KEY ? 567 flags, key, p[1]);
539 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 568
540 p[1]);
541 if (t == NULL) 569 if (t == NULL)
542 goto out; 570 return;
543 571
544 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 572 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
545 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 573 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
546 t->parms.link, 0, IPPROTO_GRE, 0); 574 t->parms.link, 0, IPPROTO_GRE, 0);
547 goto out; 575 return;
548 } 576 }
549 if (type == ICMP_REDIRECT) { 577 if (type == ICMP_REDIRECT) {
550 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, 578 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
551 IPPROTO_GRE, 0); 579 IPPROTO_GRE, 0);
552 goto out; 580 return;
553 } 581 }
554 if (t->parms.iph.daddr == 0 || 582 if (t->parms.iph.daddr == 0 ||
555 ipv4_is_multicast(t->parms.iph.daddr)) 583 ipv4_is_multicast(t->parms.iph.daddr))
556 goto out; 584 return;
557 585
558 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 586 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
559 goto out; 587 return;
560 588
561 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) 589 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
562 t->err_count++; 590 t->err_count++;
563 else 591 else
564 t->err_count = 1; 592 t->err_count = 1;
565 t->err_time = jiffies; 593 t->err_time = jiffies;
566out:
567 rcu_read_unlock();
568}
569
570static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
571{
572 if (INET_ECN_is_ce(iph->tos)) {
573 if (skb->protocol == htons(ETH_P_IP)) {
574 IP_ECN_set_ce(ip_hdr(skb));
575 } else if (skb->protocol == htons(ETH_P_IPV6)) {
576 IP6_ECN_set_ce(ipv6_hdr(skb));
577 }
578 }
579} 594}
580 595
581static inline u8 596static inline u8
@@ -600,9 +615,10 @@ static int ipgre_rcv(struct sk_buff *skb)
600 struct ip_tunnel *tunnel; 615 struct ip_tunnel *tunnel;
601 int offset = 4; 616 int offset = 4;
602 __be16 gre_proto; 617 __be16 gre_proto;
618 int err;
603 619
604 if (!pskb_may_pull(skb, 16)) 620 if (!pskb_may_pull(skb, 16))
605 goto drop_nolock; 621 goto drop;
606 622
607 iph = ip_hdr(skb); 623 iph = ip_hdr(skb);
608 h = skb->data; 624 h = skb->data;
@@ -613,7 +629,7 @@ static int ipgre_rcv(struct sk_buff *skb)
613 - We do not support routing headers. 629 - We do not support routing headers.
614 */ 630 */
615 if (flags&(GRE_VERSION|GRE_ROUTING)) 631 if (flags&(GRE_VERSION|GRE_ROUTING))
616 goto drop_nolock; 632 goto drop;
617 633
618 if (flags&GRE_CSUM) { 634 if (flags&GRE_CSUM) {
619 switch (skb->ip_summed) { 635 switch (skb->ip_summed) {
@@ -641,10 +657,10 @@ static int ipgre_rcv(struct sk_buff *skb)
641 657
642 gre_proto = *(__be16 *)(h + 2); 658 gre_proto = *(__be16 *)(h + 2);
643 659
644 rcu_read_lock(); 660 tunnel = ipgre_tunnel_lookup(skb->dev,
645 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 661 iph->saddr, iph->daddr, flags, key,
646 iph->saddr, iph->daddr, key, 662 gre_proto);
647 gre_proto))) { 663 if (tunnel) {
648 struct pcpu_tstats *tstats; 664 struct pcpu_tstats *tstats;
649 665
650 secpath_reset(skb); 666 secpath_reset(skb);
@@ -703,27 +719,33 @@ static int ipgre_rcv(struct sk_buff *skb)
703 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 719 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
704 } 720 }
705 721
722 __skb_tunnel_rx(skb, tunnel->dev);
723
724 skb_reset_network_header(skb);
725 err = IP_ECN_decapsulate(iph, skb);
726 if (unlikely(err)) {
727 if (log_ecn_error)
728 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
729 &iph->saddr, iph->tos);
730 if (err > 1) {
731 ++tunnel->dev->stats.rx_frame_errors;
732 ++tunnel->dev->stats.rx_errors;
733 goto drop;
734 }
735 }
736
706 tstats = this_cpu_ptr(tunnel->dev->tstats); 737 tstats = this_cpu_ptr(tunnel->dev->tstats);
707 u64_stats_update_begin(&tstats->syncp); 738 u64_stats_update_begin(&tstats->syncp);
708 tstats->rx_packets++; 739 tstats->rx_packets++;
709 tstats->rx_bytes += skb->len; 740 tstats->rx_bytes += skb->len;
710 u64_stats_update_end(&tstats->syncp); 741 u64_stats_update_end(&tstats->syncp);
711 742
712 __skb_tunnel_rx(skb, tunnel->dev); 743 gro_cells_receive(&tunnel->gro_cells, skb);
713
714 skb_reset_network_header(skb);
715 ipgre_ecn_decapsulate(iph, skb);
716
717 netif_rx(skb);
718
719 rcu_read_unlock();
720 return 0; 744 return 0;
721 } 745 }
722 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 746 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
723 747
724drop: 748drop:
725 rcu_read_unlock();
726drop_nolock:
727 kfree_skb(skb); 749 kfree_skb(skb);
728 return 0; 750 return 0;
729} 751}
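
The hand-rolled ipgre_ecn_decapsulate() gives way to the shared IP_ECN_decapsulate() helper, whose return value encodes how the outer and inner ECN fields disagree; the err > 1 test above relies on that contract. A sketch of the assumed semantics:

	err = IP_ECN_decapsulate(iph, skb);
	/* assumed contract:
	 *   0 - headers consistent; CE propagated to the inner packet if set
	 *   1 - outer ECT but inner not-ECT: log-worthy, packet still usable
	 *   2 - outer CE but inner not-ECT: the CE mark would be lost, drop
	 */
	if (err > 1)
		goto drop;
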
@@ -745,6 +767,10 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
745 __be32 dst; 767 __be32 dst;
746 int mtu; 768 int mtu;
747 769
770 if (skb->ip_summed == CHECKSUM_PARTIAL &&
771 skb_checksum_help(skb))
772 goto tx_error;
773
748 if (dev->type == ARPHRD_ETHER) 774 if (dev->type == ARPHRD_ETHER)
749 IPCB(skb)->flags = 0; 775 IPCB(skb)->flags = 0;
750 776
@@ -1292,10 +1318,18 @@ static const struct net_device_ops ipgre_netdev_ops = {
1292 1318
1293static void ipgre_dev_free(struct net_device *dev) 1319static void ipgre_dev_free(struct net_device *dev)
1294{ 1320{
1321 struct ip_tunnel *tunnel = netdev_priv(dev);
1322
1323 gro_cells_destroy(&tunnel->gro_cells);
1295 free_percpu(dev->tstats); 1324 free_percpu(dev->tstats);
1296 free_netdev(dev); 1325 free_netdev(dev);
1297} 1326}
1298 1327
1328#define GRE_FEATURES (NETIF_F_SG | \
1329 NETIF_F_FRAGLIST | \
1330 NETIF_F_HIGHDMA | \
1331 NETIF_F_HW_CSUM)
1332
1299static void ipgre_tunnel_setup(struct net_device *dev) 1333static void ipgre_tunnel_setup(struct net_device *dev)
1300{ 1334{
1301 dev->netdev_ops = &ipgre_netdev_ops; 1335 dev->netdev_ops = &ipgre_netdev_ops;
@@ -1309,12 +1343,16 @@ static void ipgre_tunnel_setup(struct net_device *dev)
1309 dev->addr_len = 4; 1343 dev->addr_len = 4;
1310 dev->features |= NETIF_F_NETNS_LOCAL; 1344 dev->features |= NETIF_F_NETNS_LOCAL;
1311 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1345 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1346
1347 dev->features |= GRE_FEATURES;
1348 dev->hw_features |= GRE_FEATURES;
1312} 1349}
1313 1350
1314static int ipgre_tunnel_init(struct net_device *dev) 1351static int ipgre_tunnel_init(struct net_device *dev)
1315{ 1352{
1316 struct ip_tunnel *tunnel; 1353 struct ip_tunnel *tunnel;
1317 struct iphdr *iph; 1354 struct iphdr *iph;
1355 int err;
1318 1356
1319 tunnel = netdev_priv(dev); 1357 tunnel = netdev_priv(dev);
1320 iph = &tunnel->parms.iph; 1358 iph = &tunnel->parms.iph;
@@ -1341,6 +1379,12 @@ static int ipgre_tunnel_init(struct net_device *dev)
1341 if (!dev->tstats) 1379 if (!dev->tstats)
1342 return -ENOMEM; 1380 return -ENOMEM;
1343 1381
1382 err = gro_cells_init(&tunnel->gro_cells, dev);
1383 if (err) {
1384 free_percpu(dev->tstats);
1385 return err;
1386 }
1387
1344 return 0; 1388 return 0;
1345} 1389}
1346 1390
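
Receive completion also changes: instead of handing each decapsulated skb to netif_rx(), the tunnel feeds a gro_cells instance, which queues packets to a per-CPU NAPI context so GRE traffic benefits from GRO batching. The lifecycle, assuming the gro_cells API from include/net/gro_cells.h:

	#include <net/gro_cells.h>

	/* ndo_init: allocate the per-CPU cells */
	err = gro_cells_init(&tunnel->gro_cells, dev);

	/* rx path: replaces netif_rx() for each decapsulated skb */
	gro_cells_receive(&tunnel->gro_cells, skb);

	/* destructor: drain and free, before free_netdev() */
	gro_cells_destroy(&tunnel->gro_cells);
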
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c196d749daf2..24a29a39e9a8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -467,7 +467,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
467 467
468 iph = ip_hdr(skb); 468 iph = ip_hdr(skb);
469 469
470 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { 470 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
471 (IPCB(skb)->frag_max_size &&
472 IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
471 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 473 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
472 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 474 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
473 htonl(ip_skb_dst_mtu(skb))); 475 htonl(ip_skb_dst_mtu(skb)));
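
The second clause makes re-fragmentation honour the largest fragment seen at reassembly time, so a flow defragmented by conntrack is never re-emitted in bigger pieces than the sender used, even without DF. This assumes frag_max_size is recorded by the IPv4 defrag path in the skb control block, roughly:

	/* assumed layout; frag_max_size is filled in by ip_defrag() */
	struct inet_skb_parm {
		struct ip_options	opt;		/* compiled options */
		unsigned char		flags;
		u16			frag_max_size;	/* largest fragment seen */
	};
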
@@ -791,6 +793,7 @@ static int __ip_append_data(struct sock *sk,
791 struct flowi4 *fl4, 793 struct flowi4 *fl4,
792 struct sk_buff_head *queue, 794 struct sk_buff_head *queue,
793 struct inet_cork *cork, 795 struct inet_cork *cork,
796 struct page_frag *pfrag,
794 int getfrag(void *from, char *to, int offset, 797 int getfrag(void *from, char *to, int offset,
795 int len, int odd, struct sk_buff *skb), 798 int len, int odd, struct sk_buff *skb),
796 void *from, int length, int transhdrlen, 799 void *from, int length, int transhdrlen,
@@ -985,47 +988,30 @@ alloc_new_skb:
985 } 988 }
986 } else { 989 } else {
987 int i = skb_shinfo(skb)->nr_frags; 990 int i = skb_shinfo(skb)->nr_frags;
988 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
989 struct page *page = cork->page;
990 int off = cork->off;
991 unsigned int left;
992
993 if (page && (left = PAGE_SIZE - off) > 0) {
994 if (copy >= left)
995 copy = left;
996 if (page != skb_frag_page(frag)) {
997 if (i == MAX_SKB_FRAGS) {
998 err = -EMSGSIZE;
999 goto error;
1000 }
1001 skb_fill_page_desc(skb, i, page, off, 0);
1002 skb_frag_ref(skb, i);
1003 frag = &skb_shinfo(skb)->frags[i];
1004 }
1005 } else if (i < MAX_SKB_FRAGS) {
1006 if (copy > PAGE_SIZE)
1007 copy = PAGE_SIZE;
1008 page = alloc_pages(sk->sk_allocation, 0);
1009 if (page == NULL) {
1010 err = -ENOMEM;
1011 goto error;
1012 }
1013 cork->page = page;
1014 cork->off = 0;
1015 991
1016 skb_fill_page_desc(skb, i, page, 0, 0); 992 err = -ENOMEM;
1017 frag = &skb_shinfo(skb)->frags[i]; 993 if (!sk_page_frag_refill(sk, pfrag))
1018 } else {
1019 err = -EMSGSIZE;
1020 goto error;
1021 }
1022 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1023 offset, copy, skb->len, skb) < 0) {
1024 err = -EFAULT;
1025 goto error; 994 goto error;
995
996 if (!skb_can_coalesce(skb, i, pfrag->page,
997 pfrag->offset)) {
998 err = -EMSGSIZE;
999 if (i == MAX_SKB_FRAGS)
1000 goto error;
1001
1002 __skb_fill_page_desc(skb, i, pfrag->page,
1003 pfrag->offset, 0);
1004 skb_shinfo(skb)->nr_frags = ++i;
1005 get_page(pfrag->page);
1026 } 1006 }
1027 cork->off += copy; 1007 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1028 skb_frag_size_add(frag, copy); 1008 if (getfrag(from,
1009 page_address(pfrag->page) + pfrag->offset,
1010 offset, copy, skb->len, skb) < 0)
1011 goto error_efault;
1012
1013 pfrag->offset += copy;
1014 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1029 skb->len += copy; 1015 skb->len += copy;
1030 skb->data_len += copy; 1016 skb->data_len += copy;
1031 skb->truesize += copy; 1017 skb->truesize += copy;
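
The per-cork page bookkeeping (cork->page/cork->off) is replaced by a reusable struct page_frag: sk_page_frag() is assumed to pick current->task_frag for sockets that may sleep and sk->sk_frag otherwise, and sk_page_frag_refill() tops the fragment up on demand. The resulting append pattern, sketched from the hunk above:

	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))	/* guarantee free room */
		return -ENOMEM;

	if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) {
		/* cannot extend the last fragment: open a new, empty one */
		__skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0);
		skb_shinfo(skb)->nr_frags = ++i;
		get_page(pfrag->page);
	}
	/* either way the data lands at pfrag->offset, growing frag i-1 */
	pfrag->offset += copy;
	skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
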
@@ -1037,6 +1023,8 @@ alloc_new_skb:
1037 1023
1038 return 0; 1024 return 0;
1039 1025
1026error_efault:
1027 err = -EFAULT;
1040error: 1028error:
1041 cork->length -= length; 1029 cork->length -= length;
1042 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1030 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1077,8 +1065,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1077 cork->dst = &rt->dst; 1065 cork->dst = &rt->dst;
1078 cork->length = 0; 1066 cork->length = 0;
1079 cork->tx_flags = ipc->tx_flags; 1067 cork->tx_flags = ipc->tx_flags;
1080 cork->page = NULL;
1081 cork->off = 0;
1082 1068
1083 return 0; 1069 return 0;
1084} 1070}
@@ -1115,7 +1101,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1115 transhdrlen = 0; 1101 transhdrlen = 0;
1116 } 1102 }
1117 1103
1118 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, 1104 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1105 sk_page_frag(sk), getfrag,
1119 from, length, transhdrlen, flags); 1106 from, length, transhdrlen, flags);
1120} 1107}
1121 1108
@@ -1437,7 +1424,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
1437 if (err) 1424 if (err)
1438 return ERR_PTR(err); 1425 return ERR_PTR(err);
1439 1426
1440 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, 1427 err = __ip_append_data(sk, fl4, &queue, &cork,
1428 &current->task_frag, getfrag,
1441 from, length, transhdrlen, flags); 1429 from, length, transhdrlen, flags);
1442 if (err) { 1430 if (err) {
1443 __ip_flush_pending_frames(sk, &queue, &cork); 1431 __ip_flush_pending_frames(sk, &queue, &cork);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3511ffba7bd4..978bca4818ae 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -304,7 +304,6 @@ static int vti_err(struct sk_buff *skb, u32 info)
304 304
305 err = -ENOENT; 305 err = -ENOENT;
306 306
307 rcu_read_lock();
308 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 307 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
309 if (t == NULL) 308 if (t == NULL)
310 goto out; 309 goto out;
@@ -326,7 +325,6 @@ static int vti_err(struct sk_buff *skb, u32 info)
326 t->err_count = 1; 325 t->err_count = 1;
327 t->err_time = jiffies; 326 t->err_time = jiffies;
328out: 327out:
329 rcu_read_unlock();
330 return err; 328 return err;
331} 329}
332 330
@@ -336,7 +334,6 @@ static int vti_rcv(struct sk_buff *skb)
336 struct ip_tunnel *tunnel; 334 struct ip_tunnel *tunnel;
337 const struct iphdr *iph = ip_hdr(skb); 335 const struct iphdr *iph = ip_hdr(skb);
338 336
339 rcu_read_lock();
340 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 337 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
341 if (tunnel != NULL) { 338 if (tunnel != NULL) {
342 struct pcpu_tstats *tstats; 339 struct pcpu_tstats *tstats;
@@ -348,10 +345,8 @@ static int vti_rcv(struct sk_buff *skb)
348 u64_stats_update_end(&tstats->syncp); 345 u64_stats_update_end(&tstats->syncp);
349 346
350 skb->dev = tunnel->dev; 347 skb->dev = tunnel->dev;
351 rcu_read_unlock();
352 return 1; 348 return 1;
353 } 349 }
354 rcu_read_unlock();
355 350
356 return -1; 351 return -1;
357} 352}
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 67e8a6b086ea..798358b10717 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -583,6 +583,17 @@ static void __init ic_rarp_send_if(struct ic_device *d)
583#endif 583#endif
584 584
585/* 585/*
586 * Predefine Nameservers
587 */
588static inline void __init ic_nameservers_predef(void)
589{
590 int i;
591
592 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
593 ic_nameservers[i] = NONE;
594}
595
596/*
586 * DHCP/BOOTP support. 597 * DHCP/BOOTP support.
587 */ 598 */
588 599
@@ -747,10 +758,7 @@ static void __init ic_bootp_init_ext(u8 *e)
747 */ 758 */
748static inline void __init ic_bootp_init(void) 759static inline void __init ic_bootp_init(void)
749{ 760{
750 int i; 761 ic_nameservers_predef();
751
752 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
753 ic_nameservers[i] = NONE;
754 762
755 dev_add_pack(&bootp_packet_type); 763 dev_add_pack(&bootp_packet_type);
756} 764}
@@ -1379,6 +1387,7 @@ static int __init ip_auto_config(void)
1379 int retries = CONF_OPEN_RETRIES; 1387 int retries = CONF_OPEN_RETRIES;
1380#endif 1388#endif
1381 int err; 1389 int err;
1390 unsigned int i;
1382 1391
1383#ifdef CONFIG_PROC_FS 1392#ifdef CONFIG_PROC_FS
1384 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); 1393 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1499,7 +1508,15 @@ static int __init ip_auto_config(void)
1499 &ic_servaddr, &root_server_addr, root_server_path); 1508 &ic_servaddr, &root_server_addr, root_server_path);
1500 if (ic_dev_mtu) 1509 if (ic_dev_mtu)
1501 pr_cont(", mtu=%d", ic_dev_mtu); 1510 pr_cont(", mtu=%d", ic_dev_mtu);
1502 pr_cont("\n"); 1511 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
1512 if (ic_nameservers[i] != NONE) {
1513 pr_info(" nameserver%u=%pI4",
1514 i, &ic_nameservers[i]);
1515 break;
1516 }
1517 for (i++; i < CONF_NAMESERVERS_MAX; i++)
1518 if (ic_nameservers[i] != NONE)
1519 pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]);
1503#endif /* !SILENT */ 1520#endif /* !SILENT */
1504 1521
1505 return 0; 1522 return 0;
@@ -1570,6 +1587,8 @@ static int __init ip_auto_config_setup(char *addrs)
1570 return 1; 1587 return 1;
1571 } 1588 }
1572 1589
1590 ic_nameservers_predef();
1591
1573 /* Parse string for static IP assignment. */ 1592 /* Parse string for static IP assignment. */
1574 ip = addrs; 1593 ip = addrs;
1575 while (ip && *ip) { 1594 while (ip && *ip) {
@@ -1613,6 +1632,20 @@ static int __init ip_auto_config_setup(char *addrs)
1613 ic_enable = 0; 1632 ic_enable = 0;
1614 } 1633 }
1615 break; 1634 break;
1635 case 7:
1636 if (CONF_NAMESERVERS_MAX >= 1) {
1637 ic_nameservers[0] = in_aton(ip);
1638 if (ic_nameservers[0] == ANY)
1639 ic_nameservers[0] = NONE;
1640 }
1641 break;
1642 case 8:
1643 if (CONF_NAMESERVERS_MAX >= 2) {
1644 ic_nameservers[1] = in_aton(ip);
1645 if (ic_nameservers[1] == ANY)
1646 ic_nameservers[1] = NONE;
1647 }
1648 break;
1616 } 1649 }
1617 } 1650 }
1618 ip = cp; 1651 ip = cp;
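
Cases 7 and 8 extend the ip= boot parameter with two nameserver fields after the autoconf field; "any" (matching ANY) or an empty field leaves the slot at NONE. An illustrative boot line with hypothetical addresses:

	ip=10.0.0.2:10.0.0.1:10.0.0.1:255.255.255.0:client:eth0:off:10.0.0.53:10.0.0.54

i.e. <client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:<dns0-ip>:<dns1-ip>.
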
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 99af1f0cc658..e15b45297c09 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -120,6 +120,10 @@
120#define HASH_SIZE 16 120#define HASH_SIZE 16
121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
122 122
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
123static int ipip_net_id __read_mostly; 127static int ipip_net_id __read_mostly;
124struct ipip_net { 128struct ipip_net {
125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; 129 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
@@ -365,8 +369,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
365 } 369 }
366 370
367 err = -ENOENT; 371 err = -ENOENT;
368
369 rcu_read_lock();
370 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 372 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
371 if (t == NULL) 373 if (t == NULL)
372 goto out; 374 goto out;
@@ -398,34 +400,22 @@ static int ipip_err(struct sk_buff *skb, u32 info)
398 t->err_count = 1; 400 t->err_count = 1;
399 t->err_time = jiffies; 401 t->err_time = jiffies;
400out: 402out:
401 rcu_read_unlock();
402 return err;
403}
404
405static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
406 struct sk_buff *skb)
407{
408 struct iphdr *inner_iph = ip_hdr(skb);
409 403
410 if (INET_ECN_is_ce(outer_iph->tos)) 404 return err;
411 IP_ECN_set_ce(inner_iph);
412} 405}
413 406
414static int ipip_rcv(struct sk_buff *skb) 407static int ipip_rcv(struct sk_buff *skb)
415{ 408{
416 struct ip_tunnel *tunnel; 409 struct ip_tunnel *tunnel;
417 const struct iphdr *iph = ip_hdr(skb); 410 const struct iphdr *iph = ip_hdr(skb);
411 int err;
418 412
419 rcu_read_lock();
420 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 413 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
421 if (tunnel != NULL) { 414 if (tunnel != NULL) {
422 struct pcpu_tstats *tstats; 415 struct pcpu_tstats *tstats;
423 416
424 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 417 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
425 rcu_read_unlock(); 418 goto drop;
426 kfree_skb(skb);
427 return 0;
428 }
429 419
430 secpath_reset(skb); 420 secpath_reset(skb);
431 421
@@ -434,24 +424,35 @@ static int ipip_rcv(struct sk_buff *skb)
434 skb->protocol = htons(ETH_P_IP); 424 skb->protocol = htons(ETH_P_IP);
435 skb->pkt_type = PACKET_HOST; 425 skb->pkt_type = PACKET_HOST;
436 426
427 __skb_tunnel_rx(skb, tunnel->dev);
428
429 err = IP_ECN_decapsulate(iph, skb);
430 if (unlikely(err)) {
431 if (log_ecn_error)
432 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
433 &iph->saddr, iph->tos);
434 if (err > 1) {
435 ++tunnel->dev->stats.rx_frame_errors;
436 ++tunnel->dev->stats.rx_errors;
437 goto drop;
438 }
439 }
440
437 tstats = this_cpu_ptr(tunnel->dev->tstats); 441 tstats = this_cpu_ptr(tunnel->dev->tstats);
438 u64_stats_update_begin(&tstats->syncp); 442 u64_stats_update_begin(&tstats->syncp);
439 tstats->rx_packets++; 443 tstats->rx_packets++;
440 tstats->rx_bytes += skb->len; 444 tstats->rx_bytes += skb->len;
441 u64_stats_update_end(&tstats->syncp); 445 u64_stats_update_end(&tstats->syncp);
442 446
443 __skb_tunnel_rx(skb, tunnel->dev);
444
445 ipip_ecn_decapsulate(iph, skb);
446
447 netif_rx(skb); 447 netif_rx(skb);
448
449 rcu_read_unlock();
450 return 0; 448 return 0;
451 } 449 }
452 rcu_read_unlock();
453 450
454 return -1; 451 return -1;
452
453drop:
454 kfree_skb(skb);
455 return 0;
455} 456}
456 457
457/* 458/*
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ebdf06f938bf..1daa95c2a0ba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -626,7 +626,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
626 e->error = -ETIMEDOUT; 626 e->error = -ETIMEDOUT;
627 memset(&e->msg, 0, sizeof(e->msg)); 627 memset(&e->msg, 0, sizeof(e->msg));
628 628
629 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 629 rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
630 } else { 630 } else {
631 kfree_skb(skb); 631 kfree_skb(skb);
632 } 632 }
@@ -870,7 +870,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
870 memset(&e->msg, 0, sizeof(e->msg)); 870 memset(&e->msg, 0, sizeof(e->msg));
871 } 871 }
872 872
873 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 873 rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
874 } else { 874 } else {
875 ip_mr_forward(net, mrt, skb, c, 0); 875 ip_mr_forward(net, mrt, skb, c, 0);
876 } 876 }
@@ -1808,7 +1808,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1808 .flowi4_oif = (rt_is_output_route(rt) ? 1808 .flowi4_oif = (rt_is_output_route(rt) ?
1809 skb->dev->ifindex : 0), 1809 skb->dev->ifindex : 0),
1810 .flowi4_iif = (rt_is_output_route(rt) ? 1810 .flowi4_iif = (rt_is_output_route(rt) ?
1811 net->loopback_dev->ifindex : 1811 LOOPBACK_IFINDEX :
1812 skb->dev->ifindex), 1812 skb->dev->ifindex),
1813 .flowi4_mark = skb->mark, 1813 .flowi4_mark = skb->mark,
1814 }; 1814 };
@@ -2117,12 +2117,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
2117} 2117}
2118 2118
2119static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 2119static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2120 u32 pid, u32 seq, struct mfc_cache *c) 2120 u32 portid, u32 seq, struct mfc_cache *c)
2121{ 2121{
2122 struct nlmsghdr *nlh; 2122 struct nlmsghdr *nlh;
2123 struct rtmsg *rtm; 2123 struct rtmsg *rtm;
2124 2124
2125 nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); 2125 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2126 if (nlh == NULL) 2126 if (nlh == NULL)
2127 return -EMSGSIZE; 2127 return -EMSGSIZE;
2128 2128
@@ -2176,7 +2176,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2176 if (e < s_e) 2176 if (e < s_e)
2177 goto next_entry; 2177 goto next_entry;
2178 if (ipmr_fill_mroute(mrt, skb, 2178 if (ipmr_fill_mroute(mrt, skb,
2179 NETLINK_CB(cb->skb).pid, 2179 NETLINK_CB(cb->skb).portid,
2180 cb->nlh->nlmsg_seq, 2180 cb->nlh->nlmsg_seq,
2181 mfc) < 0) 2181 mfc) < 0)
2182 goto done; 2182 goto done;
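
The pid -> portid renames here are part of a tree-wide cleanup: the value stored in the netlink control block is the sender's netlink port id (the socket's bound address, which equals the process id only by autobind convention), not a task pid. Illustration only:

	/* reply to whichever socket issued the dump request */
	u32 portid = NETLINK_CB(cb->skb).portid;

	nlh = nlmsg_put(skb, portid, cb->nlh->nlmsg_seq,
			RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
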
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index ed1b36783192..4c0cf63dd92e 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -72,43 +72,6 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
72} 72}
73EXPORT_SYMBOL(ip_route_me_harder); 73EXPORT_SYMBOL(ip_route_me_harder);
74 74
75#ifdef CONFIG_XFRM
76int ip_xfrm_me_harder(struct sk_buff *skb)
77{
78 struct flowi fl;
79 unsigned int hh_len;
80 struct dst_entry *dst;
81
82 if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
83 return 0;
84 if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
85 return -1;
86
87 dst = skb_dst(skb);
88 if (dst->xfrm)
89 dst = ((struct xfrm_dst *)dst)->route;
90 dst_hold(dst);
91
92 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
93 if (IS_ERR(dst))
94 return -1;
95
96 skb_dst_drop(skb);
97 skb_dst_set(skb, dst);
98
99 /* Change in oif may mean change in hh_len. */
100 hh_len = skb_dst(skb)->dev->hard_header_len;
101 if (skb_headroom(skb) < hh_len &&
102 pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
103 return -1;
104 return 0;
105}
106EXPORT_SYMBOL(ip_xfrm_me_harder);
107#endif
108
109void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
110EXPORT_SYMBOL(ip_nat_decode_session);
111
112/* 75/*
113 * Extra routing may needed on local out, as the QUEUE target never 76 * Extra routing may needed on local out, as the QUEUE target never
114 * returns control to the table. 77 * returns control to the table.
@@ -225,12 +188,12 @@ static const struct nf_afinfo nf_ip_afinfo = {
225 .route_key_size = sizeof(struct ip_rt_info), 188 .route_key_size = sizeof(struct ip_rt_info),
226}; 189};
227 190
228static int ipv4_netfilter_init(void) 191static int __init ipv4_netfilter_init(void)
229{ 192{
230 return nf_register_afinfo(&nf_ip_afinfo); 193 return nf_register_afinfo(&nf_ip_afinfo);
231} 194}
232 195
233static void ipv4_netfilter_fini(void) 196static void __exit ipv4_netfilter_fini(void)
234{ 197{
235 nf_unregister_afinfo(&nf_ip_afinfo); 198 nf_unregister_afinfo(&nf_ip_afinfo);
236} 199}
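
ip_xfrm_me_harder() is not lost: it moves, together with the NAT decode_session hook, into the family-independent NAT core. Judging from the call sites in iptable_nat.c further down, the shared replacement takes the address family as a parameter; assumed prototype:

	/* assumed prototype of the shared helper */
	int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family);

	/* IPv4 call site, as used by the NAT hooks below */
	if (nf_xfrm_me_harder(skb, AF_INET) < 0)
		ret = NF_DROP;
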
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index fcc543cd987a..d8d6f2a5bf12 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -143,25 +143,22 @@ config IP_NF_TARGET_ULOG
143 To compile it as a module, choose M here. If unsure, say N. 143 To compile it as a module, choose M here. If unsure, say N.
144 144
145# NAT + specific targets: nf_conntrack 145# NAT + specific targets: nf_conntrack
146config NF_NAT 146config NF_NAT_IPV4
147 tristate "Full NAT" 147 tristate "IPv4 NAT"
148 depends on NF_CONNTRACK_IPV4 148 depends on NF_CONNTRACK_IPV4
149 default m if NETFILTER_ADVANCED=n 149 default m if NETFILTER_ADVANCED=n
150 select NF_NAT
150 help 151 help
151 The Full NAT option allows masquerading, port forwarding and other 152 The IPv4 NAT option allows masquerading, port forwarding and other
152 forms of full Network Address Port Translation. It is controlled by 153 forms of full Network Address Port Translation. It is controlled by
153 the `nat' table in iptables: see the man page for iptables(8). 154 the `nat' table in iptables: see the man page for iptables(8).
154 155
155 To compile it as a module, choose M here. If unsure, say N. 156 To compile it as a module, choose M here. If unsure, say N.
156 157
157config NF_NAT_NEEDED 158if NF_NAT_IPV4
158 bool
159 depends on NF_NAT
160 default y
161 159
162config IP_NF_TARGET_MASQUERADE 160config IP_NF_TARGET_MASQUERADE
163 tristate "MASQUERADE target support" 161 tristate "MASQUERADE target support"
164 depends on NF_NAT
165 default m if NETFILTER_ADVANCED=n 162 default m if NETFILTER_ADVANCED=n
166 help 163 help
167 Masquerading is a special case of NAT: all outgoing connections are 164 Masquerading is a special case of NAT: all outgoing connections are
@@ -174,30 +171,27 @@ config IP_NF_TARGET_MASQUERADE
174 171
175config IP_NF_TARGET_NETMAP 172config IP_NF_TARGET_NETMAP
176 tristate "NETMAP target support" 173 tristate "NETMAP target support"
177 depends on NF_NAT
178 depends on NETFILTER_ADVANCED 174 depends on NETFILTER_ADVANCED
179 help 175 select NETFILTER_XT_TARGET_NETMAP
180 NETMAP is an implementation of static 1:1 NAT mapping of network 176 ---help---
181 addresses. It maps the network address part, while keeping the host 177 This is a backwards-compat option for the user's convenience
182 address part intact. 178 (e.g. when running oldconfig). It selects
183 179 CONFIG_NETFILTER_XT_TARGET_NETMAP.
184 To compile it as a module, choose M here. If unsure, say N.
185 180
186config IP_NF_TARGET_REDIRECT 181config IP_NF_TARGET_REDIRECT
187 tristate "REDIRECT target support" 182 tristate "REDIRECT target support"
188 depends on NF_NAT
189 depends on NETFILTER_ADVANCED 183 depends on NETFILTER_ADVANCED
190 help 184 select NETFILTER_XT_TARGET_REDIRECT
191 REDIRECT is a special case of NAT: all incoming connections are 185 ---help---
192 mapped onto the incoming interface's address, causing the packets to 186 This is a backwards-compat option for the user's convenience
193 come to the local machine instead of passing through. This is 187 (e.g. when running oldconfig). It selects
194 useful for transparent proxies. 188 CONFIG_NETFILTER_XT_TARGET_REDIRECT.
195 189
196 To compile it as a module, choose M here. If unsure, say N. 190endif
197 191
198config NF_NAT_SNMP_BASIC 192config NF_NAT_SNMP_BASIC
199 tristate "Basic SNMP-ALG support" 193 tristate "Basic SNMP-ALG support"
200 depends on NF_CONNTRACK_SNMP && NF_NAT 194 depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4
201 depends on NETFILTER_ADVANCED 195 depends on NETFILTER_ADVANCED
202 default NF_NAT && NF_CONNTRACK_SNMP 196 default NF_NAT && NF_CONNTRACK_SNMP
203 ---help--- 197 ---help---
@@ -219,61 +213,21 @@ config NF_NAT_SNMP_BASIC
219# <expr> '&&' <expr> (6) 213# <expr> '&&' <expr> (6)
220# 214#
221# (6) Returns the result of min(/expr/, /expr/). 215# (6) Returns the result of min(/expr/, /expr/).
222config NF_NAT_PROTO_DCCP
223 tristate
224 depends on NF_NAT && NF_CT_PROTO_DCCP
225 default NF_NAT && NF_CT_PROTO_DCCP
226 216
227config NF_NAT_PROTO_GRE 217config NF_NAT_PROTO_GRE
228 tristate 218 tristate
229 depends on NF_NAT && NF_CT_PROTO_GRE 219 depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
230
231config NF_NAT_PROTO_UDPLITE
232 tristate
233 depends on NF_NAT && NF_CT_PROTO_UDPLITE
234 default NF_NAT && NF_CT_PROTO_UDPLITE
235
236config NF_NAT_PROTO_SCTP
237 tristate
238 default NF_NAT && NF_CT_PROTO_SCTP
239 depends on NF_NAT && NF_CT_PROTO_SCTP
240 select LIBCRC32C
241
242config NF_NAT_FTP
243 tristate
244 depends on NF_CONNTRACK && NF_NAT
245 default NF_NAT && NF_CONNTRACK_FTP
246
247config NF_NAT_IRC
248 tristate
249 depends on NF_CONNTRACK && NF_NAT
250 default NF_NAT && NF_CONNTRACK_IRC
251
252config NF_NAT_TFTP
253 tristate
254 depends on NF_CONNTRACK && NF_NAT
255 default NF_NAT && NF_CONNTRACK_TFTP
256
257config NF_NAT_AMANDA
258 tristate
259 depends on NF_CONNTRACK && NF_NAT
260 default NF_NAT && NF_CONNTRACK_AMANDA
261 220
262config NF_NAT_PPTP 221config NF_NAT_PPTP
263 tristate 222 tristate
264 depends on NF_CONNTRACK && NF_NAT 223 depends on NF_CONNTRACK && NF_NAT_IPV4
265 default NF_NAT && NF_CONNTRACK_PPTP 224 default NF_NAT_IPV4 && NF_CONNTRACK_PPTP
266 select NF_NAT_PROTO_GRE 225 select NF_NAT_PROTO_GRE
267 226
268config NF_NAT_H323 227config NF_NAT_H323
269 tristate 228 tristate
270 depends on NF_CONNTRACK && NF_NAT 229 depends on NF_CONNTRACK && NF_NAT_IPV4
271 default NF_NAT && NF_CONNTRACK_H323 230 default NF_NAT_IPV4 && NF_CONNTRACK_H323
272
273config NF_NAT_SIP
274 tristate
275 depends on NF_CONNTRACK && NF_NAT
276 default NF_NAT && NF_CONNTRACK_SIP
277 231
278# mangle + specific targets 232# mangle + specific targets
279config IP_NF_MANGLE 233config IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index c20674dc9452..007b128eecc9 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,32 +10,22 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15
16# connection tracking 13# connection tracking
17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 14obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
18 15
19obj-$(CONFIG_NF_NAT) += nf_nat.o 16nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
17obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
20 18
21# defrag 19# defrag
22obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
23 21
24# NAT helpers (nf_conntrack) 22# NAT helpers (nf_conntrack)
25obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
26obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
27obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o 23obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
28obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
29obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o 24obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
30obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
31obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o 25obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
32obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
33 26
34# NAT protocols (nf_nat) 27# NAT protocols (nf_nat)
35obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
36obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o 28obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
37obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
38obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
39 29
40# generic IP tables 30# generic IP tables
41obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o 31obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
@@ -43,7 +33,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
43# the three instances of ip_tables 33# the three instances of ip_tables
44obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o 34obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
45obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o 35obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
46obj-$(CONFIG_NF_NAT) += iptable_nat.o 36obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o
47obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o 37obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 38obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 39
@@ -55,8 +45,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
55obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 45obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
56obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o 46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
57obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o 47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
58obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
59obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
60obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
61obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 49obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
62 50
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index cbb6a1a6f6f7..5d5d4d1be9c2 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -19,9 +19,9 @@
19#include <net/ip.h> 19#include <net/ip.h>
20#include <net/checksum.h> 20#include <net/checksum.h>
21#include <net/route.h> 21#include <net/route.h>
22#include <net/netfilter/nf_nat_rule.h>
23#include <linux/netfilter_ipv4.h> 22#include <linux/netfilter_ipv4.h>
24#include <linux/netfilter/x_tables.h> 23#include <linux/netfilter/x_tables.h>
24#include <net/netfilter/nf_nat.h>
25 25
26MODULE_LICENSE("GPL"); 26MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -49,7 +49,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
49 struct nf_conn *ct; 49 struct nf_conn *ct;
50 struct nf_conn_nat *nat; 50 struct nf_conn_nat *nat;
51 enum ip_conntrack_info ctinfo; 51 enum ip_conntrack_info ctinfo;
52 struct nf_nat_ipv4_range newrange; 52 struct nf_nat_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 53 const struct nf_nat_ipv4_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc, nh; 55 __be32 newsrc, nh;
@@ -80,10 +80,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
80 nat->masq_index = par->out->ifindex; 80 nat->masq_index = par->out->ifindex;
81 81
82 /* Transfer from original range. */ 82 /* Transfer from original range. */
83 newrange = ((struct nf_nat_ipv4_range) 83 memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
84 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS, 84 memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
85 newsrc, newsrc, 85 newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
86 mr->range[0].min, mr->range[0].max }); 86 newrange.min_addr.ip = newsrc;
87 newrange.max_addr.ip = newsrc;
88 newrange.min_proto = mr->range[0].min;
89 newrange.max_proto = mr->range[0].max;
87 90
88 /* Hand modified range to generic setup. */ 91 /* Hand modified range to generic setup. */
89 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); 92 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
@@ -96,7 +99,8 @@ device_cmp(struct nf_conn *i, void *ifindex)
96 99
97 if (!nat) 100 if (!nat)
98 return 0; 101 return 0;
99 102 if (nf_ct_l3num(i) != NFPROTO_IPV4)
103 return 0;
100 return nat->masq_index == (int)(long)ifindex; 104 return nat->masq_index == (int)(long)ifindex;
101} 105}
102 106
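
The explicit memsets are needed because the target now fills the family-independent struct nf_nat_range, whose addresses are unions covering IPv4 and IPv6, instead of the IPv4-only compat layout it still accepts from userspace. Assumed shape of the new structure:

	/* sketch of the family-agnostic range */
	struct nf_nat_range {
		unsigned int			flags;
		union nf_inet_addr		min_addr;
		union nf_inet_addr		max_addr;
		union nf_conntrack_man_proto	min_proto;
		union nf_conntrack_man_proto	max_proto;
	};
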
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
deleted file mode 100644
index b5bfbbabf70d..000000000000
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ /dev/null
@@ -1,98 +0,0 @@
1/* NETMAP - static NAT mapping of IP network addresses (1:1).
2 * The mapping can be applied to source (POSTROUTING),
3 * destination (PREROUTING), or both (with separate rules).
4 */
5
6/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/ip.h>
14#include <linux/module.h>
15#include <linux/netdevice.h>
16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h>
18#include <linux/netfilter/x_tables.h>
19#include <net/netfilter/nf_nat_rule.h>
20
21MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
24
25static int netmap_tg_check(const struct xt_tgchk_param *par)
26{
27 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
28
29 if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) {
30 pr_debug("bad MAP_IPS.\n");
31 return -EINVAL;
32 }
33 if (mr->rangesize != 1) {
34 pr_debug("bad rangesize %u.\n", mr->rangesize);
35 return -EINVAL;
36 }
37 return 0;
38}
39
40static unsigned int
41netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
42{
43 struct nf_conn *ct;
44 enum ip_conntrack_info ctinfo;
45 __be32 new_ip, netmask;
46 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
47 struct nf_nat_ipv4_range newrange;
48
49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
50 par->hooknum == NF_INET_POST_ROUTING ||
51 par->hooknum == NF_INET_LOCAL_OUT ||
52 par->hooknum == NF_INET_LOCAL_IN);
53 ct = nf_ct_get(skb, &ctinfo);
54
55 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
56
57 if (par->hooknum == NF_INET_PRE_ROUTING ||
58 par->hooknum == NF_INET_LOCAL_OUT)
59 new_ip = ip_hdr(skb)->daddr & ~netmask;
60 else
61 new_ip = ip_hdr(skb)->saddr & ~netmask;
62 new_ip |= mr->range[0].min_ip & netmask;
63
64 newrange = ((struct nf_nat_ipv4_range)
65 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
66 new_ip, new_ip,
67 mr->range[0].min, mr->range[0].max });
68
69 /* Hand modified range to generic setup. */
70 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
71}
72
73static struct xt_target netmap_tg_reg __read_mostly = {
74 .name = "NETMAP",
75 .family = NFPROTO_IPV4,
76 .target = netmap_tg,
77 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
78 .table = "nat",
79 .hooks = (1 << NF_INET_PRE_ROUTING) |
80 (1 << NF_INET_POST_ROUTING) |
81 (1 << NF_INET_LOCAL_OUT) |
82 (1 << NF_INET_LOCAL_IN),
83 .checkentry = netmap_tg_check,
84 .me = THIS_MODULE
85};
86
87static int __init netmap_tg_init(void)
88{
89 return xt_register_target(&netmap_tg_reg);
90}
91
92static void __exit netmap_tg_exit(void)
93{
94 xt_unregister_target(&netmap_tg_reg);
95}
96
97module_init(netmap_tg_init);
98module_exit(netmap_tg_exit);
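
The target itself survives as the family-independent xtables NETMAP module selected by the Kconfig compat option above, so existing rulesets keep loading. A typical rule, for illustration:

	iptables -t nat -A PREROUTING -d 192.0.2.0/24 -j NETMAP --to 198.51.100.0/24
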
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
deleted file mode 100644
index 7c0103a5203e..000000000000
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ /dev/null
@@ -1,110 +0,0 @@
1/* Redirect. Simple mapping which alters dst to a local IP address. */
2/* (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/types.h>
11#include <linux/ip.h>
12#include <linux/timer.h>
13#include <linux/module.h>
14#include <linux/netfilter.h>
15#include <linux/netdevice.h>
16#include <linux/if.h>
17#include <linux/inetdevice.h>
18#include <net/protocol.h>
19#include <net/checksum.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/netfilter/x_tables.h>
22#include <net/netfilter/nf_nat_rule.h>
23
24MODULE_LICENSE("GPL");
25MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
26MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
27
28/* FIXME: Take multiple ranges --RR */
29static int redirect_tg_check(const struct xt_tgchk_param *par)
30{
31 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
32
33 if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
34 pr_debug("bad MAP_IPS.\n");
35 return -EINVAL;
36 }
37 if (mr->rangesize != 1) {
38 pr_debug("bad rangesize %u.\n", mr->rangesize);
39 return -EINVAL;
40 }
41 return 0;
42}
43
44static unsigned int
45redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
46{
47 struct nf_conn *ct;
48 enum ip_conntrack_info ctinfo;
49 __be32 newdst;
50 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
51 struct nf_nat_ipv4_range newrange;
52
53 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
54 par->hooknum == NF_INET_LOCAL_OUT);
55
56 ct = nf_ct_get(skb, &ctinfo);
57 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
58
59 /* Local packets: make them go to loopback */
60 if (par->hooknum == NF_INET_LOCAL_OUT)
61 newdst = htonl(0x7F000001);
62 else {
63 struct in_device *indev;
64 struct in_ifaddr *ifa;
65
66 newdst = 0;
67
68 rcu_read_lock();
69 indev = __in_dev_get_rcu(skb->dev);
70 if (indev && (ifa = indev->ifa_list))
71 newdst = ifa->ifa_local;
72 rcu_read_unlock();
73
74 if (!newdst)
75 return NF_DROP;
76 }
77
78 /* Transfer from original range. */
79 newrange = ((struct nf_nat_ipv4_range)
80 { mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
81 newdst, newdst,
82 mr->range[0].min, mr->range[0].max });
83
84 /* Hand modified range to generic setup. */
85 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
86}
87
88static struct xt_target redirect_tg_reg __read_mostly = {
89 .name = "REDIRECT",
90 .family = NFPROTO_IPV4,
91 .target = redirect_tg,
92 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
93 .table = "nat",
94 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
95 .checkentry = redirect_tg_check,
96 .me = THIS_MODULE,
97};
98
99static int __init redirect_tg_init(void)
100{
101 return xt_register_target(&redirect_tg_reg);
102}
103
104static void __exit redirect_tg_exit(void)
105{
106 xt_unregister_target(&redirect_tg_reg);
107}
108
109module_init(redirect_tg_init);
110module_exit(redirect_tg_exit);
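
REDIRECT likewise moves to the shared xtables target. The classic transparent-proxy use, for illustration:

	iptables -t nat -A PREROUTING -p tcp --dport 80 -j REDIRECT --to-ports 3128
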
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 1109f7f6c254..b5ef3cba2250 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -396,8 +396,7 @@ static int __init ulog_tg_init(void)
396 for (i = 0; i < ULOG_MAXNLGROUPS; i++) 396 for (i = 0; i < ULOG_MAXNLGROUPS; i++)
397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i); 397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
398 398
399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, 399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg);
400 THIS_MODULE, &cfg);
401 if (!nflognl) 400 if (!nflognl)
402 return -ENOMEM; 401 return -ENOMEM;
403 402
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 31371be8174b..c30130062cd6 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
85 return ipv4_is_local_multicast(iph->daddr) ^ invert; 85 return ipv4_is_local_multicast(iph->daddr) ^ invert;
86 flow.flowi4_iif = 0; 86 flow.flowi4_iif = 0;
87 } else { 87 } else {
88 flow.flowi4_iif = dev_net(par->in)->loopback_dev->ifindex; 88 flow.flowi4_iif = LOOPBACK_IFINDEX;
89 } 89 }
90 90
91 flow.daddr = iph->saddr; 91 flow.daddr = iph->saddr;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 851acec852d2..6b3da5cf54e9 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -69,9 +69,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
69 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
70 ipt_register_table(net, &packet_filter, repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl); 71 kfree(repl);
72 if (IS_ERR(net->ipv4.iptable_filter)) 72 return PTR_RET(net->ipv4.iptable_filter);
73 return PTR_ERR(net->ipv4.iptable_filter);
74 return 0;
75} 73}
76 74
77static void __net_exit iptable_filter_net_exit(struct net *net) 75static void __net_exit iptable_filter_net_exit(struct net *net)
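
PTR_RET() is shorthand for exactly the IS_ERR()/PTR_ERR()/0 sequence being deleted (later kernels rename it PTR_ERR_OR_ZERO). Its assumed definition:

	static inline int PTR_RET(const void *ptr)
	{
		return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
	}
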
@@ -96,14 +94,10 @@ static int __init iptable_filter_init(void)
96 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); 94 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
97 if (IS_ERR(filter_ops)) { 95 if (IS_ERR(filter_ops)) {
98 ret = PTR_ERR(filter_ops); 96 ret = PTR_ERR(filter_ops);
99 goto cleanup_table; 97 unregister_pernet_subsys(&iptable_filter_net_ops);
100 } 98 }
101 99
102 return ret; 100 return ret;
103
104 cleanup_table:
105 unregister_pernet_subsys(&iptable_filter_net_ops);
106 return ret;
107} 101}
108 102
109static void __exit iptable_filter_fini(void) 103static void __exit iptable_filter_fini(void)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index aef5d1fbe77d..85d88f206447 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -104,9 +104,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
104 net->ipv4.iptable_mangle = 104 net->ipv4.iptable_mangle =
105 ipt_register_table(net, &packet_mangler, repl); 105 ipt_register_table(net, &packet_mangler, repl);
106 kfree(repl); 106 kfree(repl);
107 if (IS_ERR(net->ipv4.iptable_mangle)) 107 return PTR_RET(net->ipv4.iptable_mangle);
108 return PTR_ERR(net->ipv4.iptable_mangle);
109 return 0;
110} 108}
111 109
112static void __net_exit iptable_mangle_net_exit(struct net *net) 110static void __net_exit iptable_mangle_net_exit(struct net *net)
@@ -131,14 +129,10 @@ static int __init iptable_mangle_init(void)
131 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); 129 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
132 if (IS_ERR(mangle_ops)) { 130 if (IS_ERR(mangle_ops)) {
133 ret = PTR_ERR(mangle_ops); 131 ret = PTR_ERR(mangle_ops);
134 goto cleanup_table; 132 unregister_pernet_subsys(&iptable_mangle_net_ops);
135 } 133 }
136 134
137 return ret; 135 return ret;
138
139 cleanup_table:
140 unregister_pernet_subsys(&iptable_mangle_net_ops);
141 return ret;
142} 136}
143 137
144static void __exit iptable_mangle_fini(void) 138static void __exit iptable_mangle_fini(void)
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/iptable_nat.c
index 3828a4229822..9e0ffaf1d942 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -1,84 +1,71 @@
1/* (C) 1999-2001 Paul `Rusty' Russell 1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2011 Patrick McHardy <kaber@trash.net>
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
7 */ 8 */
8#include <linux/types.h> 9
9#include <linux/icmp.h> 10#include <linux/module.h>
10#include <linux/gfp.h>
11#include <linux/ip.h>
12#include <linux/netfilter.h> 11#include <linux/netfilter.h>
13#include <linux/netfilter_ipv4.h> 12#include <linux/netfilter_ipv4.h>
14#include <linux/module.h> 13#include <linux/netfilter_ipv4/ip_tables.h>
15#include <linux/skbuff.h> 14#include <linux/ip.h>
16#include <linux/proc_fs.h>
17#include <net/ip.h> 15#include <net/ip.h>
18#include <net/checksum.h>
19#include <linux/spinlock.h>
20 16
21#include <net/netfilter/nf_conntrack.h>
22#include <net/netfilter/nf_conntrack_core.h>
23#include <net/netfilter/nf_conntrack_extend.h>
24#include <net/netfilter/nf_nat.h> 17#include <net/netfilter/nf_nat.h>
25#include <net/netfilter/nf_nat_rule.h>
26#include <net/netfilter/nf_nat_protocol.h>
27#include <net/netfilter/nf_nat_core.h> 18#include <net/netfilter/nf_nat_core.h>
28#include <net/netfilter/nf_nat_helper.h> 19#include <net/netfilter/nf_nat_l3proto.h>
29#include <linux/netfilter_ipv4/ip_tables.h> 20
21static const struct xt_table nf_nat_ipv4_table = {
22 .name = "nat",
23 .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
24 (1 << NF_INET_POST_ROUTING) |
25 (1 << NF_INET_LOCAL_OUT) |
26 (1 << NF_INET_LOCAL_IN),
27 .me = THIS_MODULE,
28 .af = NFPROTO_IPV4,
29};
30 30
31#ifdef CONFIG_XFRM 31static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
33{ 32{
34 struct flowi4 *fl4 = &fl->u.ip4; 33 /* Force range to this IP; let proto decide mapping for
35 const struct nf_conn *ct; 34 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
36 const struct nf_conntrack_tuple *t; 35 */
37 enum ip_conntrack_info ctinfo; 36 struct nf_nat_range range;
38 enum ip_conntrack_dir dir; 37
39 unsigned long statusbit; 38 range.flags = 0;
40 39 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
41 ct = nf_ct_get(skb, &ctinfo); 40 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
42 if (ct == NULL) 41 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
43 return; 42 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
44 dir = CTINFO2DIR(ctinfo); 43
45 t = &ct->tuplehash[dir].tuple; 44 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
46 45}
47 if (dir == IP_CT_DIR_ORIGINAL)
48 statusbit = IPS_DST_NAT;
49 else
50 statusbit = IPS_SRC_NAT;
51
52 if (ct->status & statusbit) {
53 fl4->daddr = t->dst.u3.ip;
54 if (t->dst.protonum == IPPROTO_TCP ||
55 t->dst.protonum == IPPROTO_UDP ||
56 t->dst.protonum == IPPROTO_UDPLITE ||
57 t->dst.protonum == IPPROTO_DCCP ||
58 t->dst.protonum == IPPROTO_SCTP)
59 fl4->fl4_dport = t->dst.u.tcp.port;
60 }
61 46
62 statusbit ^= IPS_NAT_MASK; 47static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
48 const struct net_device *in,
49 const struct net_device *out,
50 struct nf_conn *ct)
51{
52 struct net *net = nf_ct_net(ct);
53 unsigned int ret;
63 54
64 if (ct->status & statusbit) { 55 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
65 fl4->saddr = t->src.u3.ip; 56 if (ret == NF_ACCEPT) {
66 if (t->dst.protonum == IPPROTO_TCP || 57 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
67 t->dst.protonum == IPPROTO_UDP || 58 ret = alloc_null_binding(ct, hooknum);
68 t->dst.protonum == IPPROTO_UDPLITE ||
69 t->dst.protonum == IPPROTO_DCCP ||
70 t->dst.protonum == IPPROTO_SCTP)
71 fl4->fl4_sport = t->src.u.tcp.port;
72 } 59 }
60 return ret;
73} 61}
74#endif
75 62
76static unsigned int 63static unsigned int
77nf_nat_fn(unsigned int hooknum, 64nf_nat_ipv4_fn(unsigned int hooknum,
78 struct sk_buff *skb, 65 struct sk_buff *skb,
79 const struct net_device *in, 66 const struct net_device *in,
80 const struct net_device *out, 67 const struct net_device *out,
81 int (*okfn)(struct sk_buff *)) 68 int (*okfn)(struct sk_buff *))
82{ 69{
83 struct nf_conn *ct; 70 struct nf_conn *ct;
84 enum ip_conntrack_info ctinfo; 71 enum ip_conntrack_info ctinfo;
@@ -87,14 +74,16 @@ nf_nat_fn(unsigned int hooknum,
87 enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); 74 enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
88 75
89 /* We never see fragments: conntrack defrags on pre-routing 76 /* We never see fragments: conntrack defrags on pre-routing
90 and local-out, and nf_nat_out protects post-routing. */ 77 * and local-out, and nf_nat_out protects post-routing.
78 */
91 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); 79 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
92 80
93 ct = nf_ct_get(skb, &ctinfo); 81 ct = nf_ct_get(skb, &ctinfo);
94 /* Can't track? It's not due to stress, or conntrack would 82 /* Can't track? It's not due to stress, or conntrack would
95 have dropped it. Hence it's the user's responsibilty to 83 * have dropped it. Hence it's the user's responsibilty to
96 packet filter it out, or implement conntrack/NAT for that 84 * packet filter it out, or implement conntrack/NAT for that
97 protocol. 8) --RR */ 85 * protocol. 8) --RR
86 */
98 if (!ct) 87 if (!ct)
99 return NF_ACCEPT; 88 return NF_ACCEPT;
100 89
@@ -118,17 +107,17 @@ nf_nat_fn(unsigned int hooknum,
118 case IP_CT_RELATED: 107 case IP_CT_RELATED:
119 case IP_CT_RELATED_REPLY: 108 case IP_CT_RELATED_REPLY:
120 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { 109 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
121 if (!nf_nat_icmp_reply_translation(ct, ctinfo, 110 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
122 hooknum, skb)) 111 hooknum))
123 return NF_DROP; 112 return NF_DROP;
124 else 113 else
125 return NF_ACCEPT; 114 return NF_ACCEPT;
126 } 115 }
127 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ 116 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
128 case IP_CT_NEW: 117 case IP_CT_NEW:
129
130 /* Seen it before? This can happen for loopback, retrans, 118 /* Seen it before? This can happen for loopback, retrans,
131 or local packets.. */ 119 * or local packets.
120 */
132 if (!nf_nat_initialized(ct, maniptype)) { 121 if (!nf_nat_initialized(ct, maniptype)) {
133 unsigned int ret; 122 unsigned int ret;
134 123
@@ -151,16 +140,16 @@ nf_nat_fn(unsigned int hooknum,
151} 140}
152 141
153static unsigned int 142static unsigned int
154nf_nat_in(unsigned int hooknum, 143nf_nat_ipv4_in(unsigned int hooknum,
155 struct sk_buff *skb, 144 struct sk_buff *skb,
156 const struct net_device *in, 145 const struct net_device *in,
157 const struct net_device *out, 146 const struct net_device *out,
158 int (*okfn)(struct sk_buff *)) 147 int (*okfn)(struct sk_buff *))
159{ 148{
160 unsigned int ret; 149 unsigned int ret;
161 __be32 daddr = ip_hdr(skb)->daddr; 150 __be32 daddr = ip_hdr(skb)->daddr;
162 151
163 ret = nf_nat_fn(hooknum, skb, in, out, okfn); 152 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
164 if (ret != NF_DROP && ret != NF_STOLEN && 153 if (ret != NF_DROP && ret != NF_STOLEN &&
165 daddr != ip_hdr(skb)->daddr) 154 daddr != ip_hdr(skb)->daddr)
166 skb_dst_drop(skb); 155 skb_dst_drop(skb);
@@ -169,11 +158,11 @@ nf_nat_in(unsigned int hooknum,
169} 158}
170 159
171static unsigned int 160static unsigned int
172nf_nat_out(unsigned int hooknum, 161nf_nat_ipv4_out(unsigned int hooknum,
173 struct sk_buff *skb, 162 struct sk_buff *skb,
174 const struct net_device *in, 163 const struct net_device *in,
175 const struct net_device *out, 164 const struct net_device *out,
176 int (*okfn)(struct sk_buff *)) 165 int (*okfn)(struct sk_buff *))
177{ 166{
178#ifdef CONFIG_XFRM 167#ifdef CONFIG_XFRM
179 const struct nf_conn *ct; 168 const struct nf_conn *ct;
@@ -186,29 +175,30 @@ nf_nat_out(unsigned int hooknum,
186 ip_hdrlen(skb) < sizeof(struct iphdr)) 175 ip_hdrlen(skb) < sizeof(struct iphdr))
187 return NF_ACCEPT; 176 return NF_ACCEPT;
188 177
189 ret = nf_nat_fn(hooknum, skb, in, out, okfn); 178 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
190#ifdef CONFIG_XFRM 179#ifdef CONFIG_XFRM
191 if (ret != NF_DROP && ret != NF_STOLEN && 180 if (ret != NF_DROP && ret != NF_STOLEN &&
181 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
192 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 182 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
193 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 183 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
194 184
195 if ((ct->tuplehash[dir].tuple.src.u3.ip != 185 if ((ct->tuplehash[dir].tuple.src.u3.ip !=
196 ct->tuplehash[!dir].tuple.dst.u3.ip) || 186 ct->tuplehash[!dir].tuple.dst.u3.ip) ||
197 (ct->tuplehash[dir].tuple.src.u.all != 187 (ct->tuplehash[dir].tuple.src.u.all !=
198 ct->tuplehash[!dir].tuple.dst.u.all) 188 ct->tuplehash[!dir].tuple.dst.u.all))
199 ) 189 if (nf_xfrm_me_harder(skb, AF_INET) < 0)
200 return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; 190 ret = NF_DROP;
201 } 191 }
202#endif 192#endif
203 return ret; 193 return ret;
204} 194}
205 195
206static unsigned int 196static unsigned int
207nf_nat_local_fn(unsigned int hooknum, 197nf_nat_ipv4_local_fn(unsigned int hooknum,
208 struct sk_buff *skb, 198 struct sk_buff *skb,
209 const struct net_device *in, 199 const struct net_device *in,
210 const struct net_device *out, 200 const struct net_device *out,
211 int (*okfn)(struct sk_buff *)) 201 int (*okfn)(struct sk_buff *))
212{ 202{
213 const struct nf_conn *ct; 203 const struct nf_conn *ct;
214 enum ip_conntrack_info ctinfo; 204 enum ip_conntrack_info ctinfo;
@@ -219,7 +209,7 @@ nf_nat_local_fn(unsigned int hooknum,
219 ip_hdrlen(skb) < sizeof(struct iphdr)) 209 ip_hdrlen(skb) < sizeof(struct iphdr))
220 return NF_ACCEPT; 210 return NF_ACCEPT;
221 211
222 ret = nf_nat_fn(hooknum, skb, in, out, okfn); 212 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
223 if (ret != NF_DROP && ret != NF_STOLEN && 213 if (ret != NF_DROP && ret != NF_STOLEN &&
224 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 214 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
225 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 215 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -230,21 +220,20 @@ nf_nat_local_fn(unsigned int hooknum,
230 ret = NF_DROP; 220 ret = NF_DROP;
231 } 221 }
232#ifdef CONFIG_XFRM 222#ifdef CONFIG_XFRM
233 else if (ct->tuplehash[dir].tuple.dst.u.all != 223 else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
224 ct->tuplehash[dir].tuple.dst.u.all !=
234 ct->tuplehash[!dir].tuple.src.u.all) 225 ct->tuplehash[!dir].tuple.src.u.all)
235 if (ip_xfrm_me_harder(skb)) 226 if (nf_xfrm_me_harder(skb, AF_INET) < 0)
236 ret = NF_DROP; 227 ret = NF_DROP;
237#endif 228#endif
238 } 229 }
239 return ret; 230 return ret;
240} 231}
241 232
242/* We must be after connection tracking and before packet filtering. */ 233static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
243
244static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
245 /* Before packet filtering, change destination */ 234 /* Before packet filtering, change destination */
246 { 235 {
247 .hook = nf_nat_in, 236 .hook = nf_nat_ipv4_in,
248 .owner = THIS_MODULE, 237 .owner = THIS_MODULE,
249 .pf = NFPROTO_IPV4, 238 .pf = NFPROTO_IPV4,
250 .hooknum = NF_INET_PRE_ROUTING, 239 .hooknum = NF_INET_PRE_ROUTING,
@@ -252,7 +241,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
252 }, 241 },
253 /* After packet filtering, change source */ 242 /* After packet filtering, change source */
254 { 243 {
255 .hook = nf_nat_out, 244 .hook = nf_nat_ipv4_out,
256 .owner = THIS_MODULE, 245 .owner = THIS_MODULE,
257 .pf = NFPROTO_IPV4, 246 .pf = NFPROTO_IPV4,
258 .hooknum = NF_INET_POST_ROUTING, 247 .hooknum = NF_INET_POST_ROUTING,
@@ -260,7 +249,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
260 }, 249 },
261 /* Before packet filtering, change destination */ 250 /* Before packet filtering, change destination */
262 { 251 {
263 .hook = nf_nat_local_fn, 252 .hook = nf_nat_ipv4_local_fn,
264 .owner = THIS_MODULE, 253 .owner = THIS_MODULE,
265 .pf = NFPROTO_IPV4, 254 .pf = NFPROTO_IPV4,
266 .hooknum = NF_INET_LOCAL_OUT, 255 .hooknum = NF_INET_LOCAL_OUT,
@@ -268,7 +257,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
268 }, 257 },
269 /* After packet filtering, change source */ 258 /* After packet filtering, change source */
270 { 259 {
271 .hook = nf_nat_fn, 260 .hook = nf_nat_ipv4_fn,
272 .owner = THIS_MODULE, 261 .owner = THIS_MODULE,
273 .pf = NFPROTO_IPV4, 262 .pf = NFPROTO_IPV4,
274 .hooknum = NF_INET_LOCAL_IN, 263 .hooknum = NF_INET_LOCAL_IN,
@@ -276,51 +265,56 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
276 }, 265 },
277}; 266};
278 267
279static int __init nf_nat_standalone_init(void) 268static int __net_init iptable_nat_net_init(struct net *net)
280{ 269{
281 int ret = 0; 270 struct ipt_replace *repl;
271
272 repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
273 if (repl == NULL)
274 return -ENOMEM;
275 net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
276 kfree(repl);
277 if (IS_ERR(net->ipv4.nat_table))
278 return PTR_ERR(net->ipv4.nat_table);
279 return 0;
280}
282 281
283 need_ipv4_conntrack(); 282static void __net_exit iptable_nat_net_exit(struct net *net)
283{
284 ipt_unregister_table(net, net->ipv4.nat_table);
285}
284 286
285#ifdef CONFIG_XFRM 287static struct pernet_operations iptable_nat_net_ops = {
286 BUG_ON(ip_nat_decode_session != NULL); 288 .init = iptable_nat_net_init,
287 RCU_INIT_POINTER(ip_nat_decode_session, nat_decode_session); 289 .exit = iptable_nat_net_exit,
288#endif 290};
289 ret = nf_nat_rule_init();
290 if (ret < 0) {
291 pr_err("nf_nat_init: can't setup rules.\n");
292 goto cleanup_decode_session;
293 }
294 ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
295 if (ret < 0) {
296 pr_err("nf_nat_init: can't register hooks.\n");
297 goto cleanup_rule_init;
298 }
299 return ret;
300 291
301 cleanup_rule_init: 292static int __init iptable_nat_init(void)
302 nf_nat_rule_cleanup(); 293{
303 cleanup_decode_session: 294 int err;
304#ifdef CONFIG_XFRM 295
305 RCU_INIT_POINTER(ip_nat_decode_session, NULL); 296 err = register_pernet_subsys(&iptable_nat_net_ops);
306 synchronize_net(); 297 if (err < 0)
307#endif 298 goto err1;
308 return ret; 299
300 err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
301 if (err < 0)
302 goto err2;
303 return 0;
304
305err2:
306 unregister_pernet_subsys(&iptable_nat_net_ops);
307err1:
308 return err;
309} 309}
310 310
311static void __exit nf_nat_standalone_fini(void) 311static void __exit iptable_nat_exit(void)
312{ 312{
313 nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); 313 nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
314 nf_nat_rule_cleanup(); 314 unregister_pernet_subsys(&iptable_nat_net_ops);
315#ifdef CONFIG_XFRM
316 RCU_INIT_POINTER(ip_nat_decode_session, NULL);
317 synchronize_net();
318#endif
319 /* Conntrack caches are unregistered in nf_conntrack_cleanup */
320} 315}
321 316
322module_init(nf_nat_standalone_init); 317module_init(iptable_nat_init);
323module_exit(nf_nat_standalone_fini); 318module_exit(iptable_nat_exit);
324 319
325MODULE_LICENSE("GPL"); 320MODULE_LICENSE("GPL");
326MODULE_ALIAS("ip_nat");
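The conversion above replaces the old standalone init with the kernel's standard pernet-table pattern: the NAT table is registered once per network namespace, the hooks once globally, and failures unwind in reverse order. A minimal sketch of that pattern, reusing the names from the hunks above (example_nat_init is a hypothetical stand-in; illustrative only, not a drop-in):

#include <linux/module.h>
#include <linux/netfilter.h>
#include <net/net_namespace.h>

/* sketch; example_nat_init stands in for iptable_nat_init above */
static int __init example_nat_init(void)
{
	int err;

	/* per-netns state first: each namespace gets its own table */
	err = register_pernet_subsys(&iptable_nat_net_ops);
	if (err < 0)
		return err;

	/* global hooks second; unwind the pernet state on failure */
	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
	if (err < 0)
		unregister_pernet_subsys(&iptable_nat_net_ops);
	return err;
}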
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 07fb710cd722..03d9696d3c6e 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -48,9 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
48 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
49 ipt_register_table(net, &packet_raw, repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl); 50 kfree(repl);
51 if (IS_ERR(net->ipv4.iptable_raw)) 51 return PTR_RET(net->ipv4.iptable_raw);
52 return PTR_ERR(net->ipv4.iptable_raw);
53 return 0;
54} 52}
55 53
56static void __net_exit iptable_raw_net_exit(struct net *net) 54static void __net_exit iptable_raw_net_exit(struct net *net)
@@ -75,14 +73,10 @@ static int __init iptable_raw_init(void)
75 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); 73 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
76 if (IS_ERR(rawtable_ops)) { 74 if (IS_ERR(rawtable_ops)) {
77 ret = PTR_ERR(rawtable_ops); 75 ret = PTR_ERR(rawtable_ops);
78 goto cleanup_table; 76 unregister_pernet_subsys(&iptable_raw_net_ops);
79 } 77 }
80 78
81 return ret; 79 return ret;
82
83 cleanup_table:
84 unregister_pernet_subsys(&iptable_raw_net_ops);
85 return ret;
86} 80}
87 81
88static void __exit iptable_raw_fini(void) 82static void __exit iptable_raw_fini(void)
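PTR_RET(), used here and in iptable_security below, folds the common IS_ERR()/PTR_ERR()/return 0 tail into a single call; it was later renamed PTR_ERR_OR_ZERO(). Semantically it computes what this sketch shows (ptr_ret_sketch is a hypothetical name):

#include <linux/err.h>

/* sketch: what PTR_RET(ptr) evaluates to */
static inline int ptr_ret_sketch(const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
	return 0;
}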
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index be45bdc4c602..b283d8e2601a 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -66,10 +66,7 @@ static int __net_init iptable_security_net_init(struct net *net)
66 net->ipv4.iptable_security = 66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl); 67 ipt_register_table(net, &security_table, repl);
68 kfree(repl); 68 kfree(repl);
69 if (IS_ERR(net->ipv4.iptable_security)) 69 return PTR_RET(net->ipv4.iptable_security);
70 return PTR_ERR(net->ipv4.iptable_security);
71
72 return 0;
73} 70}
74 71
75static void __net_exit iptable_security_net_exit(struct net *net) 72static void __net_exit iptable_security_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index e7ff2dcab6ce..fcdd0c2406e6 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -29,11 +29,6 @@
29#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 29#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
30#include <net/netfilter/nf_log.h> 30#include <net/netfilter/nf_log.h>
31 31
32int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
33 struct nf_conn *ct,
34 enum ip_conntrack_info ctinfo);
35EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);
36
37static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, 32static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
38 struct nf_conntrack_tuple *tuple) 33 struct nf_conntrack_tuple *tuple)
39{ 34{
@@ -149,7 +144,8 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
149 typeof(nf_nat_seq_adjust_hook) seq_adjust; 144 typeof(nf_nat_seq_adjust_hook) seq_adjust;
150 145
151 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); 146 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
152 if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) { 147 if (!seq_adjust ||
148 !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
153 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 149 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
154 return NF_DROP; 150 return NF_DROP;
155 } 151 }
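The new fourth argument gives the sequence-adjustment callback an explicit protocol-header offset instead of letting the helper assume an IPv4 header; the IPv4 call site above passes ip_hdrlen(skb). The shape of the hook after this change, written as an illustrative typedef (the kernel declares it as a plain function pointer, not a typedef):

/* illustrative typedef; the real declaration is a function pointer */
typedef int (*seq_adjust_fn)(struct sk_buff *skb, struct nf_conn *ct,
			     enum ip_conntrack_info ctinfo,
			     unsigned int protoff);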
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
deleted file mode 100644
index 3c04d24e2976..000000000000
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ /dev/null
@@ -1,85 +0,0 @@
1/* Amanda extension for TCP NAT alteration.
2 * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
3 * based on a copy of HW's ip_nat_irc.c as well as other modules
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/udp.h>
15
16#include <net/netfilter/nf_conntrack_helper.h>
17#include <net/netfilter/nf_conntrack_expect.h>
18#include <net/netfilter/nf_nat_helper.h>
19#include <net/netfilter/nf_nat_rule.h>
20#include <linux/netfilter/nf_conntrack_amanda.h>
21
22MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
23MODULE_DESCRIPTION("Amanda NAT helper");
24MODULE_LICENSE("GPL");
25MODULE_ALIAS("ip_nat_amanda");
26
27static unsigned int help(struct sk_buff *skb,
28 enum ip_conntrack_info ctinfo,
29 unsigned int matchoff,
30 unsigned int matchlen,
31 struct nf_conntrack_expect *exp)
32{
33 char buffer[sizeof("65535")];
34 u_int16_t port;
35 unsigned int ret;
36
37 /* Connection comes from client. */
38 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
39 exp->dir = IP_CT_DIR_ORIGINAL;
40
 41 /* When you see the packet, we need to NAT it the same as
42 * this one (ie. same IP: it will be TCP and master is UDP). */
43 exp->expectfn = nf_nat_follow_master;
44
45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int res;
48
49 exp->tuple.dst.u.tcp.port = htons(port);
50 res = nf_ct_expect_related(exp);
51 if (res == 0)
52 break;
53 else if (res != -EBUSY) {
54 port = 0;
55 break;
56 }
57 }
58
59 if (port == 0)
60 return NF_DROP;
61
62 sprintf(buffer, "%u", port);
63 ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
64 matchoff, matchlen,
65 buffer, strlen(buffer));
66 if (ret != NF_ACCEPT)
67 nf_ct_unexpect_related(exp);
68 return ret;
69}
70
71static void __exit nf_nat_amanda_fini(void)
72{
73 RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
74 synchronize_rcu();
75}
76
77static int __init nf_nat_amanda_init(void)
78{
79 BUG_ON(nf_nat_amanda_hook != NULL);
80 RCU_INIT_POINTER(nf_nat_amanda_hook, help);
81 return 0;
82}
83
84module_init(nf_nat_amanda_init);
85module_exit(nf_nat_amanda_fini);
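The deleted helper shows the port-search idiom these NAT helpers share: keep the client's own port when it is free, otherwise walk upward until conntrack accepts the expectation, with the u16 wrap back to 0 signalling exhaustion. A condensed sketch of the loop from help() above (pick_expect_port is a hypothetical wrapper name):

#include <net/netfilter/nf_conntrack_expect.h>

/* hypothetical wrapper around the search loop in help() above */
static u16 pick_expect_port(struct nf_conntrack_expect *exp)
{
	u16 port;
	int res;

	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
		exp->tuple.dst.u.tcp.port = htons(port);
		res = nf_ct_expect_related(exp);
		if (res == 0)
			return port;	/* free port claimed */
		if (res != -EBUSY)	/* hard error: stop searching */
			break;
	}
	return 0;	/* wrapped past 65535 (or error): none free */
}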
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
deleted file mode 100644
index 44b082fd48ab..000000000000
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ /dev/null
@@ -1,763 +0,0 @@
1/* NAT for netfilter; shared with compatibility layer. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/timer.h>
14#include <linux/skbuff.h>
15#include <linux/gfp.h>
16#include <net/checksum.h>
17#include <net/icmp.h>
18#include <net/ip.h>
19#include <net/tcp.h> /* For tcp_prot in getorigdst */
20#include <linux/icmp.h>
21#include <linux/udp.h>
22#include <linux/jhash.h>
23
24#include <linux/netfilter_ipv4.h>
25#include <net/netfilter/nf_conntrack.h>
26#include <net/netfilter/nf_conntrack_core.h>
27#include <net/netfilter/nf_nat.h>
28#include <net/netfilter/nf_nat_protocol.h>
29#include <net/netfilter/nf_nat_core.h>
30#include <net/netfilter/nf_nat_helper.h>
31#include <net/netfilter/nf_conntrack_helper.h>
32#include <net/netfilter/nf_conntrack_l3proto.h>
33#include <net/netfilter/nf_conntrack_zones.h>
34
35static DEFINE_SPINLOCK(nf_nat_lock);
36
37static struct nf_conntrack_l3proto *l3proto __read_mostly;
38
39#define MAX_IP_NAT_PROTO 256
40static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
41 __read_mostly;
42
43static inline const struct nf_nat_protocol *
44__nf_nat_proto_find(u_int8_t protonum)
45{
46 return rcu_dereference(nf_nat_protos[protonum]);
47}
48
49/* We keep an extra hash for each conntrack, for fast searching. */
50static inline unsigned int
51hash_by_src(const struct net *net, u16 zone,
52 const struct nf_conntrack_tuple *tuple)
53{
54 unsigned int hash;
55
 56 /* Original src, to ensure we map it consistently if possible. */
57 hash = jhash_3words((__force u32)tuple->src.u3.ip,
58 (__force u32)tuple->src.u.all ^ zone,
59 tuple->dst.protonum, nf_conntrack_hash_rnd);
60 return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
61}
62
63/* Is this tuple already taken? (not by us) */
64int
65nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
66 const struct nf_conn *ignored_conntrack)
67{
 68 /* Conntrack doesn't keep track of outgoing tuples; only
69 incoming ones. NAT means they don't have a fixed mapping,
70 so we invert the tuple and look for the incoming reply.
71
72 We could keep a separate hash if this proves too slow. */
73 struct nf_conntrack_tuple reply;
74
75 nf_ct_invert_tuplepr(&reply, tuple);
76 return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
77}
78EXPORT_SYMBOL(nf_nat_used_tuple);
79
80/* If we source map this tuple so reply looks like reply_tuple, will
 81 * that meet the constraints of range? */
82static int
83in_range(const struct nf_conntrack_tuple *tuple,
84 const struct nf_nat_ipv4_range *range)
85{
86 const struct nf_nat_protocol *proto;
87 int ret = 0;
88
89 /* If we are supposed to map IPs, then we must be in the
90 range specified, otherwise let this drag us onto a new src IP. */
91 if (range->flags & NF_NAT_RANGE_MAP_IPS) {
92 if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
93 ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
94 return 0;
95 }
96
97 rcu_read_lock();
98 proto = __nf_nat_proto_find(tuple->dst.protonum);
99 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
100 proto->in_range(tuple, NF_NAT_MANIP_SRC,
101 &range->min, &range->max))
102 ret = 1;
103 rcu_read_unlock();
104
105 return ret;
106}
107
108static inline int
109same_src(const struct nf_conn *ct,
110 const struct nf_conntrack_tuple *tuple)
111{
112 const struct nf_conntrack_tuple *t;
113
114 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
115 return (t->dst.protonum == tuple->dst.protonum &&
116 t->src.u3.ip == tuple->src.u3.ip &&
117 t->src.u.all == tuple->src.u.all);
118}
119
120/* Only called for SRC manip */
121static int
122find_appropriate_src(struct net *net, u16 zone,
123 const struct nf_conntrack_tuple *tuple,
124 struct nf_conntrack_tuple *result,
125 const struct nf_nat_ipv4_range *range)
126{
127 unsigned int h = hash_by_src(net, zone, tuple);
128 const struct nf_conn_nat *nat;
129 const struct nf_conn *ct;
130 const struct hlist_node *n;
131
132 rcu_read_lock();
133 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
134 ct = nat->ct;
135 if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
136 /* Copy source part from reply tuple. */
137 nf_ct_invert_tuplepr(result,
138 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
139 result->dst = tuple->dst;
140
141 if (in_range(result, range)) {
142 rcu_read_unlock();
143 return 1;
144 }
145 }
146 }
147 rcu_read_unlock();
148 return 0;
149}
150
151/* For [FUTURE] fragmentation handling, we want the least-used
152 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
153 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
154 1-65535, we don't do pro-rata allocation based on ports; we choose
155 the ip with the lowest src-ip/dst-ip/proto usage.
156*/
157static void
158find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
159 const struct nf_nat_ipv4_range *range,
160 const struct nf_conn *ct,
161 enum nf_nat_manip_type maniptype)
162{
163 __be32 *var_ipp;
164 /* Host order */
165 u_int32_t minip, maxip, j;
166
167 /* No IP mapping? Do nothing. */
168 if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
169 return;
170
171 if (maniptype == NF_NAT_MANIP_SRC)
172 var_ipp = &tuple->src.u3.ip;
173 else
174 var_ipp = &tuple->dst.u3.ip;
175
176 /* Fast path: only one choice. */
177 if (range->min_ip == range->max_ip) {
178 *var_ipp = range->min_ip;
179 return;
180 }
181
182 /* Hashing source and destination IPs gives a fairly even
183 * spread in practice (if there are a small number of IPs
184 * involved, there usually aren't that many connections
185 * anyway). The consistency means that servers see the same
186 * client coming from the same IP (some Internet Banking sites
187 * like this), even across reboots. */
188 minip = ntohl(range->min_ip);
189 maxip = ntohl(range->max_ip);
190 j = jhash_2words((__force u32)tuple->src.u3.ip,
191 range->flags & NF_NAT_RANGE_PERSISTENT ?
192 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
193 j = ((u64)j * (maxip - minip + 1)) >> 32;
194 *var_ipp = htonl(minip + j);
195}
196
197/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
198 * we change the source to map into the range. For NF_INET_PRE_ROUTING
199 * and NF_INET_LOCAL_OUT, we change the destination to map into the
200 * range. It might not be possible to get a unique tuple, but we try.
201 * At worst (or if we race), we will end up with a final duplicate in
202 * __ip_conntrack_confirm and drop the packet. */
203static void
204get_unique_tuple(struct nf_conntrack_tuple *tuple,
205 const struct nf_conntrack_tuple *orig_tuple,
206 const struct nf_nat_ipv4_range *range,
207 struct nf_conn *ct,
208 enum nf_nat_manip_type maniptype)
209{
210 struct net *net = nf_ct_net(ct);
211 const struct nf_nat_protocol *proto;
212 u16 zone = nf_ct_zone(ct);
213
214 /* 1) If this srcip/proto/src-proto-part is currently mapped,
215 and that same mapping gives a unique tuple within the given
216 range, use that.
217
218 This is only required for source (ie. NAT/masq) mappings.
219 So far, we don't do local source mappings, so multiple
220 manips are not an issue. */
221 if (maniptype == NF_NAT_MANIP_SRC &&
222 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
223 /* try the original tuple first */
224 if (in_range(orig_tuple, range)) {
225 if (!nf_nat_used_tuple(orig_tuple, ct)) {
226 *tuple = *orig_tuple;
227 return;
228 }
229 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
230 range)) {
231 pr_debug("get_unique_tuple: Found current src map\n");
232 if (!nf_nat_used_tuple(tuple, ct))
233 return;
234 }
235 }
236
237 /* 2) Select the least-used IP/proto combination in the given
238 range. */
239 *tuple = *orig_tuple;
240 find_best_ips_proto(zone, tuple, range, ct, maniptype);
241
242 /* 3) The per-protocol part of the manip is made to map into
243 the range to make a unique tuple. */
244
245 rcu_read_lock();
246 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
247
248 /* Only bother mapping if it's not already in range and unique */
249 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM)) {
250 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
251 if (proto->in_range(tuple, maniptype, &range->min,
252 &range->max) &&
253 (range->min.all == range->max.all ||
254 !nf_nat_used_tuple(tuple, ct)))
255 goto out;
256 } else if (!nf_nat_used_tuple(tuple, ct)) {
257 goto out;
258 }
259 }
260
261 /* Last chance: get the protocol to try to obtain a unique tuple. */
262 proto->unique_tuple(tuple, range, maniptype, ct);
263out:
264 rcu_read_unlock();
265}
266
267unsigned int
268nf_nat_setup_info(struct nf_conn *ct,
269 const struct nf_nat_ipv4_range *range,
270 enum nf_nat_manip_type maniptype)
271{
272 struct net *net = nf_ct_net(ct);
273 struct nf_conntrack_tuple curr_tuple, new_tuple;
274 struct nf_conn_nat *nat;
275
276 /* nat helper or nfctnetlink also setup binding */
277 nat = nfct_nat(ct);
278 if (!nat) {
279 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
280 if (nat == NULL) {
281 pr_debug("failed to add NAT extension\n");
282 return NF_ACCEPT;
283 }
284 }
285
286 NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
287 maniptype == NF_NAT_MANIP_DST);
288 BUG_ON(nf_nat_initialized(ct, maniptype));
289
290 /* What we've got will look like inverse of reply. Normally
291 this is what is in the conntrack, except for prior
292 manipulations (future optimization: if num_manips == 0,
293 orig_tp =
294 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
295 nf_ct_invert_tuplepr(&curr_tuple,
296 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
297
298 get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
299
300 if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
301 struct nf_conntrack_tuple reply;
302
303 /* Alter conntrack table so will recognize replies. */
304 nf_ct_invert_tuplepr(&reply, &new_tuple);
305 nf_conntrack_alter_reply(ct, &reply);
306
307 /* Non-atomic: we own this at the moment. */
308 if (maniptype == NF_NAT_MANIP_SRC)
309 ct->status |= IPS_SRC_NAT;
310 else
311 ct->status |= IPS_DST_NAT;
312 }
313
314 if (maniptype == NF_NAT_MANIP_SRC) {
315 unsigned int srchash;
316
317 srchash = hash_by_src(net, nf_ct_zone(ct),
318 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
319 spin_lock_bh(&nf_nat_lock);
320 /* nf_conntrack_alter_reply might re-allocate extension area */
321 nat = nfct_nat(ct);
322 nat->ct = ct;
323 hlist_add_head_rcu(&nat->bysource,
324 &net->ipv4.nat_bysource[srchash]);
325 spin_unlock_bh(&nf_nat_lock);
326 }
327
328 /* It's done. */
329 if (maniptype == NF_NAT_MANIP_DST)
330 ct->status |= IPS_DST_NAT_DONE;
331 else
332 ct->status |= IPS_SRC_NAT_DONE;
333
334 return NF_ACCEPT;
335}
336EXPORT_SYMBOL(nf_nat_setup_info);
337
338/* Returns true if succeeded. */
339static bool
340manip_pkt(u_int16_t proto,
341 struct sk_buff *skb,
342 unsigned int iphdroff,
343 const struct nf_conntrack_tuple *target,
344 enum nf_nat_manip_type maniptype)
345{
346 struct iphdr *iph;
347 const struct nf_nat_protocol *p;
348
349 if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
350 return false;
351
352 iph = (void *)skb->data + iphdroff;
353
354 /* Manipulate the protocol part. */
355
356 /* rcu_read_lock()ed by nf_hook_slow */
357 p = __nf_nat_proto_find(proto);
358 if (!p->manip_pkt(skb, iphdroff, target, maniptype))
359 return false;
360
361 iph = (void *)skb->data + iphdroff;
362
363 if (maniptype == NF_NAT_MANIP_SRC) {
364 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
365 iph->saddr = target->src.u3.ip;
366 } else {
367 csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
368 iph->daddr = target->dst.u3.ip;
369 }
370 return true;
371}
372
373/* Do packet manipulations according to nf_nat_setup_info. */
374unsigned int nf_nat_packet(struct nf_conn *ct,
375 enum ip_conntrack_info ctinfo,
376 unsigned int hooknum,
377 struct sk_buff *skb)
378{
379 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
380 unsigned long statusbit;
381 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
382
383 if (mtype == NF_NAT_MANIP_SRC)
384 statusbit = IPS_SRC_NAT;
385 else
386 statusbit = IPS_DST_NAT;
387
388 /* Invert if this is reply dir. */
389 if (dir == IP_CT_DIR_REPLY)
390 statusbit ^= IPS_NAT_MASK;
391
392 /* Non-atomic: these bits don't change. */
393 if (ct->status & statusbit) {
394 struct nf_conntrack_tuple target;
395
396 /* We are aiming to look like inverse of other direction. */
397 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
398
399 if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
400 return NF_DROP;
401 }
402 return NF_ACCEPT;
403}
404EXPORT_SYMBOL_GPL(nf_nat_packet);
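/* Annotation, not part of the original file: the statusbit logic in
 * nf_nat_packet() above reflects that one NAT binding manifests in both
 * directions. An SNAT mapping rewrites the source of original-direction
 * packets and the destination of replies, hence the XOR with
 * IPS_NAT_MASK (IPS_SRC_NAT | IPS_DST_NAT) for the reply direction:
 *
 *	manip	direction	bit tested
 *	SRC	ORIGINAL	IPS_SRC_NAT
 *	SRC	REPLY		IPS_DST_NAT
 *	DST	ORIGINAL	IPS_DST_NAT
 *	DST	REPLY		IPS_SRC_NAT
 */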
405
406/* Dir is direction ICMP is coming from (opposite to packet it contains) */
407int nf_nat_icmp_reply_translation(struct nf_conn *ct,
408 enum ip_conntrack_info ctinfo,
409 unsigned int hooknum,
410 struct sk_buff *skb)
411{
412 struct {
413 struct icmphdr icmp;
414 struct iphdr ip;
415 } *inside;
416 struct nf_conntrack_tuple target;
417 int hdrlen = ip_hdrlen(skb);
418 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
419 unsigned long statusbit;
420 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
421
422 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
423 return 0;
424
425 inside = (void *)skb->data + hdrlen;
426
427 /* We're actually going to mangle it beyond trivial checksum
428 adjustment, so make sure the current checksum is correct. */
429 if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
430 return 0;
431
432 /* Must be RELATED */
433 NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
434 skb->nfctinfo == IP_CT_RELATED_REPLY);
435
436 /* Redirects on non-null nats must be dropped, else they'll
437 start talking to each other without our translation, and be
438 confused... --RR */
439 if (inside->icmp.type == ICMP_REDIRECT) {
440 /* If NAT isn't finished, assume it and drop. */
441 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
442 return 0;
443
444 if (ct->status & IPS_NAT_MASK)
445 return 0;
446 }
447
448 if (manip == NF_NAT_MANIP_SRC)
449 statusbit = IPS_SRC_NAT;
450 else
451 statusbit = IPS_DST_NAT;
452
453 /* Invert if this is reply dir. */
454 if (dir == IP_CT_DIR_REPLY)
455 statusbit ^= IPS_NAT_MASK;
456
457 if (!(ct->status & statusbit))
458 return 1;
459
460 pr_debug("icmp_reply_translation: translating error %p manip %u "
461 "dir %s\n", skb, manip,
462 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
463
464 /* Change inner back to look like incoming packet. We do the
465 opposite manip on this hook to normal, because it might not
466 pass all hooks (locally-generated ICMP). Consider incoming
467 packet: PREROUTING (DST manip), routing produces ICMP, goes
468 through POSTROUTING (which must correct the DST manip). */
469 if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
470 &ct->tuplehash[!dir].tuple, !manip))
471 return 0;
472
473 if (skb->ip_summed != CHECKSUM_PARTIAL) {
474 /* Reload "inside": manip_pkt on the inner packet may have moved skb data. */
475 inside = (void *)skb->data + hdrlen;
476 inside->icmp.checksum = 0;
477 inside->icmp.checksum =
478 csum_fold(skb_checksum(skb, hdrlen,
479 skb->len - hdrlen, 0));
480 }
481
482 /* Change outer to look like the reply to an incoming packet
483 * (proto 0 means don't invert per-proto part). */
484 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
485 if (!manip_pkt(0, skb, 0, &target, manip))
486 return 0;
487
488 return 1;
489}
490EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
491
492/* Protocol registration. */
493int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
494{
495 int ret = 0;
496
497 spin_lock_bh(&nf_nat_lock);
498 if (rcu_dereference_protected(
499 nf_nat_protos[proto->protonum],
500 lockdep_is_held(&nf_nat_lock)
501 ) != &nf_nat_unknown_protocol) {
502 ret = -EBUSY;
503 goto out;
504 }
505 RCU_INIT_POINTER(nf_nat_protos[proto->protonum], proto);
506 out:
507 spin_unlock_bh(&nf_nat_lock);
508 return ret;
509}
510EXPORT_SYMBOL(nf_nat_protocol_register);
511
512/* No one stores the protocol anywhere; simply delete it. */
513void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
514{
515 spin_lock_bh(&nf_nat_lock);
516 RCU_INIT_POINTER(nf_nat_protos[proto->protonum],
517 &nf_nat_unknown_protocol);
518 spin_unlock_bh(&nf_nat_lock);
519 synchronize_rcu();
520}
521EXPORT_SYMBOL(nf_nat_protocol_unregister);
522
523/* No one is using conntrack by the time this is called. */
524static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
525{
526 struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
527
528 if (nat == NULL || nat->ct == NULL)
529 return;
530
531 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
532
533 spin_lock_bh(&nf_nat_lock);
534 hlist_del_rcu(&nat->bysource);
535 spin_unlock_bh(&nf_nat_lock);
536}
537
538static void nf_nat_move_storage(void *new, void *old)
539{
540 struct nf_conn_nat *new_nat = new;
541 struct nf_conn_nat *old_nat = old;
542 struct nf_conn *ct = old_nat->ct;
543
544 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
545 return;
546
547 spin_lock_bh(&nf_nat_lock);
548 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
549 spin_unlock_bh(&nf_nat_lock);
550}
551
552static struct nf_ct_ext_type nat_extend __read_mostly = {
553 .len = sizeof(struct nf_conn_nat),
554 .align = __alignof__(struct nf_conn_nat),
555 .destroy = nf_nat_cleanup_conntrack,
556 .move = nf_nat_move_storage,
557 .id = NF_CT_EXT_NAT,
558 .flags = NF_CT_EXT_F_PREALLOC,
559};
560
561#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
562
563#include <linux/netfilter/nfnetlink.h>
564#include <linux/netfilter/nfnetlink_conntrack.h>
565
566static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
567 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
568 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
569};
570
571static int nfnetlink_parse_nat_proto(struct nlattr *attr,
572 const struct nf_conn *ct,
573 struct nf_nat_ipv4_range *range)
574{
575 struct nlattr *tb[CTA_PROTONAT_MAX+1];
576 const struct nf_nat_protocol *npt;
577 int err;
578
579 err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
580 if (err < 0)
581 return err;
582
583 rcu_read_lock();
584 npt = __nf_nat_proto_find(nf_ct_protonum(ct));
585 if (npt->nlattr_to_range)
586 err = npt->nlattr_to_range(tb, range);
587 rcu_read_unlock();
588 return err;
589}
590
591static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
592 [CTA_NAT_MINIP] = { .type = NLA_U32 },
593 [CTA_NAT_MAXIP] = { .type = NLA_U32 },
594 [CTA_NAT_PROTO] = { .type = NLA_NESTED },
595};
596
597static int
598nfnetlink_parse_nat(const struct nlattr *nat,
599 const struct nf_conn *ct, struct nf_nat_ipv4_range *range)
600{
601 struct nlattr *tb[CTA_NAT_MAX+1];
602 int err;
603
604 memset(range, 0, sizeof(*range));
605
606 err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
607 if (err < 0)
608 return err;
609
610 if (tb[CTA_NAT_MINIP])
611 range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
612
613 if (!tb[CTA_NAT_MAXIP])
614 range->max_ip = range->min_ip;
615 else
616 range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
617
618 if (range->min_ip)
619 range->flags |= NF_NAT_RANGE_MAP_IPS;
620
621 if (!tb[CTA_NAT_PROTO])
622 return 0;
623
624 err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
625 if (err < 0)
626 return err;
627
628 return 0;
629}
630
631static int
632nfnetlink_parse_nat_setup(struct nf_conn *ct,
633 enum nf_nat_manip_type manip,
634 const struct nlattr *attr)
635{
636 struct nf_nat_ipv4_range range;
637
638 if (nfnetlink_parse_nat(attr, ct, &range) < 0)
639 return -EINVAL;
640 if (nf_nat_initialized(ct, manip))
641 return -EEXIST;
642
643 return nf_nat_setup_info(ct, &range, manip);
644}
645#else
646static int
647nfnetlink_parse_nat_setup(struct nf_conn *ct,
648 enum nf_nat_manip_type manip,
649 const struct nlattr *attr)
650{
651 return -EOPNOTSUPP;
652}
653#endif
654
655static int __net_init nf_nat_net_init(struct net *net)
656{
657 /* Leave them the same for the moment. */
658 net->ipv4.nat_htable_size = net->ct.htable_size;
659 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
660 if (!net->ipv4.nat_bysource)
661 return -ENOMEM;
662 return 0;
663}
664
665/* Clear NAT section of all conntracks, in case we're loaded again. */
666static int clean_nat(struct nf_conn *i, void *data)
667{
668 struct nf_conn_nat *nat = nfct_nat(i);
669
670 if (!nat)
671 return 0;
672 memset(nat, 0, sizeof(*nat));
673 i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
674 return 0;
675}
676
677static void __net_exit nf_nat_net_exit(struct net *net)
678{
679 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
680 synchronize_rcu();
681 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
682}
683
684static struct pernet_operations nf_nat_net_ops = {
685 .init = nf_nat_net_init,
686 .exit = nf_nat_net_exit,
687};
688
689static struct nf_ct_helper_expectfn follow_master_nat = {
690 .name = "nat-follow-master",
691 .expectfn = nf_nat_follow_master,
692};
693
694static struct nfq_ct_nat_hook nfq_ct_nat = {
695 .seq_adjust = nf_nat_tcp_seq_adjust,
696};
697
698static int __init nf_nat_init(void)
699{
700 size_t i;
701 int ret;
702
703 need_ipv4_conntrack();
704
705 ret = nf_ct_extend_register(&nat_extend);
706 if (ret < 0) {
707 printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
708 return ret;
709 }
710
711 ret = register_pernet_subsys(&nf_nat_net_ops);
712 if (ret < 0)
713 goto cleanup_extend;
714
715 /* Sew in builtin protocols. */
716 spin_lock_bh(&nf_nat_lock);
717 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
718 RCU_INIT_POINTER(nf_nat_protos[i], &nf_nat_unknown_protocol);
719 RCU_INIT_POINTER(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
720 RCU_INIT_POINTER(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
721 RCU_INIT_POINTER(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
722 spin_unlock_bh(&nf_nat_lock);
723
724 /* Initialize fake conntrack so that NAT will skip it */
725 nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
726
727 l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
728
729 nf_ct_helper_expectfn_register(&follow_master_nat);
730
731 BUG_ON(nf_nat_seq_adjust_hook != NULL);
732 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
733 BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
734 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
735 nfnetlink_parse_nat_setup);
736 BUG_ON(nf_ct_nat_offset != NULL);
737 RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
738 RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
739 return 0;
740
741 cleanup_extend:
742 nf_ct_extend_unregister(&nat_extend);
743 return ret;
744}
745
746static void __exit nf_nat_cleanup(void)
747{
748 unregister_pernet_subsys(&nf_nat_net_ops);
749 nf_ct_l3proto_put(l3proto);
750 nf_ct_extend_unregister(&nat_extend);
751 nf_ct_helper_expectfn_unregister(&follow_master_nat);
752 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
753 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
754 RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
755 RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
756 synchronize_net();
757}
758
759MODULE_LICENSE("GPL");
760MODULE_ALIAS("nf-nat-ipv4");
761
762module_init(nf_nat_init);
763module_exit(nf_nat_cleanup);
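This IPv4-only NAT core is removed as the series makes the NAT core family-independent; one property worth noting carries over: the bysource hash keys only on the original source, zone and protocol, so all connections from one client land in the same bucket and an existing SNAT mapping can be reused consistently. Simplified sketch of that hash (hash_by_src_sketch is a stand-in; the real code scales by net->ipv4.nat_htable_size):

#include <linux/jhash.h>

/* sketch of hash_by_src() above; parameters flattened for clarity */
static unsigned int hash_by_src_sketch(u32 src_ip, u32 src_port_xor_zone,
				       u8 protonum, u32 rnd, u32 htable_size)
{
	u32 hash = jhash_3words(src_ip, src_port_xor_zone, protonum, rnd);

	/* multiply-shift maps the 32-bit hash into [0, htable_size) */
	return ((u64)hash * htable_size) >> 32;
}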
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
deleted file mode 100644
index e462a957d080..000000000000
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/* FTP extension for TCP NAT alteration. */
2
3/* (C) 1999-2001 Paul `Rusty' Russell
4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/ip.h>
14#include <linux/tcp.h>
15#include <linux/netfilter_ipv4.h>
16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_helper.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_conntrack_helper.h>
20#include <net/netfilter/nf_conntrack_expect.h>
21#include <linux/netfilter/nf_conntrack_ftp.h>
22
23MODULE_LICENSE("GPL");
24MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
25MODULE_DESCRIPTION("ftp NAT helper");
26MODULE_ALIAS("ip_nat_ftp");
27
28/* FIXME: Time out? --RR */
29
30static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
31 char *buffer, size_t buflen,
32 __be32 addr, u16 port)
33{
34 switch (type) {
35 case NF_CT_FTP_PORT:
36 case NF_CT_FTP_PASV:
37 return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
38 ((unsigned char *)&addr)[0],
39 ((unsigned char *)&addr)[1],
40 ((unsigned char *)&addr)[2],
41 ((unsigned char *)&addr)[3],
42 port >> 8,
43 port & 0xFF);
44 case NF_CT_FTP_EPRT:
45 return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
46 case NF_CT_FTP_EPSV:
47 return snprintf(buffer, buflen, "|||%u|", port);
48 }
49
50 return 0;
51}
52
53/* So, this packet has hit the connection tracking matching code.
54 Mangle it, and change the expectation to match the new version. */
55static unsigned int nf_nat_ftp(struct sk_buff *skb,
56 enum ip_conntrack_info ctinfo,
57 enum nf_ct_ftp_type type,
58 unsigned int matchoff,
59 unsigned int matchlen,
60 struct nf_conntrack_expect *exp)
61{
62 __be32 newip;
63 u_int16_t port;
64 int dir = CTINFO2DIR(ctinfo);
65 struct nf_conn *ct = exp->master;
66 char buffer[sizeof("|1|255.255.255.255|65535|")];
67 unsigned int buflen;
68
69 pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
70
71 /* Connection will come from wherever this packet goes, hence !dir */
72 newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
73 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
74 exp->dir = !dir;
75
 76 /* When you see the packet, we need to NAT it the same as
77 * this one. */
78 exp->expectfn = nf_nat_follow_master;
79
80 /* Try to get same port: if not, try to change it. */
81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
82 int ret;
83
84 exp->tuple.dst.u.tcp.port = htons(port);
85 ret = nf_ct_expect_related(exp);
86 if (ret == 0)
87 break;
88 else if (ret != -EBUSY) {
89 port = 0;
90 break;
91 }
92 }
93
94 if (port == 0)
95 return NF_DROP;
96
97 buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
98 if (!buflen)
99 goto out;
100
101 pr_debug("calling nf_nat_mangle_tcp_packet\n");
102
103 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
104 matchlen, buffer, buflen))
105 goto out;
106
107 return NF_ACCEPT;
108
109out:
110 nf_ct_unexpect_related(exp);
111 return NF_DROP;
112}
113
114static void __exit nf_nat_ftp_fini(void)
115{
116 RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
117 synchronize_rcu();
118}
119
120static int __init nf_nat_ftp_init(void)
121{
122 BUG_ON(nf_nat_ftp_hook != NULL);
123 RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
124 return 0;
125}
126
127/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
128static int warn_set(const char *val, struct kernel_param *kp)
129{
130 printk(KERN_INFO KBUILD_MODNAME
131 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
132 return 0;
133}
134module_param_call(ports, warn_set, NULL, NULL, 0);
135
136module_init(nf_nat_ftp_init);
137module_exit(nf_nat_ftp_fini);
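For reference, the string rewriting the deleted nf_nat_ftp_fmt_cmd() performs for PORT/PASV is plain decimal formatting: the four octets of the IPv4 address followed by the port split into high and low bytes. Standalone userspace sketch (fmt_port_cmd is a hypothetical name):

#include <stdio.h>
#include <stdint.h>

/* hypothetical userspace equivalent of the PORT/PASV branch above */
static int fmt_port_cmd(char *buf, size_t len,
			const uint8_t ip[4], uint16_t port)
{
	/* e.g. 10.0.0.1, port 51000 -> "10,0,0,1,199,56" */
	return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
			ip[0], ip[1], ip[2], ip[3],
			port >> 8, port & 0xFF);
}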
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index c6784a18c1c4..9c3db10b22d3 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -15,13 +15,12 @@
15 15
16#include <net/netfilter/nf_nat.h> 16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_helper.h> 17#include <net/netfilter/nf_nat_helper.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_conntrack_helper.h> 18#include <net/netfilter/nf_conntrack_helper.h>
20#include <net/netfilter/nf_conntrack_expect.h> 19#include <net/netfilter/nf_conntrack_expect.h>
21#include <linux/netfilter/nf_conntrack_h323.h> 20#include <linux/netfilter/nf_conntrack_h323.h>
22 21
23/****************************************************************************/ 22/****************************************************************************/
24static int set_addr(struct sk_buff *skb, 23static int set_addr(struct sk_buff *skb, unsigned int protoff,
25 unsigned char **data, int dataoff, 24 unsigned char **data, int dataoff,
26 unsigned int addroff, __be32 ip, __be16 port) 25 unsigned int addroff, __be32 ip, __be16 port)
27{ 26{
@@ -40,7 +39,7 @@ static int set_addr(struct sk_buff *skb,
40 39
41 if (ip_hdr(skb)->protocol == IPPROTO_TCP) { 40 if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
42 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 41 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
43 addroff, sizeof(buf), 42 protoff, addroff, sizeof(buf),
44 (char *) &buf, sizeof(buf))) { 43 (char *) &buf, sizeof(buf))) {
45 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n"); 44 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
46 return -1; 45 return -1;
@@ -54,7 +53,7 @@ static int set_addr(struct sk_buff *skb,
54 *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff; 53 *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
55 } else { 54 } else {
56 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, 55 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
57 addroff, sizeof(buf), 56 protoff, addroff, sizeof(buf),
58 (char *) &buf, sizeof(buf))) { 57 (char *) &buf, sizeof(buf))) {
59 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); 58 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
60 return -1; 59 return -1;
@@ -69,22 +68,22 @@ static int set_addr(struct sk_buff *skb,
69} 68}
70 69
71/****************************************************************************/ 70/****************************************************************************/
72static int set_h225_addr(struct sk_buff *skb, 71static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
73 unsigned char **data, int dataoff, 72 unsigned char **data, int dataoff,
74 TransportAddress *taddr, 73 TransportAddress *taddr,
75 union nf_inet_addr *addr, __be16 port) 74 union nf_inet_addr *addr, __be16 port)
76{ 75{
77 return set_addr(skb, data, dataoff, taddr->ipAddress.ip, 76 return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
78 addr->ip, port); 77 addr->ip, port);
79} 78}
80 79
81/****************************************************************************/ 80/****************************************************************************/
 82static int set_h245_addr(struct sk_buff *skb, 81static int set_h245_addr(struct sk_buff *skb, unsigned int protoff,
83 unsigned char **data, int dataoff, 82 unsigned char **data, int dataoff,
84 H245_TransportAddress *taddr, 83 H245_TransportAddress *taddr,
85 union nf_inet_addr *addr, __be16 port) 84 union nf_inet_addr *addr, __be16 port)
86{ 85{
87 return set_addr(skb, data, dataoff, 86 return set_addr(skb, protoff, data, dataoff,
88 taddr->unicastAddress.iPAddress.network, 87 taddr->unicastAddress.iPAddress.network,
89 addr->ip, port); 88 addr->ip, port);
90} 89}
@@ -92,7 +91,7 @@ static int set_h245_addr(struct sk_buff *skb,
92/****************************************************************************/ 91/****************************************************************************/
93static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, 92static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
94 enum ip_conntrack_info ctinfo, 93 enum ip_conntrack_info ctinfo,
95 unsigned char **data, 94 unsigned int protoff, unsigned char **data,
96 TransportAddress *taddr, int count) 95 TransportAddress *taddr, int count)
97{ 96{
98 const struct nf_ct_h323_master *info = nfct_help_data(ct); 97 const struct nf_ct_h323_master *info = nfct_help_data(ct);
@@ -118,7 +117,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
118 &addr.ip, port, 117 &addr.ip, port,
119 &ct->tuplehash[!dir].tuple.dst.u3.ip, 118 &ct->tuplehash[!dir].tuple.dst.u3.ip,
120 info->sig_port[!dir]); 119 info->sig_port[!dir]);
121 return set_h225_addr(skb, data, 0, &taddr[i], 120 return set_h225_addr(skb, protoff, data, 0,
121 &taddr[i],
122 &ct->tuplehash[!dir]. 122 &ct->tuplehash[!dir].
123 tuple.dst.u3, 123 tuple.dst.u3,
124 info->sig_port[!dir]); 124 info->sig_port[!dir]);
@@ -129,7 +129,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
129 &addr.ip, port, 129 &addr.ip, port,
130 &ct->tuplehash[!dir].tuple.src.u3.ip, 130 &ct->tuplehash[!dir].tuple.src.u3.ip,
131 info->sig_port[!dir]); 131 info->sig_port[!dir]);
132 return set_h225_addr(skb, data, 0, &taddr[i], 132 return set_h225_addr(skb, protoff, data, 0,
133 &taddr[i],
133 &ct->tuplehash[!dir]. 134 &ct->tuplehash[!dir].
134 tuple.src.u3, 135 tuple.src.u3,
135 info->sig_port[!dir]); 136 info->sig_port[!dir]);
@@ -143,7 +144,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
143/****************************************************************************/ 144/****************************************************************************/
144static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, 145static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
145 enum ip_conntrack_info ctinfo, 146 enum ip_conntrack_info ctinfo,
146 unsigned char **data, 147 unsigned int protoff, unsigned char **data,
147 TransportAddress *taddr, int count) 148 TransportAddress *taddr, int count)
148{ 149{
149 int dir = CTINFO2DIR(ctinfo); 150 int dir = CTINFO2DIR(ctinfo);
@@ -159,7 +160,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
159 &addr.ip, ntohs(port), 160 &addr.ip, ntohs(port),
160 &ct->tuplehash[!dir].tuple.dst.u3.ip, 161 &ct->tuplehash[!dir].tuple.dst.u3.ip,
161 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); 162 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
162 return set_h225_addr(skb, data, 0, &taddr[i], 163 return set_h225_addr(skb, protoff, data, 0, &taddr[i],
163 &ct->tuplehash[!dir].tuple.dst.u3, 164 &ct->tuplehash[!dir].tuple.dst.u3,
164 ct->tuplehash[!dir].tuple. 165 ct->tuplehash[!dir].tuple.
165 dst.u.udp.port); 166 dst.u.udp.port);
@@ -172,7 +173,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
172/****************************************************************************/ 173/****************************************************************************/
173static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, 174static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
174 enum ip_conntrack_info ctinfo, 175 enum ip_conntrack_info ctinfo,
175 unsigned char **data, int dataoff, 176 unsigned int protoff, unsigned char **data, int dataoff,
176 H245_TransportAddress *taddr, 177 H245_TransportAddress *taddr,
177 __be16 port, __be16 rtp_port, 178 __be16 port, __be16 rtp_port,
178 struct nf_conntrack_expect *rtp_exp, 179 struct nf_conntrack_expect *rtp_exp,
@@ -244,7 +245,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
244 } 245 }
245 246
246 /* Modify signal */ 247 /* Modify signal */
247 if (set_h245_addr(skb, data, dataoff, taddr, 248 if (set_h245_addr(skb, protoff, data, dataoff, taddr,
248 &ct->tuplehash[!dir].tuple.dst.u3, 249 &ct->tuplehash[!dir].tuple.dst.u3,
249 htons((port & htons(1)) ? nated_port + 1 : 250 htons((port & htons(1)) ? nated_port + 1 :
250 nated_port)) == 0) { 251 nated_port)) == 0) {
@@ -275,7 +276,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
275/****************************************************************************/ 276/****************************************************************************/
276static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, 277static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
277 enum ip_conntrack_info ctinfo, 278 enum ip_conntrack_info ctinfo,
278 unsigned char **data, int dataoff, 279 unsigned int protoff, unsigned char **data, int dataoff,
279 H245_TransportAddress *taddr, __be16 port, 280 H245_TransportAddress *taddr, __be16 port,
280 struct nf_conntrack_expect *exp) 281 struct nf_conntrack_expect *exp)
281{ 282{
@@ -307,7 +308,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
307 } 308 }
308 309
309 /* Modify signal */ 310 /* Modify signal */
310 if (set_h245_addr(skb, data, dataoff, taddr, 311 if (set_h245_addr(skb, protoff, data, dataoff, taddr,
311 &ct->tuplehash[!dir].tuple.dst.u3, 312 &ct->tuplehash[!dir].tuple.dst.u3,
312 htons(nated_port)) < 0) { 313 htons(nated_port)) < 0) {
313 nf_ct_unexpect_related(exp); 314 nf_ct_unexpect_related(exp);
@@ -326,7 +327,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
326/****************************************************************************/ 327/****************************************************************************/
327static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, 328static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
328 enum ip_conntrack_info ctinfo, 329 enum ip_conntrack_info ctinfo,
329 unsigned char **data, int dataoff, 330 unsigned int protoff, unsigned char **data, int dataoff,
330 TransportAddress *taddr, __be16 port, 331 TransportAddress *taddr, __be16 port,
331 struct nf_conntrack_expect *exp) 332 struct nf_conntrack_expect *exp)
332{ 333{
@@ -363,7 +364,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
363 } 364 }
364 365
365 /* Modify signal */ 366 /* Modify signal */
366 if (set_h225_addr(skb, data, dataoff, taddr, 367 if (set_h225_addr(skb, protoff, data, dataoff, taddr,
367 &ct->tuplehash[!dir].tuple.dst.u3, 368 &ct->tuplehash[!dir].tuple.dst.u3,
368 htons(nated_port)) == 0) { 369 htons(nated_port)) == 0) {
369 /* Save ports */ 370 /* Save ports */
@@ -390,7 +391,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
390static void ip_nat_q931_expect(struct nf_conn *new, 391static void ip_nat_q931_expect(struct nf_conn *new,
391 struct nf_conntrack_expect *this) 392 struct nf_conntrack_expect *this)
392{ 393{
393 struct nf_nat_ipv4_range range; 394 struct nf_nat_range range;
394 395
395 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ 396 if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
396 nf_nat_follow_master(new, this); 397 nf_nat_follow_master(new, this);
@@ -402,21 +403,23 @@ static void ip_nat_q931_expect(struct nf_conn *new,
402 403
403 /* Change src to where master sends to */ 404 /* Change src to where master sends to */
404 range.flags = NF_NAT_RANGE_MAP_IPS; 405 range.flags = NF_NAT_RANGE_MAP_IPS;
405 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; 406 range.min_addr = range.max_addr =
407 new->tuplehash[!this->dir].tuple.src.u3;
406 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); 408 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
407 409
408 /* For DST manip, map port here to where it's expected. */ 410 /* For DST manip, map port here to where it's expected. */
409 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 411 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
410 range.min = range.max = this->saved_proto; 412 range.min_proto = range.max_proto = this->saved_proto;
411 range.min_ip = range.max_ip = 413 range.min_addr = range.max_addr =
412 new->master->tuplehash[!this->dir].tuple.src.u3.ip; 414 new->master->tuplehash[!this->dir].tuple.src.u3;
413 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 415 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
414} 416}
415 417
416/****************************************************************************/ 418/****************************************************************************/
417static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, 419static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
418 enum ip_conntrack_info ctinfo, 420 enum ip_conntrack_info ctinfo,
419 unsigned char **data, TransportAddress *taddr, int idx, 421 unsigned int protoff, unsigned char **data,
422 TransportAddress *taddr, int idx,
420 __be16 port, struct nf_conntrack_expect *exp) 423 __be16 port, struct nf_conntrack_expect *exp)
421{ 424{
422 struct nf_ct_h323_master *info = nfct_help_data(ct); 425 struct nf_ct_h323_master *info = nfct_help_data(ct);
@@ -453,7 +456,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
453 } 456 }
454 457
455 /* Modify signal */ 458 /* Modify signal */
456 if (set_h225_addr(skb, data, 0, &taddr[idx], 459 if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
457 &ct->tuplehash[!dir].tuple.dst.u3, 460 &ct->tuplehash[!dir].tuple.dst.u3,
458 htons(nated_port)) == 0) { 461 htons(nated_port)) == 0) {
459 /* Save ports */ 462 /* Save ports */
@@ -464,7 +467,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
464 if (idx > 0 && 467 if (idx > 0 &&
465 get_h225_addr(ct, *data, &taddr[0], &addr, &port) && 468 get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
466 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { 469 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
467 set_h225_addr(skb, data, 0, &taddr[0], 470 set_h225_addr(skb, protoff, data, 0, &taddr[0],
468 &ct->tuplehash[!dir].tuple.dst.u3, 471 &ct->tuplehash[!dir].tuple.dst.u3,
469 info->sig_port[!dir]); 472 info->sig_port[!dir]);
470 } 473 }
@@ -487,26 +490,28 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
487static void ip_nat_callforwarding_expect(struct nf_conn *new, 490static void ip_nat_callforwarding_expect(struct nf_conn *new,
488 struct nf_conntrack_expect *this) 491 struct nf_conntrack_expect *this)
489{ 492{
490 struct nf_nat_ipv4_range range; 493 struct nf_nat_range range;
491 494
492 /* This must be a fresh one. */ 495 /* This must be a fresh one. */
493 BUG_ON(new->status & IPS_NAT_DONE_MASK); 496 BUG_ON(new->status & IPS_NAT_DONE_MASK);
494 497
495 /* Change src to where master sends to */ 498 /* Change src to where master sends to */
496 range.flags = NF_NAT_RANGE_MAP_IPS; 499 range.flags = NF_NAT_RANGE_MAP_IPS;
497 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; 500 range.min_addr = range.max_addr =
501 new->tuplehash[!this->dir].tuple.src.u3;
498 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC); 502 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
499 503
500 /* For DST manip, map port here to where it's expected. */ 504 /* For DST manip, map port here to where it's expected. */
501 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 505 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
502 range.min = range.max = this->saved_proto; 506 range.min_proto = range.max_proto = this->saved_proto;
503 range.min_ip = range.max_ip = this->saved_ip; 507 range.min_addr = range.max_addr = this->saved_addr;
504 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 508 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
505} 509}
506 510
507/****************************************************************************/ 511/****************************************************************************/
508static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, 512static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
509 enum ip_conntrack_info ctinfo, 513 enum ip_conntrack_info ctinfo,
514 unsigned int protoff,
510 unsigned char **data, int dataoff, 515 unsigned char **data, int dataoff,
511 TransportAddress *taddr, __be16 port, 516 TransportAddress *taddr, __be16 port,
512 struct nf_conntrack_expect *exp) 517 struct nf_conntrack_expect *exp)
@@ -515,7 +520,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
515 u_int16_t nated_port; 520 u_int16_t nated_port;
516 521
517 /* Set expectations for NAT */ 522 /* Set expectations for NAT */
518 exp->saved_ip = exp->tuple.dst.u3.ip; 523 exp->saved_addr = exp->tuple.dst.u3;
519 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; 524 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
520 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; 525 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
521 exp->expectfn = ip_nat_callforwarding_expect; 526 exp->expectfn = ip_nat_callforwarding_expect;
@@ -541,7 +546,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
541 } 546 }
542 547
543 /* Modify signal */ 548 /* Modify signal */
544 if (!set_h225_addr(skb, data, dataoff, taddr, 549 if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
545 &ct->tuplehash[!dir].tuple.dst.u3, 550 &ct->tuplehash[!dir].tuple.dst.u3,
546 htons(nated_port)) == 0) { 551 htons(nated_port)) == 0) {
547 nf_ct_unexpect_related(exp); 552 nf_ct_unexpect_related(exp);
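The recurring change in this file is mechanical: the IPv4-only struct nf_nat_ipv4_range gives way to the family-agnostic struct nf_nat_range, so min_ip/max_ip become min_addr/max_addr and min/max become min_proto/max_proto. Abridged shapes of the two structs as used in the hunks above (kernel types assumed, other members elided):

struct nf_nat_ipv4_range {			/* old, IPv4-only */
	unsigned int flags;
	__be32 min_ip, max_ip;
	union nf_conntrack_man_proto min, max;
};

struct nf_nat_range {				/* new, v4/v6 capable */
	unsigned int flags;
	union nf_inet_addr min_addr, max_addr;
	union nf_conntrack_man_proto min_proto, max_proto;
};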
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
deleted file mode 100644
index 2e59ad0b90ca..000000000000
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ /dev/null
@@ -1,458 +0,0 @@
 1/* nf_nat_helper.c - generic support functions for NAT helpers
2 *
3 * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
4 * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10#include <linux/module.h>
11#include <linux/gfp.h>
12#include <linux/kmod.h>
13#include <linux/types.h>
14#include <linux/timer.h>
15#include <linux/skbuff.h>
16#include <linux/tcp.h>
17#include <linux/udp.h>
18#include <net/checksum.h>
19#include <net/tcp.h>
20#include <net/route.h>
21
22#include <linux/netfilter_ipv4.h>
23#include <net/netfilter/nf_conntrack.h>
24#include <net/netfilter/nf_conntrack_helper.h>
25#include <net/netfilter/nf_conntrack_ecache.h>
26#include <net/netfilter/nf_conntrack_expect.h>
27#include <net/netfilter/nf_nat.h>
28#include <net/netfilter/nf_nat_protocol.h>
29#include <net/netfilter/nf_nat_core.h>
30#include <net/netfilter/nf_nat_helper.h>
31
32#define DUMP_OFFSET(x) \
33 pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
34 x->offset_before, x->offset_after, x->correction_pos);
35
36static DEFINE_SPINLOCK(nf_nat_seqofs_lock);
37
38/* Setup TCP sequence correction given this change at this sequence */
39static inline void
40adjust_tcp_sequence(u32 seq,
41 int sizediff,
42 struct nf_conn *ct,
43 enum ip_conntrack_info ctinfo)
44{
45 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
46 struct nf_conn_nat *nat = nfct_nat(ct);
47 struct nf_nat_seq *this_way = &nat->seq[dir];
48
49 pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
50 seq, sizediff);
51
52 pr_debug("adjust_tcp_sequence: Seq_offset before: ");
53 DUMP_OFFSET(this_way);
54
55 spin_lock_bh(&nf_nat_seqofs_lock);
56
57	 /* SYN adjust. If it's uninitialized, or this is after the last
58	  * correction, record it: we don't handle more than one
59	  * adjustment in the window, but do deal with the common case of a
60	  * retransmit */
61 if (this_way->offset_before == this_way->offset_after ||
62 before(this_way->correction_pos, seq)) {
63 this_way->correction_pos = seq;
64 this_way->offset_before = this_way->offset_after;
65 this_way->offset_after += sizediff;
66 }
67 spin_unlock_bh(&nf_nat_seqofs_lock);
68
69 pr_debug("adjust_tcp_sequence: Seq_offset after: ");
70 DUMP_OFFSET(this_way);
71}
72
73/* Get the offset value, for conntrack */
74s16 nf_nat_get_offset(const struct nf_conn *ct,
75 enum ip_conntrack_dir dir,
76 u32 seq)
77{
78 struct nf_conn_nat *nat = nfct_nat(ct);
79 struct nf_nat_seq *this_way;
80 s16 offset;
81
82 if (!nat)
83 return 0;
84
85 this_way = &nat->seq[dir];
86 spin_lock_bh(&nf_nat_seqofs_lock);
87 offset = after(seq, this_way->correction_pos)
88 ? this_way->offset_after : this_way->offset_before;
89 spin_unlock_bh(&nf_nat_seqofs_lock);
90
91 return offset;
92}
93EXPORT_SYMBOL_GPL(nf_nat_get_offset);
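The bookkeeping above keeps a single correction point per direction: sequence numbers at or before correction_pos still get offset_before, anything later gets offset_after. A minimal user-space model of that rule follows (a sketch, not part of the patch; all names are illustrative, and seq_after() is re-derived from the kernel's wrapping 32-bit comparison):

#include <stdint.h>
#include <stdio.h>

struct seq_adjust {
	uint32_t correction_pos;	/* seq where the last size change happened */
	int16_t offset_before, offset_after;
};

/* true if a comes after b in wrapping 32-bit sequence space */
static int seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

static int16_t pick_offset(const struct seq_adjust *s, uint32_t seq)
{
	return seq_after(seq, s->correction_pos) ? s->offset_after
						 : s->offset_before;
}

int main(void)
{
	/* a helper grew the payload by 4 bytes at sequence 1000 */
	struct seq_adjust s = { 1000, 0, 4 };

	printf("seq  900 -> %+d\n", pick_offset(&s, 900));	/* +0 */
	printf("seq 1500 -> %+d\n", pick_offset(&s, 1500));	/* +4 */
	return 0;
}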
94
95/* Frobs data inside this packet, which is linear. */
96static void mangle_contents(struct sk_buff *skb,
97 unsigned int dataoff,
98 unsigned int match_offset,
99 unsigned int match_len,
100 const char *rep_buffer,
101 unsigned int rep_len)
102{
103 unsigned char *data;
104
105 BUG_ON(skb_is_nonlinear(skb));
106 data = skb_network_header(skb) + dataoff;
107
108 /* move post-replacement */
109 memmove(data + match_offset + rep_len,
110 data + match_offset + match_len,
111 skb->tail - (skb->network_header + dataoff +
112 match_offset + match_len));
113
114 /* insert data from buffer */
115 memcpy(data + match_offset, rep_buffer, rep_len);
116
117 /* update skb info */
118 if (rep_len > match_len) {
119 pr_debug("nf_nat_mangle_packet: Extending packet by "
120 "%u from %u bytes\n", rep_len - match_len, skb->len);
121 skb_put(skb, rep_len - match_len);
122 } else {
123 pr_debug("nf_nat_mangle_packet: Shrinking packet from "
124 "%u from %u bytes\n", match_len - rep_len, skb->len);
125 __skb_trim(skb, skb->len + rep_len - match_len);
126 }
127
128 /* fix IP hdr checksum information */
129 ip_hdr(skb)->tot_len = htons(skb->len);
130 ip_send_check(ip_hdr(skb));
131}
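mangle_contents() is, at its core, a substring splice on a linear buffer: shift the tail to its post-replacement position, drop the replacement in, then fix the recorded lengths. A standalone sketch of just the splice (the kernel path additionally guarantees tailroom via enlarge_skb() and refreshes the IP header, as above; names here are illustrative):

#include <assert.h>
#include <stdio.h>
#include <string.h>

static size_t splice_buf(char *buf, size_t len, size_t cap,
			 size_t off, size_t match_len,
			 const char *rep, size_t rep_len)
{
	assert(len - match_len + rep_len <= cap);
	/* move post-replacement data out of (or into) the way */
	memmove(buf + off + rep_len, buf + off + match_len,
		len - off - match_len);
	/* insert data from the replacement buffer */
	memcpy(buf + off, rep, rep_len);
	return len - match_len + rep_len;
}

int main(void)
{
	char pkt[64] = "PORT 10,0,0,1,4,1";
	size_t len = strlen(pkt);

	len = splice_buf(pkt, len, sizeof(pkt), 5, len - 5,
			 "192,0,2,1,200,10", 16);
	printf("%.*s\n", (int)len, pkt);	/* PORT 192,0,2,1,200,10 */
	return 0;
}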
132
133/* Unusual, but possible case. */
134static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
135{
136 if (skb->len + extra > 65535)
137 return 0;
138
139 if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC))
140 return 0;
141
142 return 1;
143}
144
145void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
146 __be32 seq, s16 off)
147{
148 if (!off)
149 return;
150 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
151 adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
152 nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155
156void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
157 u32 ctinfo, int off)
158{
159 const struct tcphdr *th;
160
161 if (nf_ct_protonum(ct) != IPPROTO_TCP)
162 return;
163
164	th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
165 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
166}
167EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
168
169static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
170 int datalen, __sum16 *check, int oldlen)
171{
172 struct rtable *rt = skb_rtable(skb);
173
174 if (skb->ip_summed != CHECKSUM_PARTIAL) {
175 if (!(rt->rt_flags & RTCF_LOCAL) &&
176 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
177 skb->ip_summed = CHECKSUM_PARTIAL;
178 skb->csum_start = skb_headroom(skb) +
179 skb_network_offset(skb) +
180 iph->ihl * 4;
181 skb->csum_offset = (void *)check - data;
182 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
183 datalen, iph->protocol, 0);
184 } else {
185 *check = 0;
186 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
187 datalen, iph->protocol,
188 csum_partial(data, datalen,
189 0));
190 if (iph->protocol == IPPROTO_UDP && !*check)
191 *check = CSUM_MANGLED_0;
192 }
193 } else
194 inet_proto_csum_replace2(check, skb,
195 htons(oldlen), htons(datalen), 1);
196}
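When the packet is not offloaded (CHECKSUM_PARTIAL), the inet_proto_csum_replace*() calls used above apply the incremental update from RFC 1624, HC' = ~(~HC + ~m + m'), rather than summing the whole payload again. A 16-bit model of that identity (standalone sketch; function names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* one's-complement add with end-around carry */
static uint16_t csum16_add(uint16_t a, uint16_t b)
{
	uint32_t sum = (uint32_t)a + b;
	return (uint16_t)(sum + (sum >> 16));
}

static uint16_t csum_words(const uint16_t *w, int n)
{
	uint16_t sum = 0;
	while (n--)
		sum = csum16_add(sum, *w++);
	return ~sum;
}

/* RFC 1624: HC' = ~(~HC + ~m + m') */
static uint16_t csum_replace(uint16_t check, uint16_t old_w, uint16_t new_w)
{
	return ~csum16_add(csum16_add(~check, ~old_w), new_w);
}

int main(void)
{
	uint16_t w[4] = { 0x4500, 0x0054, 0xc0a8, 0x0001 };
	uint16_t check = csum_words(w, 4), old_w = w[3];

	w[3] = 0xc0ff;		/* rewrite one 16-bit word */
	printf("incremental %04x, recomputed %04x\n",
	       csum_replace(check, old_w, w[3]), csum_words(w, 4));
	return 0;
}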
197
198/* Generic function for mangling variable-length address changes inside
199 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
200 * command in FTP).
201 *
202 * Takes care of all the nasty sequence number changes, checksumming,
203 * skb enlargement, ...
204 *
205 */
206int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
207 struct nf_conn *ct,
208 enum ip_conntrack_info ctinfo,
209 unsigned int match_offset,
210 unsigned int match_len,
211 const char *rep_buffer,
212 unsigned int rep_len, bool adjust)
213{
214 struct iphdr *iph;
215 struct tcphdr *tcph;
216 int oldlen, datalen;
217
218 if (!skb_make_writable(skb, skb->len))
219 return 0;
220
221 if (rep_len > match_len &&
222 rep_len - match_len > skb_tailroom(skb) &&
223 !enlarge_skb(skb, rep_len - match_len))
224 return 0;
225
226 SKB_LINEAR_ASSERT(skb);
227
228 iph = ip_hdr(skb);
229 tcph = (void *)iph + iph->ihl*4;
230
231 oldlen = skb->len - iph->ihl*4;
232 mangle_contents(skb, iph->ihl*4 + tcph->doff*4,
233 match_offset, match_len, rep_buffer, rep_len);
234
235 datalen = skb->len - iph->ihl*4;
236 nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
237
238 if (adjust && rep_len != match_len)
239 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
240 (int)rep_len - (int)match_len);
241
242 return 1;
243}
244EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
245
246/* Generic function for mangling variable-length address changes inside
247 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
248 * command in the Amanda protocol)
249 *
250 * Takes care of all the nasty sequence number changes, checksumming,
251 * skb enlargement, ...
252 *
253 * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
254 * should be fairly easy to do.
255 */
256int
257nf_nat_mangle_udp_packet(struct sk_buff *skb,
258 struct nf_conn *ct,
259 enum ip_conntrack_info ctinfo,
260 unsigned int match_offset,
261 unsigned int match_len,
262 const char *rep_buffer,
263 unsigned int rep_len)
264{
265 struct iphdr *iph;
266 struct udphdr *udph;
267 int datalen, oldlen;
268
269 if (!skb_make_writable(skb, skb->len))
270 return 0;
271
272 if (rep_len > match_len &&
273 rep_len - match_len > skb_tailroom(skb) &&
274 !enlarge_skb(skb, rep_len - match_len))
275 return 0;
276
277 iph = ip_hdr(skb);
278 udph = (void *)iph + iph->ihl*4;
279
280 oldlen = skb->len - iph->ihl*4;
281 mangle_contents(skb, iph->ihl*4 + sizeof(*udph),
282 match_offset, match_len, rep_buffer, rep_len);
283
284 /* update the length of the UDP packet */
285 datalen = skb->len - iph->ihl*4;
286 udph->len = htons(datalen);
287
288 /* fix udp checksum if udp checksum was previously calculated */
289 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
290 return 1;
291
292 nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
293
294 return 1;
295}
296EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
297
298/* Adjust one found SACK option including checksum correction */
299static void
300sack_adjust(struct sk_buff *skb,
301 struct tcphdr *tcph,
302 unsigned int sackoff,
303 unsigned int sackend,
304 struct nf_nat_seq *natseq)
305{
306 while (sackoff < sackend) {
307 struct tcp_sack_block_wire *sack;
308 __be32 new_start_seq, new_end_seq;
309
310 sack = (void *)skb->data + sackoff;
311 if (after(ntohl(sack->start_seq) - natseq->offset_before,
312 natseq->correction_pos))
313 new_start_seq = htonl(ntohl(sack->start_seq)
314 - natseq->offset_after);
315 else
316 new_start_seq = htonl(ntohl(sack->start_seq)
317 - natseq->offset_before);
318
319 if (after(ntohl(sack->end_seq) - natseq->offset_before,
320 natseq->correction_pos))
321 new_end_seq = htonl(ntohl(sack->end_seq)
322 - natseq->offset_after);
323 else
324 new_end_seq = htonl(ntohl(sack->end_seq)
325 - natseq->offset_before);
326
327 pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
328 ntohl(sack->start_seq), new_start_seq,
329 ntohl(sack->end_seq), new_end_seq);
330
331 inet_proto_csum_replace4(&tcph->check, skb,
332 sack->start_seq, new_start_seq, 0);
333 inet_proto_csum_replace4(&tcph->check, skb,
334 sack->end_seq, new_end_seq, 0);
335 sack->start_seq = new_start_seq;
336 sack->end_seq = new_end_seq;
337 sackoff += sizeof(*sack);
338 }
339}
340
341/* TCP SACK sequence number adjustment */
342static inline unsigned int
343nf_nat_sack_adjust(struct sk_buff *skb,
344 struct tcphdr *tcph,
345 struct nf_conn *ct,
346 enum ip_conntrack_info ctinfo)
347{
348 unsigned int dir, optoff, optend;
349 struct nf_conn_nat *nat = nfct_nat(ct);
350
351 optoff = ip_hdrlen(skb) + sizeof(struct tcphdr);
352 optend = ip_hdrlen(skb) + tcph->doff * 4;
353
354 if (!skb_make_writable(skb, optend))
355 return 0;
356
357 dir = CTINFO2DIR(ctinfo);
358
359 while (optoff < optend) {
360 /* Usually: option, length. */
361 unsigned char *op = skb->data + optoff;
362
363 switch (op[0]) {
364 case TCPOPT_EOL:
365 return 1;
366 case TCPOPT_NOP:
367 optoff++;
368 continue;
369 default:
370 /* no partial options */
371 if (optoff + 1 == optend ||
372 optoff + op[1] > optend ||
373 op[1] < 2)
374 return 0;
375 if (op[0] == TCPOPT_SACK &&
376 op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
377 ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
378 sack_adjust(skb, tcph, optoff+2,
379 optoff+op[1], &nat->seq[!dir]);
380 optoff += op[1];
381 }
382 }
383 return 1;
384}
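The walk above is the standard TCP option TLV parse: EOL terminates the list, NOP is a single byte, and every other option is kind/length/data with the length byte covering all three, rejecting truncated or absurd lengths. The same loop in standalone form, with a callback standing in for the SACK rewrite (a sketch; names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1

/* returns 0 on malformed options, 1 otherwise */
static int walk_tcp_options(const uint8_t *opt, size_t len,
			    void (*cb)(uint8_t kind,
				       const uint8_t *data, uint8_t dlen))
{
	size_t off = 0;

	while (off < len) {
		uint8_t kind = opt[off];

		if (kind == TCPOPT_EOL)
			return 1;
		if (kind == TCPOPT_NOP) {
			off++;
			continue;
		}
		/* no partial options: length byte present, sane, in bounds */
		if (off + 1 >= len || opt[off + 1] < 2 ||
		    off + opt[off + 1] > len)
			return 0;
		cb(kind, opt + off + 2, opt[off + 1] - 2);
		off += opt[off + 1];
	}
	return 1;
}

static void show(uint8_t kind, const uint8_t *data, uint8_t dlen)
{
	(void)data;
	printf("option %u, %u data bytes\n", kind, dlen);
}

int main(void)
{
	/* NOP, NOP, SACK (kind 5) with one block, EOL */
	uint8_t opts[] = { 1, 1, 5, 10, 0,0,0,1, 0,0,0,2, 0 };
	return !walk_tcp_options(opts, sizeof(opts), show);
}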
385
386/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
387int
388nf_nat_seq_adjust(struct sk_buff *skb,
389 struct nf_conn *ct,
390 enum ip_conntrack_info ctinfo)
391{
392 struct tcphdr *tcph;
393 int dir;
394 __be32 newseq, newack;
395 s16 seqoff, ackoff;
396 struct nf_conn_nat *nat = nfct_nat(ct);
397 struct nf_nat_seq *this_way, *other_way;
398
399 dir = CTINFO2DIR(ctinfo);
400
401 this_way = &nat->seq[dir];
402 other_way = &nat->seq[!dir];
403
404 if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
405 return 0;
406
407 tcph = (void *)skb->data + ip_hdrlen(skb);
408 if (after(ntohl(tcph->seq), this_way->correction_pos))
409 seqoff = this_way->offset_after;
410 else
411 seqoff = this_way->offset_before;
412
413 if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
414 other_way->correction_pos))
415 ackoff = other_way->offset_after;
416 else
417 ackoff = other_way->offset_before;
418
419 newseq = htonl(ntohl(tcph->seq) + seqoff);
420 newack = htonl(ntohl(tcph->ack_seq) - ackoff);
421
422 inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
423 inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
424
425 pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
426 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
427 ntohl(newack));
428
429 tcph->seq = newseq;
430 tcph->ack_seq = newack;
431
432 return nf_nat_sack_adjust(skb, tcph, ct, ctinfo);
433}
434
435/* Setup NAT on this expected conntrack so it follows master. */
436/* If we fail to get a free NAT slot, we'll get dropped on confirm */
437void nf_nat_follow_master(struct nf_conn *ct,
438 struct nf_conntrack_expect *exp)
439{
440 struct nf_nat_ipv4_range range;
441
442 /* This must be a fresh one. */
443 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
444
445 /* Change src to where master sends to */
446 range.flags = NF_NAT_RANGE_MAP_IPS;
447 range.min_ip = range.max_ip
448 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
449 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
450
451 /* For DST manip, map port here to where it's expected. */
452 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
453 range.min = range.max = exp->saved_proto;
454 range.min_ip = range.max_ip
455 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
456 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
457}
458EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
deleted file mode 100644
index 979ae165f4ef..000000000000
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ /dev/null
@@ -1,99 +0,0 @@
1/* IRC extension for TCP NAT alteration.
2 *
3 * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
4 * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
5 * based on a copy of RR's ip_nat_ftp.c
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#include <linux/module.h>
14#include <linux/moduleparam.h>
15#include <linux/tcp.h>
16#include <linux/kernel.h>
17
18#include <net/netfilter/nf_nat.h>
19#include <net/netfilter/nf_nat_helper.h>
20#include <net/netfilter/nf_nat_rule.h>
21#include <net/netfilter/nf_conntrack_helper.h>
22#include <net/netfilter/nf_conntrack_expect.h>
23#include <linux/netfilter/nf_conntrack_irc.h>
24
25MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
26MODULE_DESCRIPTION("IRC (DCC) NAT helper");
27MODULE_LICENSE("GPL");
28MODULE_ALIAS("ip_nat_irc");
29
30static unsigned int help(struct sk_buff *skb,
31 enum ip_conntrack_info ctinfo,
32 unsigned int matchoff,
33 unsigned int matchlen,
34 struct nf_conntrack_expect *exp)
35{
36	char buffer[sizeof("4294967295 65535")];
37 u_int32_t ip;
38 u_int16_t port;
39 unsigned int ret;
40
41 /* Reply comes from server. */
42 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
43 exp->dir = IP_CT_DIR_REPLY;
44 exp->expectfn = nf_nat_follow_master;
45
46 /* Try to get same port: if not, try to change it. */
47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
48 int ret;
49
50 exp->tuple.dst.u.tcp.port = htons(port);
51 ret = nf_ct_expect_related(exp);
52 if (ret == 0)
53 break;
54 else if (ret != -EBUSY) {
55 port = 0;
56 break;
57 }
58 }
59
60 if (port == 0)
61 return NF_DROP;
62
63 ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip);
64 sprintf(buffer, "%u %u", ip, port);
65 pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
66 buffer, &ip, port);
67
68 ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
69 matchoff, matchlen, buffer,
70 strlen(buffer));
71 if (ret != NF_ACCEPT)
72 nf_ct_unexpect_related(exp);
73 return ret;
74}
75
76static void __exit nf_nat_irc_fini(void)
77{
78 RCU_INIT_POINTER(nf_nat_irc_hook, NULL);
79 synchronize_rcu();
80}
81
82static int __init nf_nat_irc_init(void)
83{
84 BUG_ON(nf_nat_irc_hook != NULL);
85 RCU_INIT_POINTER(nf_nat_irc_hook, help);
86 return 0;
87}
88
89/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
90static int warn_set(const char *val, struct kernel_param *kp)
91{
92 printk(KERN_INFO KBUILD_MODNAME
93 ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
94 return 0;
95}
96module_param_call(ports, warn_set, NULL, NULL, 0);
97
98module_init(nf_nat_irc_init);
99module_exit(nf_nat_irc_fini);
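That buffer is sized for the worst-case DCC argument string, because DCC sends the IPv4 address as a plain decimal 32-bit integer followed by a port, which is exactly what the sprintf("%u %u", ...) above produces. A quick standalone check of the encoding (a sketch, illustrative only):

#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	struct in_addr a;

	if (inet_pton(AF_INET, "192.0.2.1", &a) != 1)
		return 1;
	/* DCC wire form: host-order address as decimal, then the port */
	printf("%u %u\n", ntohl(a.s_addr), 4000);	/* 3221225985 4000 */
	return 0;
}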
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
new file mode 100644
index 000000000000..d8b2e14efddc
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -0,0 +1,281 @@
1/*
2 * (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
4 * (C) 2011 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/types.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/ip.h>
15#include <linux/icmp.h>
16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h>
18#include <net/secure_seq.h>
19#include <net/checksum.h>
20#include <net/route.h>
21#include <net/ip.h>
22
23#include <net/netfilter/nf_conntrack_core.h>
24#include <net/netfilter/nf_conntrack.h>
25#include <net/netfilter/nf_nat_core.h>
26#include <net/netfilter/nf_nat_l3proto.h>
27#include <net/netfilter/nf_nat_l4proto.h>
28
29static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
30
31#ifdef CONFIG_XFRM
32static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
33 const struct nf_conn *ct,
34 enum ip_conntrack_dir dir,
35 unsigned long statusbit,
36 struct flowi *fl)
37{
38 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
39 struct flowi4 *fl4 = &fl->u.ip4;
40
41 if (ct->status & statusbit) {
42 fl4->daddr = t->dst.u3.ip;
43 if (t->dst.protonum == IPPROTO_TCP ||
44 t->dst.protonum == IPPROTO_UDP ||
45 t->dst.protonum == IPPROTO_UDPLITE ||
46 t->dst.protonum == IPPROTO_DCCP ||
47 t->dst.protonum == IPPROTO_SCTP)
48 fl4->fl4_dport = t->dst.u.all;
49 }
50
51 statusbit ^= IPS_NAT_MASK;
52
53 if (ct->status & statusbit) {
54 fl4->saddr = t->src.u3.ip;
55 if (t->dst.protonum == IPPROTO_TCP ||
56 t->dst.protonum == IPPROTO_UDP ||
57 t->dst.protonum == IPPROTO_UDPLITE ||
58 t->dst.protonum == IPPROTO_DCCP ||
59 t->dst.protonum == IPPROTO_SCTP)
60 fl4->fl4_sport = t->src.u.all;
61 }
62}
63#endif /* CONFIG_XFRM */
64
65static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
66 const struct nf_nat_range *range)
67{
68 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
69 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
70}
71
72static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
73 __be16 dport)
74{
75 return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
76}
77
78static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
79 unsigned int iphdroff,
80 const struct nf_nat_l4proto *l4proto,
81 const struct nf_conntrack_tuple *target,
82 enum nf_nat_manip_type maniptype)
83{
84 struct iphdr *iph;
85 unsigned int hdroff;
86
87 if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
88 return false;
89
90 iph = (void *)skb->data + iphdroff;
91 hdroff = iphdroff + iph->ihl * 4;
92
93 if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
94 target, maniptype))
95 return false;
96 iph = (void *)skb->data + iphdroff;
97
98 if (maniptype == NF_NAT_MANIP_SRC) {
99 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
100 iph->saddr = target->src.u3.ip;
101 } else {
102 csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
103 iph->daddr = target->dst.u3.ip;
104 }
105 return true;
106}
107
108static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
109 unsigned int iphdroff, __sum16 *check,
110 const struct nf_conntrack_tuple *t,
111 enum nf_nat_manip_type maniptype)
112{
113 struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
114 __be32 oldip, newip;
115
116 if (maniptype == NF_NAT_MANIP_SRC) {
117 oldip = iph->saddr;
118 newip = t->src.u3.ip;
119 } else {
120 oldip = iph->daddr;
121 newip = t->dst.u3.ip;
122 }
123 inet_proto_csum_replace4(check, skb, oldip, newip, 1);
124}
125
126static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
127 u8 proto, void *data, __sum16 *check,
128 int datalen, int oldlen)
129{
130 const struct iphdr *iph = ip_hdr(skb);
131 struct rtable *rt = skb_rtable(skb);
132
133 if (skb->ip_summed != CHECKSUM_PARTIAL) {
134 if (!(rt->rt_flags & RTCF_LOCAL) &&
135 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
136 skb->ip_summed = CHECKSUM_PARTIAL;
137 skb->csum_start = skb_headroom(skb) +
138 skb_network_offset(skb) +
139 ip_hdrlen(skb);
140 skb->csum_offset = (void *)check - data;
141 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
142 datalen, proto, 0);
143 } else {
144 *check = 0;
145 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
146 datalen, proto,
147 csum_partial(data, datalen,
148 0));
149 if (proto == IPPROTO_UDP && !*check)
150 *check = CSUM_MANGLED_0;
151 }
152 } else
153 inet_proto_csum_replace2(check, skb,
154 htons(oldlen), htons(datalen), 1);
155}
156
157static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
158 struct nf_nat_range *range)
159{
160 if (tb[CTA_NAT_V4_MINIP]) {
161 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
162 range->flags |= NF_NAT_RANGE_MAP_IPS;
163 }
164
165 if (tb[CTA_NAT_V4_MAXIP])
166 range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
167 else
168 range->max_addr.ip = range->min_addr.ip;
169
170 return 0;
171}
172
173static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
174 .l3proto = NFPROTO_IPV4,
175 .in_range = nf_nat_ipv4_in_range,
176 .secure_port = nf_nat_ipv4_secure_port,
177 .manip_pkt = nf_nat_ipv4_manip_pkt,
178 .csum_update = nf_nat_ipv4_csum_update,
179 .csum_recalc = nf_nat_ipv4_csum_recalc,
180 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
181#ifdef CONFIG_XFRM
182 .decode_session = nf_nat_ipv4_decode_session,
183#endif
184};
185
186int nf_nat_icmp_reply_translation(struct sk_buff *skb,
187 struct nf_conn *ct,
188 enum ip_conntrack_info ctinfo,
189 unsigned int hooknum)
190{
191 struct {
192 struct icmphdr icmp;
193 struct iphdr ip;
194 } *inside;
195 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
196 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
197 unsigned int hdrlen = ip_hdrlen(skb);
198 const struct nf_nat_l4proto *l4proto;
199 struct nf_conntrack_tuple target;
200 unsigned long statusbit;
201
202 NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
203
204 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
205 return 0;
206 if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
207 return 0;
208
209 inside = (void *)skb->data + hdrlen;
210 if (inside->icmp.type == ICMP_REDIRECT) {
211 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
212 return 0;
213 if (ct->status & IPS_NAT_MASK)
214 return 0;
215 }
216
217 if (manip == NF_NAT_MANIP_SRC)
218 statusbit = IPS_SRC_NAT;
219 else
220 statusbit = IPS_DST_NAT;
221
222 /* Invert if this is reply direction */
223 if (dir == IP_CT_DIR_REPLY)
224 statusbit ^= IPS_NAT_MASK;
225
226 if (!(ct->status & statusbit))
227 return 1;
228
229 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
230 if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
231 l4proto, &ct->tuplehash[!dir].tuple, !manip))
232 return 0;
233
234 if (skb->ip_summed != CHECKSUM_PARTIAL) {
235 /* Reloading "inside" here since manip_pkt may reallocate */
236 inside = (void *)skb->data + hdrlen;
237 inside->icmp.checksum = 0;
238 inside->icmp.checksum =
239 csum_fold(skb_checksum(skb, hdrlen,
240 skb->len - hdrlen, 0));
241 }
242
243 /* Change outer to look like the reply to an incoming packet */
244 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
245 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
246 if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
247 return 0;
248
249 return 1;
250}
251EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
252
253static int __init nf_nat_l3proto_ipv4_init(void)
254{
255 int err;
256
257 err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
258 if (err < 0)
259 goto err1;
260 err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
261 if (err < 0)
262 goto err2;
263 return err;
264
265err2:
266 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
267err1:
268 return err;
269}
270
271static void __exit nf_nat_l3proto_ipv4_exit(void)
272{
273 nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
274 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
275}
276
277MODULE_LICENSE("GPL");
278MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
279
280module_init(nf_nat_l3proto_ipv4_init);
281module_exit(nf_nat_l3proto_ipv4_exit);
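The init/exit pair above follows the usual kernel unwind idiom: register in order, and on failure jump to a label that undoes only what has already succeeded, in reverse order. Reduced to its skeleton (standalone sketch; the two register functions are stand-ins):

#include <stdio.h>

static int register_a(void) { return 0; }
static int register_b(void) { return -1; }	/* pretend this fails */
static void unregister_a(void) { }

static int init(void)
{
	int err;

	err = register_a();
	if (err < 0)
		goto err1;
	err = register_b();
	if (err < 0)
		goto err2;
	return 0;

err2:
	/* undo everything that succeeded, newest first */
	unregister_a();
err1:
	return err;
}

int main(void)
{
	printf("init: %d\n", init());	/* -1, with register_a() unwound */
	return 0;
}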
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 388140881ebe..a06d7d74817d 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -22,7 +22,6 @@
22 22
23#include <net/netfilter/nf_nat.h> 23#include <net/netfilter/nf_nat.h>
24#include <net/netfilter/nf_nat_helper.h> 24#include <net/netfilter/nf_nat_helper.h>
25#include <net/netfilter/nf_nat_rule.h>
26#include <net/netfilter/nf_conntrack_helper.h> 25#include <net/netfilter/nf_conntrack_helper.h>
27#include <net/netfilter/nf_conntrack_expect.h> 26#include <net/netfilter/nf_conntrack_expect.h>
28#include <net/netfilter/nf_conntrack_zones.h> 27#include <net/netfilter/nf_conntrack_zones.h>
@@ -47,7 +46,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
47 struct nf_conntrack_tuple t; 46 struct nf_conntrack_tuple t;
48 const struct nf_ct_pptp_master *ct_pptp_info; 47 const struct nf_ct_pptp_master *ct_pptp_info;
49 const struct nf_nat_pptp *nat_pptp_info; 48 const struct nf_nat_pptp *nat_pptp_info;
50 struct nf_nat_ipv4_range range; 49 struct nf_nat_range range;
51 50
52 ct_pptp_info = nfct_help_data(master); 51 ct_pptp_info = nfct_help_data(master);
53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 52 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
@@ -89,21 +88,21 @@ static void pptp_nat_expected(struct nf_conn *ct,
89 88
90 /* Change src to where master sends to */ 89 /* Change src to where master sends to */
91 range.flags = NF_NAT_RANGE_MAP_IPS; 90 range.flags = NF_NAT_RANGE_MAP_IPS;
92 range.min_ip = range.max_ip 91 range.min_addr = range.max_addr
93 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; 92 = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
94 if (exp->dir == IP_CT_DIR_ORIGINAL) { 93 if (exp->dir == IP_CT_DIR_ORIGINAL) {
95 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 94 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
96 range.min = range.max = exp->saved_proto; 95 range.min_proto = range.max_proto = exp->saved_proto;
97 } 96 }
98 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); 97 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
99 98
100 /* For DST manip, map port here to where it's expected. */ 99 /* For DST manip, map port here to where it's expected. */
101 range.flags = NF_NAT_RANGE_MAP_IPS; 100 range.flags = NF_NAT_RANGE_MAP_IPS;
102 range.min_ip = range.max_ip 101 range.min_addr = range.max_addr
103 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; 102 = ct->master->tuplehash[!exp->dir].tuple.src.u3;
104 if (exp->dir == IP_CT_DIR_REPLY) { 103 if (exp->dir == IP_CT_DIR_REPLY) {
105 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 104 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
106 range.min = range.max = exp->saved_proto; 105 range.min_proto = range.max_proto = exp->saved_proto;
107 } 106 }
108 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); 107 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
109} 108}
@@ -113,6 +112,7 @@ static int
113pptp_outbound_pkt(struct sk_buff *skb, 112pptp_outbound_pkt(struct sk_buff *skb,
114 struct nf_conn *ct, 113 struct nf_conn *ct,
115 enum ip_conntrack_info ctinfo, 114 enum ip_conntrack_info ctinfo,
115 unsigned int protoff,
116 struct PptpControlHeader *ctlh, 116 struct PptpControlHeader *ctlh,
117 union pptp_ctrl_union *pptpReq) 117 union pptp_ctrl_union *pptpReq)
118 118
@@ -175,7 +175,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
175 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); 175 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
176 176
177 /* mangle packet */ 177 /* mangle packet */
178 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 178 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
179 cid_off + sizeof(struct pptp_pkt_hdr) + 179 cid_off + sizeof(struct pptp_pkt_hdr) +
180 sizeof(struct PptpControlHeader), 180 sizeof(struct PptpControlHeader),
181 sizeof(new_callid), (char *)&new_callid, 181 sizeof(new_callid), (char *)&new_callid,
@@ -216,6 +216,7 @@ static int
216pptp_inbound_pkt(struct sk_buff *skb, 216pptp_inbound_pkt(struct sk_buff *skb,
217 struct nf_conn *ct, 217 struct nf_conn *ct,
218 enum ip_conntrack_info ctinfo, 218 enum ip_conntrack_info ctinfo,
219 unsigned int protoff,
219 struct PptpControlHeader *ctlh, 220 struct PptpControlHeader *ctlh,
220 union pptp_ctrl_union *pptpReq) 221 union pptp_ctrl_union *pptpReq)
221{ 222{
@@ -268,7 +269,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
268 pr_debug("altering peer call id from 0x%04x to 0x%04x\n", 269 pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
269 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); 270 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
270 271
271 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 272 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
272 pcid_off + sizeof(struct pptp_pkt_hdr) + 273 pcid_off + sizeof(struct pptp_pkt_hdr) +
273 sizeof(struct PptpControlHeader), 274 sizeof(struct PptpControlHeader),
274 sizeof(new_pcid), (char *)&new_pcid, 275 sizeof(new_pcid), (char *)&new_pcid,
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
deleted file mode 100644
index 9993bc93e102..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ /dev/null
@@ -1,114 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2008 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/random.h>
12#include <linux/ip.h>
13
14#include <linux/netfilter.h>
15#include <linux/export.h>
16#include <net/secure_seq.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_core.h>
19#include <net/netfilter/nf_nat_rule.h>
20#include <net/netfilter/nf_nat_protocol.h>
21
22bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
23 enum nf_nat_manip_type maniptype,
24 const union nf_conntrack_man_proto *min,
25 const union nf_conntrack_man_proto *max)
26{
27 __be16 port;
28
29 if (maniptype == NF_NAT_MANIP_SRC)
30 port = tuple->src.u.all;
31 else
32 port = tuple->dst.u.all;
33
34 return ntohs(port) >= ntohs(min->all) &&
35 ntohs(port) <= ntohs(max->all);
36}
37EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
38
39void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
40 const struct nf_nat_ipv4_range *range,
41 enum nf_nat_manip_type maniptype,
42 const struct nf_conn *ct,
43 u_int16_t *rover)
44{
45 unsigned int range_size, min, i;
46 __be16 *portptr;
47 u_int16_t off;
48
49 if (maniptype == NF_NAT_MANIP_SRC)
50 portptr = &tuple->src.u.all;
51 else
52 portptr = &tuple->dst.u.all;
53
54 /* If no range specified... */
55 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
56 /* If it's dst rewrite, can't change port */
57 if (maniptype == NF_NAT_MANIP_DST)
58 return;
59
60 if (ntohs(*portptr) < 1024) {
61 /* Loose convention: >> 512 is credential passing */
62 if (ntohs(*portptr) < 512) {
63 min = 1;
64 range_size = 511 - min + 1;
65 } else {
66 min = 600;
67 range_size = 1023 - min + 1;
68 }
69 } else {
70 min = 1024;
71 range_size = 65535 - 1024 + 1;
72 }
73 } else {
74 min = ntohs(range->min.all);
75 range_size = ntohs(range->max.all) - min + 1;
76 }
77
78 if (range->flags & NF_NAT_RANGE_PROTO_RANDOM)
79 off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip,
80 maniptype == NF_NAT_MANIP_SRC
81 ? tuple->dst.u.all
82 : tuple->src.u.all);
83 else
84 off = *rover;
85
86 for (i = 0; ; ++off) {
87 *portptr = htons(min + off % range_size);
88 if (++i != range_size && nf_nat_used_tuple(tuple, ct))
89 continue;
90 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM))
91 *rover = off;
92 return;
93 }
94 return;
95}
96EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
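The search above walks the port range starting at either a per-protocol rover or a hash-derived offset, wrapping modulo the range size, and stops at the first tuple not already taken. Its core, with in_use() standing in for nf_nat_used_tuple() (standalone sketch; names are illustrative):

#include <stdint.h>
#include <stdio.h>

static int in_use(uint16_t port)
{
	return port == 1024 || port == 1025;	/* pretend these are taken */
}

/* returns 0 if the whole range is exhausted */
static uint16_t pick_port(uint16_t min, uint32_t range_size, uint16_t *rover)
{
	uint32_t i;
	uint16_t off = *rover;

	for (i = 0; i < range_size; i++, off++) {
		uint16_t port = min + off % range_size;

		if (in_use(port))
			continue;
		*rover = off;	/* remember where we stopped */
		return port;
	}
	return 0;
}

int main(void)
{
	uint16_t rover = 0;

	printf("picked %u\n", pick_port(1024, 64512, &rover));	/* 1026 */
	return 0;
}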
97
98#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
99int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
100 struct nf_nat_ipv4_range *range)
101{
102 if (tb[CTA_PROTONAT_PORT_MIN]) {
103 range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
104 range->max.all = range->min.tcp.port;
105 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
106 }
107 if (tb[CTA_PROTONAT_PORT_MAX]) {
108 range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
109 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
110 }
111 return 0;
112}
113EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
114#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
deleted file mode 100644
index 3f67138d187c..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ /dev/null
@@ -1,106 +0,0 @@
1/*
2 * DCCP NAT protocol helper
3 *
4 * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 */
11
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/skbuff.h>
16#include <linux/ip.h>
17#include <linux/dccp.h>
18
19#include <net/netfilter/nf_conntrack.h>
20#include <net/netfilter/nf_nat.h>
21#include <net/netfilter/nf_nat_protocol.h>
22
23static u_int16_t dccp_port_rover;
24
25static void
26dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
27 const struct nf_nat_ipv4_range *range,
28 enum nf_nat_manip_type maniptype,
29 const struct nf_conn *ct)
30{
31 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
32 &dccp_port_rover);
33}
34
35static bool
36dccp_manip_pkt(struct sk_buff *skb,
37 unsigned int iphdroff,
38 const struct nf_conntrack_tuple *tuple,
39 enum nf_nat_manip_type maniptype)
40{
41 const struct iphdr *iph = (const void *)(skb->data + iphdroff);
42 struct dccp_hdr *hdr;
43 unsigned int hdroff = iphdroff + iph->ihl * 4;
44 __be32 oldip, newip;
45 __be16 *portptr, oldport, newport;
46 int hdrsize = 8; /* DCCP connection tracking guarantees this much */
47
48 if (skb->len >= hdroff + sizeof(struct dccp_hdr))
49 hdrsize = sizeof(struct dccp_hdr);
50
51 if (!skb_make_writable(skb, hdroff + hdrsize))
52 return false;
53
54 iph = (struct iphdr *)(skb->data + iphdroff);
55 hdr = (struct dccp_hdr *)(skb->data + hdroff);
56
57 if (maniptype == NF_NAT_MANIP_SRC) {
58 oldip = iph->saddr;
59 newip = tuple->src.u3.ip;
60 newport = tuple->src.u.dccp.port;
61 portptr = &hdr->dccph_sport;
62 } else {
63 oldip = iph->daddr;
64 newip = tuple->dst.u3.ip;
65 newport = tuple->dst.u.dccp.port;
66 portptr = &hdr->dccph_dport;
67 }
68
69 oldport = *portptr;
70 *portptr = newport;
71
72 if (hdrsize < sizeof(*hdr))
73 return true;
74
75 inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1);
76 inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
77 0);
78 return true;
79}
80
81static const struct nf_nat_protocol nf_nat_protocol_dccp = {
82 .protonum = IPPROTO_DCCP,
83 .manip_pkt = dccp_manip_pkt,
84 .in_range = nf_nat_proto_in_range,
85 .unique_tuple = dccp_unique_tuple,
86#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
87 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
88#endif
89};
90
91static int __init nf_nat_proto_dccp_init(void)
92{
93 return nf_nat_protocol_register(&nf_nat_protocol_dccp);
94}
95
96static void __exit nf_nat_proto_dccp_fini(void)
97{
98 nf_nat_protocol_unregister(&nf_nat_protocol_dccp);
99}
100
101module_init(nf_nat_proto_dccp_init);
102module_exit(nf_nat_proto_dccp_fini);
103
104MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
105MODULE_DESCRIPTION("DCCP NAT protocol helper");
106MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 46ba0b9ab985..ea44f02563b5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -28,8 +28,7 @@
28#include <linux/ip.h> 28#include <linux/ip.h>
29 29
30#include <net/netfilter/nf_nat.h> 30#include <net/netfilter/nf_nat.h>
31#include <net/netfilter/nf_nat_rule.h> 31#include <net/netfilter/nf_nat_l4proto.h>
32#include <net/netfilter/nf_nat_protocol.h>
33#include <linux/netfilter/nf_conntrack_proto_gre.h> 32#include <linux/netfilter/nf_conntrack_proto_gre.h>
34 33
35MODULE_LICENSE("GPL"); 34MODULE_LICENSE("GPL");
@@ -38,8 +37,9 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
38 37
39/* generate unique tuple ... */ 38/* generate unique tuple ... */
40static void 39static void
41gre_unique_tuple(struct nf_conntrack_tuple *tuple, 40gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
42 const struct nf_nat_ipv4_range *range, 41 struct nf_conntrack_tuple *tuple,
42 const struct nf_nat_range *range,
43 enum nf_nat_manip_type maniptype, 43 enum nf_nat_manip_type maniptype,
44 const struct nf_conn *ct) 44 const struct nf_conn *ct)
45{ 45{
@@ -62,8 +62,8 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
62 min = 1; 62 min = 1;
63 range_size = 0xffff; 63 range_size = 0xffff;
64 } else { 64 } else {
65 min = ntohs(range->min.gre.key); 65 min = ntohs(range->min_proto.gre.key);
66 range_size = ntohs(range->max.gre.key) - min + 1; 66 range_size = ntohs(range->max_proto.gre.key) - min + 1;
67 } 67 }
68 68
69 pr_debug("min = %u, range_size = %u\n", min, range_size); 69 pr_debug("min = %u, range_size = %u\n", min, range_size);
@@ -80,14 +80,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
80 80
81/* manipulate a GRE packet according to maniptype */ 81/* manipulate a GRE packet according to maniptype */
82static bool 82static bool
83gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, 83gre_manip_pkt(struct sk_buff *skb,
84 const struct nf_nat_l3proto *l3proto,
85 unsigned int iphdroff, unsigned int hdroff,
84 const struct nf_conntrack_tuple *tuple, 86 const struct nf_conntrack_tuple *tuple,
85 enum nf_nat_manip_type maniptype) 87 enum nf_nat_manip_type maniptype)
86{ 88{
87 const struct gre_hdr *greh; 89 const struct gre_hdr *greh;
88 struct gre_hdr_pptp *pgreh; 90 struct gre_hdr_pptp *pgreh;
89 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
90 unsigned int hdroff = iphdroff + iph->ihl * 4;
91 91
92 /* pgreh includes two optional 32bit fields which are not required 92 /* pgreh includes two optional 32bit fields which are not required
93 * to be there. That's where the magic '8' comes from */ 93 * to be there. That's where the magic '8' comes from */
@@ -117,24 +117,24 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
117 return true; 117 return true;
118} 118}
119 119
120static const struct nf_nat_protocol gre = { 120static const struct nf_nat_l4proto gre = {
121 .protonum = IPPROTO_GRE, 121 .l4proto = IPPROTO_GRE,
122 .manip_pkt = gre_manip_pkt, 122 .manip_pkt = gre_manip_pkt,
123 .in_range = nf_nat_proto_in_range, 123 .in_range = nf_nat_l4proto_in_range,
124 .unique_tuple = gre_unique_tuple, 124 .unique_tuple = gre_unique_tuple,
125#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 125#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
126 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 126 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
127#endif 127#endif
128}; 128};
129 129
130static int __init nf_nat_proto_gre_init(void) 130static int __init nf_nat_proto_gre_init(void)
131{ 131{
132 return nf_nat_protocol_register(&gre); 132 return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
133} 133}
134 134
135static void __exit nf_nat_proto_gre_fini(void) 135static void __exit nf_nat_proto_gre_fini(void)
136{ 136{
137 nf_nat_protocol_unregister(&gre); 137 nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
138} 138}
139 139
140module_init(nf_nat_proto_gre_init); 140module_init(nf_nat_proto_gre_init);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index b35172851bae..eb303471bcf6 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -15,8 +15,7 @@
15#include <linux/netfilter.h> 15#include <linux/netfilter.h>
16#include <net/netfilter/nf_nat.h> 16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_core.h> 17#include <net/netfilter/nf_nat_core.h>
18#include <net/netfilter/nf_nat_rule.h> 18#include <net/netfilter/nf_nat_l4proto.h>
19#include <net/netfilter/nf_nat_protocol.h>
20 19
21static bool 20static bool
22icmp_in_range(const struct nf_conntrack_tuple *tuple, 21icmp_in_range(const struct nf_conntrack_tuple *tuple,
@@ -29,8 +28,9 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
29} 28}
30 29
31static void 30static void
32icmp_unique_tuple(struct nf_conntrack_tuple *tuple, 31icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
33 const struct nf_nat_ipv4_range *range, 32 struct nf_conntrack_tuple *tuple,
33 const struct nf_nat_range *range,
34 enum nf_nat_manip_type maniptype, 34 enum nf_nat_manip_type maniptype,
35 const struct nf_conn *ct) 35 const struct nf_conn *ct)
36{ 36{
@@ -38,13 +38,14 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
38 unsigned int range_size; 38 unsigned int range_size;
39 unsigned int i; 39 unsigned int i;
40 40
41 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; 41 range_size = ntohs(range->max_proto.icmp.id) -
42 ntohs(range->min_proto.icmp.id) + 1;
42 /* If no range specified... */ 43 /* If no range specified... */
43 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) 44 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
44 range_size = 0xFFFF; 45 range_size = 0xFFFF;
45 46
46 for (i = 0; ; ++id) { 47 for (i = 0; ; ++id) {
47 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + 48 tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
48 (id % range_size)); 49 (id % range_size));
49 if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) 50 if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
50 return; 51 return;
@@ -54,13 +55,12 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
54 55
55static bool 56static bool
56icmp_manip_pkt(struct sk_buff *skb, 57icmp_manip_pkt(struct sk_buff *skb,
57 unsigned int iphdroff, 58 const struct nf_nat_l3proto *l3proto,
59 unsigned int iphdroff, unsigned int hdroff,
58 const struct nf_conntrack_tuple *tuple, 60 const struct nf_conntrack_tuple *tuple,
59 enum nf_nat_manip_type maniptype) 61 enum nf_nat_manip_type maniptype)
60{ 62{
61 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
62 struct icmphdr *hdr; 63 struct icmphdr *hdr;
63 unsigned int hdroff = iphdroff + iph->ihl*4;
64 64
65 if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) 65 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
66 return false; 66 return false;
@@ -72,12 +72,12 @@ icmp_manip_pkt(struct sk_buff *skb,
72 return true; 72 return true;
73} 73}
74 74
75const struct nf_nat_protocol nf_nat_protocol_icmp = { 75const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
76 .protonum = IPPROTO_ICMP, 76 .l4proto = IPPROTO_ICMP,
77 .manip_pkt = icmp_manip_pkt, 77 .manip_pkt = icmp_manip_pkt,
78 .in_range = icmp_in_range, 78 .in_range = icmp_in_range,
79 .unique_tuple = icmp_unique_tuple, 79 .unique_tuple = icmp_unique_tuple,
80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
81 .nlattr_to_range = nf_nat_proto_nlattr_to_range, 81 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
82#endif 82#endif
83}; 83};
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
deleted file mode 100644
index 3cce9b6c1c29..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ /dev/null
@@ -1,96 +0,0 @@
1/*
2 * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/ip.h>
12#include <linux/sctp.h>
13#include <linux/module.h>
14#include <net/sctp/checksum.h>
15
16#include <net/netfilter/nf_nat_protocol.h>
17
18static u_int16_t nf_sctp_port_rover;
19
20static void
21sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
22 const struct nf_nat_ipv4_range *range,
23 enum nf_nat_manip_type maniptype,
24 const struct nf_conn *ct)
25{
26 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
27 &nf_sctp_port_rover);
28}
29
30static bool
31sctp_manip_pkt(struct sk_buff *skb,
32 unsigned int iphdroff,
33 const struct nf_conntrack_tuple *tuple,
34 enum nf_nat_manip_type maniptype)
35{
36 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
37 struct sk_buff *frag;
38 sctp_sctphdr_t *hdr;
39 unsigned int hdroff = iphdroff + iph->ihl*4;
40 __be32 oldip, newip;
41 __be32 crc32;
42
43 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
44 return false;
45
46 iph = (struct iphdr *)(skb->data + iphdroff);
47 hdr = (struct sctphdr *)(skb->data + hdroff);
48
49 if (maniptype == NF_NAT_MANIP_SRC) {
50 /* Get rid of src ip and src pt */
51 oldip = iph->saddr;
52 newip = tuple->src.u3.ip;
53 hdr->source = tuple->src.u.sctp.port;
54 } else {
55 /* Get rid of dst ip and dst pt */
56 oldip = iph->daddr;
57 newip = tuple->dst.u3.ip;
58 hdr->dest = tuple->dst.u.sctp.port;
59 }
60
61 crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
62 skb_walk_frags(skb, frag)
63 crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
64 crc32);
65 crc32 = sctp_end_cksum(crc32);
66 hdr->checksum = crc32;
67
68 return true;
69}
70
71static const struct nf_nat_protocol nf_nat_protocol_sctp = {
72 .protonum = IPPROTO_SCTP,
73 .manip_pkt = sctp_manip_pkt,
74 .in_range = nf_nat_proto_in_range,
75 .unique_tuple = sctp_unique_tuple,
76#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
77 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
78#endif
79};
80
81static int __init nf_nat_proto_sctp_init(void)
82{
83 return nf_nat_protocol_register(&nf_nat_protocol_sctp);
84}
85
86static void __exit nf_nat_proto_sctp_exit(void)
87{
88 nf_nat_protocol_unregister(&nf_nat_protocol_sctp);
89}
90
91module_init(nf_nat_proto_sctp_init);
92module_exit(nf_nat_proto_sctp_exit);
93
94MODULE_LICENSE("GPL");
95MODULE_DESCRIPTION("SCTP NAT protocol helper");
96MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
deleted file mode 100644
index 9fb4b4e72bbf..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ /dev/null
@@ -1,91 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/init.h>
11#include <linux/export.h>
12#include <linux/ip.h>
13#include <linux/tcp.h>
14
15#include <linux/netfilter.h>
16#include <linux/netfilter/nfnetlink_conntrack.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20#include <net/netfilter/nf_nat_core.h>
21
22static u_int16_t tcp_port_rover;
23
24static void
25tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
26 const struct nf_nat_ipv4_range *range,
27 enum nf_nat_manip_type maniptype,
28 const struct nf_conn *ct)
29{
30 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
31}
32
33static bool
34tcp_manip_pkt(struct sk_buff *skb,
35 unsigned int iphdroff,
36 const struct nf_conntrack_tuple *tuple,
37 enum nf_nat_manip_type maniptype)
38{
39 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
40 struct tcphdr *hdr;
41 unsigned int hdroff = iphdroff + iph->ihl*4;
42 __be32 oldip, newip;
43 __be16 *portptr, newport, oldport;
44 int hdrsize = 8; /* TCP connection tracking guarantees this much */
45
46	/* this could be an inner header returned in an icmp packet; in such
47	   cases we cannot update the checksum field since it is outside of
48	   the 8 bytes of transport layer headers we are guaranteed */
49 if (skb->len >= hdroff + sizeof(struct tcphdr))
50 hdrsize = sizeof(struct tcphdr);
51
52 if (!skb_make_writable(skb, hdroff + hdrsize))
53 return false;
54
55 iph = (struct iphdr *)(skb->data + iphdroff);
56 hdr = (struct tcphdr *)(skb->data + hdroff);
57
58 if (maniptype == NF_NAT_MANIP_SRC) {
59 /* Get rid of src ip and src pt */
60 oldip = iph->saddr;
61 newip = tuple->src.u3.ip;
62 newport = tuple->src.u.tcp.port;
63 portptr = &hdr->source;
64 } else {
65 /* Get rid of dst ip and dst pt */
66 oldip = iph->daddr;
67 newip = tuple->dst.u3.ip;
68 newport = tuple->dst.u.tcp.port;
69 portptr = &hdr->dest;
70 }
71
72 oldport = *portptr;
73 *portptr = newport;
74
75 if (hdrsize < sizeof(*hdr))
76 return true;
77
78 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
79 inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
80 return true;
81}
82
83const struct nf_nat_protocol nf_nat_protocol_tcp = {
84 .protonum = IPPROTO_TCP,
85 .manip_pkt = tcp_manip_pkt,
86 .in_range = nf_nat_proto_in_range,
87 .unique_tuple = tcp_unique_tuple,
88#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
89 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
90#endif
91};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
deleted file mode 100644
index 9883336e628f..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ /dev/null
@@ -1,82 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/export.h>
11#include <linux/init.h>
12#include <linux/ip.h>
13#include <linux/udp.h>
14
15#include <linux/netfilter.h>
16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_core.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20
21static u_int16_t udp_port_rover;
22
23static void
24udp_unique_tuple(struct nf_conntrack_tuple *tuple,
25 const struct nf_nat_ipv4_range *range,
26 enum nf_nat_manip_type maniptype,
27 const struct nf_conn *ct)
28{
29 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
30}
31
32static bool
33udp_manip_pkt(struct sk_buff *skb,
34 unsigned int iphdroff,
35 const struct nf_conntrack_tuple *tuple,
36 enum nf_nat_manip_type maniptype)
37{
38 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
39 struct udphdr *hdr;
40 unsigned int hdroff = iphdroff + iph->ihl*4;
41 __be32 oldip, newip;
42 __be16 *portptr, newport;
43
44 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
45 return false;
46
47 iph = (struct iphdr *)(skb->data + iphdroff);
48 hdr = (struct udphdr *)(skb->data + hdroff);
49
50 if (maniptype == NF_NAT_MANIP_SRC) {
51 /* Get rid of src ip and src pt */
52 oldip = iph->saddr;
53 newip = tuple->src.u3.ip;
54 newport = tuple->src.u.udp.port;
55 portptr = &hdr->source;
56 } else {
57 /* Get rid of dst ip and dst pt */
58 oldip = iph->daddr;
59 newip = tuple->dst.u3.ip;
60 newport = tuple->dst.u.udp.port;
61 portptr = &hdr->dest;
62 }
63 if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
64 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
65 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
66 0);
67 if (!hdr->check)
68 hdr->check = CSUM_MANGLED_0;
69 }
70 *portptr = newport;
71 return true;
72}
73
74const struct nf_nat_protocol nf_nat_protocol_udp = {
75 .protonum = IPPROTO_UDP,
76 .manip_pkt = udp_manip_pkt,
77 .in_range = nf_nat_proto_in_range,
78 .unique_tuple = udp_unique_tuple,
79#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
80 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
81#endif
82};
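The guard above (if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL)) exists because a UDP checksum of zero means "sender computed no checksum": a packet that carried no checksum must not suddenly gain one, and a freshly computed checksum that happens to come out as zero must go on the wire as 0xFFFF, which is what CSUM_MANGLED_0 encodes. The rule in isolation (a sketch):

#include <stdint.h>
#include <stdio.h>

#define CSUM_MANGLED_0	0xffffu		/* one's-complement "negative zero" */

static uint16_t udp_store_csum(uint16_t computed)
{
	/* zero on the wire would mean "no checksum", so never emit it */
	return computed ? computed : CSUM_MANGLED_0;
}

int main(void)
{
	printf("%#x %#x\n", udp_store_csum(0), udp_store_csum(0x1234));
	return 0;
}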
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index d24d10a7beb2..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,98 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2008 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/types.h>
11#include <linux/init.h>
12#include <linux/ip.h>
13#include <linux/udp.h>
14
15#include <linux/netfilter.h>
16#include <linux/module.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_protocol.h>
19
20static u_int16_t udplite_port_rover;
21
22static void
23udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
24 const struct nf_nat_ipv4_range *range,
25 enum nf_nat_manip_type maniptype,
26 const struct nf_conn *ct)
27{
28 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
29 &udplite_port_rover);
30}
31
32static bool
33udplite_manip_pkt(struct sk_buff *skb,
34 unsigned int iphdroff,
35 const struct nf_conntrack_tuple *tuple,
36 enum nf_nat_manip_type maniptype)
37{
38 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
39 struct udphdr *hdr;
40 unsigned int hdroff = iphdroff + iph->ihl*4;
41 __be32 oldip, newip;
42 __be16 *portptr, newport;
43
44 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
45 return false;
46
47 iph = (struct iphdr *)(skb->data + iphdroff);
48 hdr = (struct udphdr *)(skb->data + hdroff);
49
50 if (maniptype == NF_NAT_MANIP_SRC) {
51 /* Get rid of src ip and src pt */
52 oldip = iph->saddr;
53 newip = tuple->src.u3.ip;
54 newport = tuple->src.u.udp.port;
55 portptr = &hdr->source;
56 } else {
57 /* Get rid of dst ip and dst pt */
58 oldip = iph->daddr;
59 newip = tuple->dst.u3.ip;
60 newport = tuple->dst.u.udp.port;
61 portptr = &hdr->dest;
62 }
63
64 inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
65 inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
66 if (!hdr->check)
67 hdr->check = CSUM_MANGLED_0;
68
69 *portptr = newport;
70 return true;
71}
72
73static const struct nf_nat_protocol nf_nat_protocol_udplite = {
74 .protonum = IPPROTO_UDPLITE,
75 .manip_pkt = udplite_manip_pkt,
76 .in_range = nf_nat_proto_in_range,
77 .unique_tuple = udplite_unique_tuple,
78#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
79 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
80#endif
81};
82
83static int __init nf_nat_proto_udplite_init(void)
84{
85 return nf_nat_protocol_register(&nf_nat_protocol_udplite);
86}
87
88static void __exit nf_nat_proto_udplite_fini(void)
89{
90 nf_nat_protocol_unregister(&nf_nat_protocol_udplite);
91}
92
93module_init(nf_nat_proto_udplite_init);
94module_exit(nf_nat_proto_udplite_fini);
95
96MODULE_LICENSE("GPL");
97MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
98MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
deleted file mode 100644
index e0afe8112b1c..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/* The "unknown" protocol. This is what is used for protocols we
2 * don't understand. It's returned by ip_ct_find_proto().
3 */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/types.h>
14#include <linux/init.h>
15
16#include <linux/netfilter.h>
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_rule.h>
19#include <net/netfilter/nf_nat_protocol.h>
20
21static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
22 enum nf_nat_manip_type manip_type,
23 const union nf_conntrack_man_proto *min,
24 const union nf_conntrack_man_proto *max)
25{
26 return true;
27}
28
29static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
30 const struct nf_nat_ipv4_range *range,
31 enum nf_nat_manip_type maniptype,
32 const struct nf_conn *ct)
33{
34 /* Sorry: we can't help you; if it's not unique, we can't frob
35 anything. */
36 return;
37}
38
39static bool
40unknown_manip_pkt(struct sk_buff *skb,
41 unsigned int iphdroff,
42 const struct nf_conntrack_tuple *tuple,
43 enum nf_nat_manip_type maniptype)
44{
45 return true;
46}
47
48const struct nf_nat_protocol nf_nat_unknown_protocol = {
49 .manip_pkt = unknown_manip_pkt,
50 .in_range = unknown_in_range,
51 .unique_tuple = unknown_unique_tuple,
52};
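
nf_nat_unknown_protocol is a classic null object: every operation succeeds while changing nothing, so the NAT core can call through the ops table for unrecognized protocols without NULL checks. A stand-alone C sketch of the same pattern (names here are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct proto_ops {
	bool (*manip)(int pkt);
	bool (*in_range)(int val);
};

static bool noop_manip(int pkt)    { (void)pkt; return true; }
static bool noop_in_range(int val) { (void)val; return true; }

static const struct proto_ops unknown_ops = {
	.manip    = noop_manip,
	.in_range = noop_in_range,
};

static const struct proto_ops *lookup(int protonum)
{
	(void)protonum;		/* no table here: everything is "unknown" */
	return &unknown_ops;
}

int main(void)
{
	const struct proto_ops *ops = lookup(99);

	/* callers never branch on NULL; the default just does nothing */
	printf("%d\n", ops->manip(0) && ops->in_range(0));
	return 0;
}
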
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
deleted file mode 100644
index d2a9dc314e0e..000000000000
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ /dev/null
@@ -1,214 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* Everything about the rules for NAT. */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/types.h>
12#include <linux/ip.h>
13#include <linux/netfilter.h>
14#include <linux/netfilter_ipv4.h>
15#include <linux/module.h>
16#include <linux/kmod.h>
17#include <linux/skbuff.h>
18#include <linux/proc_fs.h>
19#include <linux/slab.h>
20#include <net/checksum.h>
21#include <net/route.h>
22#include <linux/bitops.h>
23
24#include <linux/netfilter_ipv4/ip_tables.h>
25#include <net/netfilter/nf_nat.h>
26#include <net/netfilter/nf_nat_core.h>
27#include <net/netfilter/nf_nat_rule.h>
28
29#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
30 (1 << NF_INET_POST_ROUTING) | \
31 (1 << NF_INET_LOCAL_OUT) | \
32 (1 << NF_INET_LOCAL_IN))
33
34static const struct xt_table nat_table = {
35 .name = "nat",
36 .valid_hooks = NAT_VALID_HOOKS,
37 .me = THIS_MODULE,
38 .af = NFPROTO_IPV4,
39};
40
41/* Source NAT */
42static unsigned int
43ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
44{
45 struct nf_conn *ct;
46 enum ip_conntrack_info ctinfo;
47 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
48
49 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
50 par->hooknum == NF_INET_LOCAL_IN);
51
52 ct = nf_ct_get(skb, &ctinfo);
53
54 /* Connection must be valid and new. */
55 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
56 ctinfo == IP_CT_RELATED_REPLY));
57 NF_CT_ASSERT(par->out != NULL);
58
59 return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_SRC);
60}
61
62static unsigned int
63ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
64{
65 struct nf_conn *ct;
66 enum ip_conntrack_info ctinfo;
67 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
68
69 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
70 par->hooknum == NF_INET_LOCAL_OUT);
71
72 ct = nf_ct_get(skb, &ctinfo);
73
74 /* Connection must be valid and new. */
75 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
76
77 return nf_nat_setup_info(ct, &mr->range[0], NF_NAT_MANIP_DST);
78}
79
80static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
81{
82 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
83
84 /* Must be a valid range */
85 if (mr->rangesize != 1) {
86 pr_info("SNAT: multiple ranges no longer supported\n");
87 return -EINVAL;
88 }
89 return 0;
90}
91
92static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
93{
94 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
95
96 /* Must be a valid range */
97 if (mr->rangesize != 1) {
98 pr_info("DNAT: multiple ranges no longer supported\n");
99 return -EINVAL;
100 }
101 return 0;
102}
103
104static unsigned int
105alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
106{
107 /* Force range to this IP; let proto decide mapping for
108 per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
109 */
110 struct nf_nat_ipv4_range range;
111
112 range.flags = 0;
113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
114 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
117
118 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
119}
120
121int nf_nat_rule_find(struct sk_buff *skb,
122 unsigned int hooknum,
123 const struct net_device *in,
124 const struct net_device *out,
125 struct nf_conn *ct)
126{
127 struct net *net = nf_ct_net(ct);
128 int ret;
129
130 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
131
132 if (ret == NF_ACCEPT) {
133 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
134 /* NULL mapping */
135 ret = alloc_null_binding(ct, hooknum);
136 }
137 return ret;
138}
139
140static struct xt_target ipt_snat_reg __read_mostly = {
141 .name = "SNAT",
142 .target = ipt_snat_target,
143 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
144 .table = "nat",
145 .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
146 .checkentry = ipt_snat_checkentry,
147 .family = AF_INET,
148};
149
150static struct xt_target ipt_dnat_reg __read_mostly = {
151 .name = "DNAT",
152 .target = ipt_dnat_target,
153 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
154 .table = "nat",
155 .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
156 .checkentry = ipt_dnat_checkentry,
157 .family = AF_INET,
158};
159
160static int __net_init nf_nat_rule_net_init(struct net *net)
161{
162 struct ipt_replace *repl;
163
164 repl = ipt_alloc_initial_table(&nat_table);
165 if (repl == NULL)
166 return -ENOMEM;
167 net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
168 kfree(repl);
169 if (IS_ERR(net->ipv4.nat_table))
170 return PTR_ERR(net->ipv4.nat_table);
171 return 0;
172}
173
174static void __net_exit nf_nat_rule_net_exit(struct net *net)
175{
176 ipt_unregister_table(net, net->ipv4.nat_table);
177}
178
179static struct pernet_operations nf_nat_rule_net_ops = {
180 .init = nf_nat_rule_net_init,
181 .exit = nf_nat_rule_net_exit,
182};
183
184int __init nf_nat_rule_init(void)
185{
186 int ret;
187
188 ret = register_pernet_subsys(&nf_nat_rule_net_ops);
189 if (ret != 0)
190 goto out;
191 ret = xt_register_target(&ipt_snat_reg);
192 if (ret != 0)
193 goto unregister_table;
194
195 ret = xt_register_target(&ipt_dnat_reg);
196 if (ret != 0)
197 goto unregister_snat;
198
199 return ret;
200
201 unregister_snat:
202 xt_unregister_target(&ipt_snat_reg);
203 unregister_table:
204 unregister_pernet_subsys(&nf_nat_rule_net_ops);
205 out:
206 return ret;
207}
208
209void nf_nat_rule_cleanup(void)
210{
211 xt_unregister_target(&ipt_dnat_reg);
212 xt_unregister_target(&ipt_snat_reg);
213 unregister_pernet_subsys(&nf_nat_rule_net_ops);
214}
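
nf_nat_rule_init() above uses the kernel's usual goto-based unwind: resources are acquired in order and, on failure, released in exactly the reverse order, with one label per partially-constructed state. A self-contained sketch of the idiom, with step()/undo() standing in for the register/unregister calls:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("register %s\n", name);
	return fail ? -1 : 0;
}

static void undo(const char *name)
{
	printf("unregister %s\n", name);
}

static int init(void)
{
	int ret;

	ret = step("pernet", 0);
	if (ret)
		goto out;
	ret = step("snat", 0);
	if (ret)
		goto unregister_pernet;
	ret = step("dnat", 1);		/* simulate a late failure */
	if (ret)
		goto unregister_snat;
	return 0;

unregister_snat:
	undo("snat");
unregister_pernet:
	undo("pernet");
out:
	return ret;
}

int main(void) { return init() ? 1 : 0; }
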
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
deleted file mode 100644
index 9c87cde28ff8..000000000000
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ /dev/null
@@ -1,572 +0,0 @@
1/* SIP extension for NAT alteration.
2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_nat_ftp.c and other modules.
5 * (C) 2007 United Security Providers
6 * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/ip.h>
16#include <net/ip.h>
17#include <linux/udp.h>
18#include <linux/tcp.h>
19
20#include <net/netfilter/nf_nat.h>
21#include <net/netfilter/nf_nat_helper.h>
22#include <net/netfilter/nf_nat_rule.h>
23#include <net/netfilter/nf_conntrack_helper.h>
24#include <net/netfilter/nf_conntrack_expect.h>
25#include <linux/netfilter/nf_conntrack_sip.h>
26
27MODULE_LICENSE("GPL");
28MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
29MODULE_DESCRIPTION("SIP NAT helper");
30MODULE_ALIAS("ip_nat_sip");
31
32
33static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
34 const char **dptr, unsigned int *datalen,
35 unsigned int matchoff, unsigned int matchlen,
36 const char *buffer, unsigned int buflen)
37{
38 enum ip_conntrack_info ctinfo;
39 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
40 struct tcphdr *th;
41 unsigned int baseoff;
42
43 if (nf_ct_protonum(ct) == IPPROTO_TCP) {
44 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
45 baseoff = ip_hdrlen(skb) + th->doff * 4;
46 matchoff += dataoff - baseoff;
47
48 if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
49 matchoff, matchlen,
50 buffer, buflen, false))
51 return 0;
52 } else {
53 baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
54 matchoff += dataoff - baseoff;
55
56 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
57 matchoff, matchlen,
58 buffer, buflen))
59 return 0;
60 }
61
62 /* Reload data pointer and adjust datalen value */
63 *dptr = skb->data + dataoff;
64 *datalen += buflen - matchlen;
65 return 1;
66}
67
68static int map_addr(struct sk_buff *skb, unsigned int dataoff,
69 const char **dptr, unsigned int *datalen,
70 unsigned int matchoff, unsigned int matchlen,
71 union nf_inet_addr *addr, __be16 port)
72{
73 enum ip_conntrack_info ctinfo;
74 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
75 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
76 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
77 unsigned int buflen;
78 __be32 newaddr;
79 __be16 newport;
80
81 if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip &&
82 ct->tuplehash[dir].tuple.src.u.udp.port == port) {
83 newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
84 newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
85 } else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip &&
86 ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
87 newaddr = ct->tuplehash[!dir].tuple.src.u3.ip;
88 newport = ct->tuplehash[!dir].tuple.src.u.udp.port;
89 } else
90 return 1;
91
92 if (newaddr == addr->ip && newport == port)
93 return 1;
94
95 buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
96
97 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
98 buffer, buflen);
99}
100
101static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
102 const char **dptr, unsigned int *datalen,
103 enum sip_header_types type)
104{
105 enum ip_conntrack_info ctinfo;
106 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
107 unsigned int matchlen, matchoff;
108 union nf_inet_addr addr;
109 __be16 port;
110
111 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
112 &matchoff, &matchlen, &addr, &port) <= 0)
113 return 1;
114 return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
115 &addr, port);
116}
117
118static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
119 const char **dptr, unsigned int *datalen)
120{
121 enum ip_conntrack_info ctinfo;
122 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
123 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
124 unsigned int coff, matchoff, matchlen;
125 enum sip_header_types hdr;
126 union nf_inet_addr addr;
127 __be16 port;
128 int request, in_header;
129
130 /* Basic rules: requests and responses. */
131 if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
132 if (ct_sip_parse_request(ct, *dptr, *datalen,
133 &matchoff, &matchlen,
134 &addr, &port) > 0 &&
135 !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
136 &addr, port))
137 return NF_DROP;
138 request = 1;
139 } else
140 request = 0;
141
142 if (nf_ct_protonum(ct) == IPPROTO_TCP)
143 hdr = SIP_HDR_VIA_TCP;
144 else
145 hdr = SIP_HDR_VIA_UDP;
146
147 /* Translate topmost Via header and parameters */
148 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
149 hdr, NULL, &matchoff, &matchlen,
150 &addr, &port) > 0) {
151 unsigned int olen, matchend, poff, plen, buflen, n;
152 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
153
154 /* We're only interested in headers related to this
155 * connection */
156 if (request) {
157 if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip ||
158 port != ct->tuplehash[dir].tuple.src.u.udp.port)
159 goto next;
160 } else {
161 if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip ||
162 port != ct->tuplehash[dir].tuple.dst.u.udp.port)
163 goto next;
164 }
165
166 olen = *datalen;
167 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
168 &addr, port))
169 return NF_DROP;
170
171 matchend = matchoff + matchlen + *datalen - olen;
172
173 /* The maddr= parameter (RFC 3261) specifies where to send
174 * the reply. */
175 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
176 "maddr=", &poff, &plen,
177 &addr, true) > 0 &&
178 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
179 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
180 buflen = sprintf(buffer, "%pI4",
181 &ct->tuplehash[!dir].tuple.dst.u3.ip);
182 if (!mangle_packet(skb, dataoff, dptr, datalen,
183 poff, plen, buffer, buflen))
184 return NF_DROP;
185 }
186
187 /* The received= parameter (RFC 3261) contains the address
188 * from which the server received the request. */
189 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
190 "received=", &poff, &plen,
191 &addr, false) > 0 &&
192 addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
193 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
194 buflen = sprintf(buffer, "%pI4",
195 &ct->tuplehash[!dir].tuple.src.u3.ip);
196 if (!mangle_packet(skb, dataoff, dptr, datalen,
197 poff, plen, buffer, buflen))
198 return NF_DROP;
199 }
200
201 /* The rport= parameter (RFC 3581) contains the port number
202 * from which the server received the request. */
203 if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
204 "rport=", &poff, &plen,
205 &n) > 0 &&
206 htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
207 htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
208 __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
209 buflen = sprintf(buffer, "%u", ntohs(p));
210 if (!mangle_packet(skb, dataoff, dptr, datalen,
211 poff, plen, buffer, buflen))
212 return NF_DROP;
213 }
214 }
215
216next:
217 /* Translate Contact headers */
218 coff = 0;
219 in_header = 0;
220 while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
221 SIP_HDR_CONTACT, &in_header,
222 &matchoff, &matchlen,
223 &addr, &port) > 0) {
224 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
225 &addr, port))
226 return NF_DROP;
227 }
228
229 if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
230 !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
231 return NF_DROP;
232
233 return NF_ACCEPT;
234}
235
236static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
237{
238 enum ip_conntrack_info ctinfo;
239 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
240 const struct tcphdr *th;
241
242 if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
243 return;
244
245 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
246 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
247}
248
249/* Handles expected signalling connections and media streams */
250static void ip_nat_sip_expected(struct nf_conn *ct,
251 struct nf_conntrack_expect *exp)
252{
253 struct nf_nat_ipv4_range range;
254
255 /* This must be a fresh one. */
256 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
257
258 /* For DST manip, map port here to where it's expected. */
259 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
260 range.min = range.max = exp->saved_proto;
261 range.min_ip = range.max_ip = exp->saved_ip;
262 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
263
264 /* Change src to where master sends to, but only if the connection
265 * actually came from the same source. */
266 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip ==
267 ct->master->tuplehash[exp->dir].tuple.src.u3.ip) {
268 range.flags = NF_NAT_RANGE_MAP_IPS;
269 range.min_ip = range.max_ip
270 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
271 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
272 }
273}
274
275static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
276 const char **dptr, unsigned int *datalen,
277 struct nf_conntrack_expect *exp,
278 unsigned int matchoff,
279 unsigned int matchlen)
280{
281 enum ip_conntrack_info ctinfo;
282 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
283 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
284 __be32 newip;
285 u_int16_t port;
286 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
287 unsigned int buflen;
288
289 /* Connection will come from reply */
290 if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
291 newip = exp->tuple.dst.u3.ip;
292 else
293 newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
294
295 /* If the signalling port matches the connection's source port in the
296 * original direction, try to use the destination port in the opposite
297 * direction. */
298 if (exp->tuple.dst.u.udp.port ==
299 ct->tuplehash[dir].tuple.src.u.udp.port)
300 port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
301 else
302 port = ntohs(exp->tuple.dst.u.udp.port);
303
304 exp->saved_ip = exp->tuple.dst.u3.ip;
305 exp->tuple.dst.u3.ip = newip;
306 exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
307 exp->dir = !dir;
308 exp->expectfn = ip_nat_sip_expected;
309
310 for (; port != 0; port++) {
311 int ret;
312
313 exp->tuple.dst.u.udp.port = htons(port);
314 ret = nf_ct_expect_related(exp);
315 if (ret == 0)
316 break;
317 else if (ret != -EBUSY) {
318 port = 0;
319 break;
320 }
321 }
322
323 if (port == 0)
324 return NF_DROP;
325
326 if (exp->tuple.dst.u3.ip != exp->saved_ip ||
327 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
328 buflen = sprintf(buffer, "%pI4:%u", &newip, port);
329 if (!mangle_packet(skb, dataoff, dptr, datalen,
330 matchoff, matchlen, buffer, buflen))
331 goto err;
332 }
333 return NF_ACCEPT;
334
335err:
336 nf_ct_unexpect_related(exp);
337 return NF_DROP;
338}
339
340static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
341 const char **dptr, unsigned int *datalen)
342{
343 enum ip_conntrack_info ctinfo;
344 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
345 unsigned int matchoff, matchlen;
346 char buffer[sizeof("65536")];
347 int buflen, c_len;
348
349 /* Get actual SDP length */
350 if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
351 SDP_HDR_VERSION, SDP_HDR_UNSPEC,
352 &matchoff, &matchlen) <= 0)
353 return 0;
354 c_len = *datalen - matchoff + strlen("v=");
355
356 /* Now, update SDP length */
357 if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
358 &matchoff, &matchlen) <= 0)
359 return 0;
360
361 buflen = sprintf(buffer, "%u", c_len);
362 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
363 buffer, buflen);
364}
365
366static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
367 const char **dptr, unsigned int *datalen,
368 unsigned int sdpoff,
369 enum sdp_header_types type,
370 enum sdp_header_types term,
371 char *buffer, int buflen)
372{
373 enum ip_conntrack_info ctinfo;
374 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
375 unsigned int matchlen, matchoff;
376
377 if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
378 &matchoff, &matchlen) <= 0)
379 return -ENOENT;
380 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
381 buffer, buflen) ? 0 : -EINVAL;
382}
383
384static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
385 const char **dptr, unsigned int *datalen,
386 unsigned int sdpoff,
387 enum sdp_header_types type,
388 enum sdp_header_types term,
389 const union nf_inet_addr *addr)
390{
391 char buffer[sizeof("nnn.nnn.nnn.nnn")];
392 unsigned int buflen;
393
394 buflen = sprintf(buffer, "%pI4", &addr->ip);
395 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
396 buffer, buflen))
397 return 0;
398
399 return mangle_content_len(skb, dataoff, dptr, datalen);
400}
401
402static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
403 const char **dptr, unsigned int *datalen,
404 unsigned int matchoff,
405 unsigned int matchlen,
406 u_int16_t port)
407{
408 char buffer[sizeof("nnnnn")];
409 unsigned int buflen;
410
411 buflen = sprintf(buffer, "%u", port);
412 if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
413 buffer, buflen))
414 return 0;
415
416 return mangle_content_len(skb, dataoff, dptr, datalen);
417}
418
419static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
420 const char **dptr, unsigned int *datalen,
421 unsigned int sdpoff,
422 const union nf_inet_addr *addr)
423{
424 char buffer[sizeof("nnn.nnn.nnn.nnn")];
425 unsigned int buflen;
426
427 /* Mangle session description owner and contact addresses */
428 buflen = sprintf(buffer, "%pI4", &addr->ip);
429 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
430 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
431 buffer, buflen))
432 return 0;
433
434 switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
435 SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
436 buffer, buflen)) {
437 case 0:
438 /*
439 * RFC 2327:
440 *
441 * Session description
442 *
443 * c=* (connection information - not required if included in all media)
444 */
445 case -ENOENT:
446 break;
447 default:
448 return 0;
449 }
450
451 return mangle_content_len(skb, dataoff, dptr, datalen);
452}
453
454/* So, this packet has hit the connection tracking matching code.
455 Mangle it, and change the expectation to match the new version. */
456static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
457 const char **dptr, unsigned int *datalen,
458 struct nf_conntrack_expect *rtp_exp,
459 struct nf_conntrack_expect *rtcp_exp,
460 unsigned int mediaoff,
461 unsigned int medialen,
462 union nf_inet_addr *rtp_addr)
463{
464 enum ip_conntrack_info ctinfo;
465 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
466 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
467 u_int16_t port;
468
469 /* Connection will come from reply */
470 if (ct->tuplehash[dir].tuple.src.u3.ip ==
471 ct->tuplehash[!dir].tuple.dst.u3.ip)
472 rtp_addr->ip = rtp_exp->tuple.dst.u3.ip;
473 else
474 rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
475
476 rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip;
477 rtp_exp->tuple.dst.u3.ip = rtp_addr->ip;
478 rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
479 rtp_exp->dir = !dir;
480 rtp_exp->expectfn = ip_nat_sip_expected;
481
482 rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip;
483 rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip;
484 rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
485 rtcp_exp->dir = !dir;
486 rtcp_exp->expectfn = ip_nat_sip_expected;
487
488 /* Try to get same pair of ports: if not, try to change them. */
489 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
490 port != 0; port += 2) {
491 int ret;
492
493 rtp_exp->tuple.dst.u.udp.port = htons(port);
494 ret = nf_ct_expect_related(rtp_exp);
495 if (ret == -EBUSY)
496 continue;
497 else if (ret < 0) {
498 port = 0;
499 break;
500 }
501 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
502 ret = nf_ct_expect_related(rtcp_exp);
503 if (ret == 0)
504 break;
505 else if (ret == -EBUSY) {
506 nf_ct_unexpect_related(rtp_exp);
507 continue;
508 } else if (ret < 0) {
509 nf_ct_unexpect_related(rtp_exp);
510 port = 0;
511 break;
512 }
513 }
514
515 if (port == 0)
516 goto err1;
517
518 /* Update media port. */
519 if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
520 !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
521 mediaoff, medialen, port))
522 goto err2;
523
524 return NF_ACCEPT;
525
526err2:
527 nf_ct_unexpect_related(rtp_exp);
528 nf_ct_unexpect_related(rtcp_exp);
529err1:
530 return NF_DROP;
531}
532
533static struct nf_ct_helper_expectfn sip_nat = {
534 .name = "sip",
535 .expectfn = ip_nat_sip_expected,
536};
537
538static void __exit nf_nat_sip_fini(void)
539{
540 RCU_INIT_POINTER(nf_nat_sip_hook, NULL);
541 RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, NULL);
542 RCU_INIT_POINTER(nf_nat_sip_expect_hook, NULL);
543 RCU_INIT_POINTER(nf_nat_sdp_addr_hook, NULL);
544 RCU_INIT_POINTER(nf_nat_sdp_port_hook, NULL);
545 RCU_INIT_POINTER(nf_nat_sdp_session_hook, NULL);
546 RCU_INIT_POINTER(nf_nat_sdp_media_hook, NULL);
547 nf_ct_helper_expectfn_unregister(&sip_nat);
548 synchronize_rcu();
549}
550
551static int __init nf_nat_sip_init(void)
552{
553 BUG_ON(nf_nat_sip_hook != NULL);
554 BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
555 BUG_ON(nf_nat_sip_expect_hook != NULL);
556 BUG_ON(nf_nat_sdp_addr_hook != NULL);
557 BUG_ON(nf_nat_sdp_port_hook != NULL);
558 BUG_ON(nf_nat_sdp_session_hook != NULL);
559 BUG_ON(nf_nat_sdp_media_hook != NULL);
560 RCU_INIT_POINTER(nf_nat_sip_hook, ip_nat_sip);
561 RCU_INIT_POINTER(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
562 RCU_INIT_POINTER(nf_nat_sip_expect_hook, ip_nat_sip_expect);
563 RCU_INIT_POINTER(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
564 RCU_INIT_POINTER(nf_nat_sdp_port_hook, ip_nat_sdp_port);
565 RCU_INIT_POINTER(nf_nat_sdp_session_hook, ip_nat_sdp_session);
566 RCU_INIT_POINTER(nf_nat_sdp_media_hook, ip_nat_sdp_media);
567 nf_ct_helper_expectfn_register(&sip_nat);
568 return 0;
569}
570
571module_init(nf_nat_sip_init);
572module_exit(nf_nat_sip_fini);
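
The port loop in ip_nat_sdp_media() above tries to keep RTP and RTCP on a conventional even/odd port pair: it reserves two expectations at a time, backs the RTP one out when the RTCP slot is busy, and advances by two. A simplified user-space sketch of that search, with reserve()/release() standing in for nf_ct_expect_related()/nf_ct_unexpect_related():

#include <stdbool.h>
#include <stdio.h>

static bool used[65536];

static bool reserve(int port)
{
	if (used[port])
		return false;
	used[port] = true;
	return true;
}

static void release(int port) { used[port] = false; }

static int grab_pair(int start)
{
	int port;

	for (port = start & ~1; port != 0 && port < 65534; port += 2) {
		if (!reserve(port))
			continue;
		if (reserve(port + 1))
			return port;	/* got both RTP and RTCP */
		release(port);		/* RTCP slot busy: give RTP back */
	}
	return -1;
}

int main(void)
{
	used[9000] = used[9003] = true;	/* pretend these are taken */

	/* prints 9004: 9000 is busy, and 9002 loses 9003 to the collision */
	printf("pair starts at %d\n", grab_pair(9000));
	return 0;
}
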
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
deleted file mode 100644
index 9dbb8d284f99..000000000000
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ /dev/null
@@ -1,51 +0,0 @@
1/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 as
5 * published by the Free Software Foundation.
6 */
7
8#include <linux/module.h>
9#include <linux/udp.h>
10
11#include <net/netfilter/nf_conntrack_helper.h>
12#include <net/netfilter/nf_conntrack_expect.h>
13#include <net/netfilter/nf_nat_helper.h>
14#include <net/netfilter/nf_nat_rule.h>
15#include <linux/netfilter/nf_conntrack_tftp.h>
16
17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
18MODULE_DESCRIPTION("TFTP NAT helper");
19MODULE_LICENSE("GPL");
20MODULE_ALIAS("ip_nat_tftp");
21
22static unsigned int help(struct sk_buff *skb,
23 enum ip_conntrack_info ctinfo,
24 struct nf_conntrack_expect *exp)
25{
26 const struct nf_conn *ct = exp->master;
27
28 exp->saved_proto.udp.port
29 = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
30 exp->dir = IP_CT_DIR_REPLY;
31 exp->expectfn = nf_nat_follow_master;
32 if (nf_ct_expect_related(exp) != 0)
33 return NF_DROP;
34 return NF_ACCEPT;
35}
36
37static void __exit nf_nat_tftp_fini(void)
38{
39 RCU_INIT_POINTER(nf_nat_tftp_hook, NULL);
40 synchronize_rcu();
41}
42
43static int __init nf_nat_tftp_init(void)
44{
45 BUG_ON(nf_nat_tftp_hook != NULL);
46 RCU_INIT_POINTER(nf_nat_tftp_hook, help);
47 return 0;
48}
49
50module_init(nf_nat_tftp_init);
51module_exit(nf_nat_tftp_fini);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 957acd12250b..8de53e1ddd54 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), 263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), 264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), 265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
266 SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
267 SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
268 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
269 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
266 SNMP_MIB_SENTINEL 270 SNMP_MIB_SENTINEL
267}; 271};
268 272
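
The four new counters surface on the TcpExt rows of /proc/net/netstat, which alternates a header line and a value line. A small reader that picks them out (a sketch; it assumes the file's long-standing two-row layout):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char names[4096], vals[4096];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f)
		return 1;
	while (fgets(names, sizeof(names), f) &&
	       fgets(vals, sizeof(vals), f)) {
		char *np, *vp, *n, *v;

		if (strncmp(names, "TcpExt:", 7))
			continue;
		/* walk the header and value rows in lockstep */
		n = strtok_r(names, " \n", &np);
		v = strtok_r(vals, " \n", &vp);
		while (n && v) {
			if (!strncmp(n, "TCPFastOpen", 11))
				printf("%s = %s\n", n, v);
			n = strtok_r(NULL, " \n", &np);
			v = strtok_r(NULL, " \n", &vp);
		}
	}
	fclose(f);
	return 0;
}
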
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fd9af60397b5..ff622069fcef 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1111,10 +1111,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1111 const struct rtable *rt = (const struct rtable *) dst; 1111 const struct rtable *rt = (const struct rtable *) dst;
1112 unsigned int mtu = rt->rt_pmtu; 1112 unsigned int mtu = rt->rt_pmtu;
1113 1113
1114 if (mtu && time_after_eq(jiffies, rt->dst.expires)) 1114 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1115 mtu = 0;
1116
1117 if (!mtu)
1118 mtu = dst_metric_raw(dst, RTAX_MTU); 1115 mtu = dst_metric_raw(dst, RTAX_MTU);
1119 1116
1120 if (mtu && rt_is_output_route(rt)) 1117 if (mtu && rt_is_output_route(rt))
@@ -1566,11 +1563,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1566 if (ipv4_is_zeronet(daddr)) 1563 if (ipv4_is_zeronet(daddr))
1567 goto martian_destination; 1564 goto martian_destination;
1568 1565
1569 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) { 1566 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1570 if (ipv4_is_loopback(daddr)) 1567 * and calls it at most once when daddr and/or saddr is a loopback address
1568 */
1569 if (ipv4_is_loopback(daddr)) {
1570 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1571 goto martian_destination; 1571 goto martian_destination;
1572 1572 } else if (ipv4_is_loopback(saddr)) {
1573 if (ipv4_is_loopback(saddr)) 1573 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1574 goto martian_source; 1574 goto martian_source;
1575 } 1575 }
1576 1576
@@ -1595,7 +1595,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1595 1595
1596 if (res.type == RTN_LOCAL) { 1596 if (res.type == RTN_LOCAL) {
1597 err = fib_validate_source(skb, saddr, daddr, tos, 1597 err = fib_validate_source(skb, saddr, daddr, tos,
1598 net->loopback_dev->ifindex, 1598 LOOPBACK_IFINDEX,
1599 dev, in_dev, &itag); 1599 dev, in_dev, &itag);
1600 if (err < 0) 1600 if (err < 0)
1601 goto martian_source_keep_err; 1601 goto martian_source_keep_err;
@@ -1871,7 +1871,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1871 1871
1872 orig_oif = fl4->flowi4_oif; 1872 orig_oif = fl4->flowi4_oif;
1873 1873
1874 fl4->flowi4_iif = net->loopback_dev->ifindex; 1874 fl4->flowi4_iif = LOOPBACK_IFINDEX;
1875 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 1875 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1876 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 1876 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1877 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 1877 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
@@ -1960,7 +1960,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1960 if (!fl4->daddr) 1960 if (!fl4->daddr)
1961 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 1961 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1962 dev_out = net->loopback_dev; 1962 dev_out = net->loopback_dev;
1963 fl4->flowi4_oif = net->loopback_dev->ifindex; 1963 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1964 res.type = RTN_LOCAL; 1964 res.type = RTN_LOCAL;
1965 flags |= RTCF_LOCAL; 1965 flags |= RTCF_LOCAL;
1966 goto make_route; 1966 goto make_route;
@@ -2131,7 +2131,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2131EXPORT_SYMBOL_GPL(ip_route_output_flow); 2131EXPORT_SYMBOL_GPL(ip_route_output_flow);
2132 2132
2133static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2133static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2134 struct flowi4 *fl4, struct sk_buff *skb, u32 pid, 2134 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2135 u32 seq, int event, int nowait, unsigned int flags) 2135 u32 seq, int event, int nowait, unsigned int flags)
2136{ 2136{
2137 struct rtable *rt = skb_rtable(skb); 2137 struct rtable *rt = skb_rtable(skb);
@@ -2141,7 +2141,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2141 u32 error; 2141 u32 error;
2142 u32 metrics[RTAX_MAX]; 2142 u32 metrics[RTAX_MAX];
2143 2143
2144 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2144 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2145 if (nlh == NULL) 2145 if (nlh == NULL)
2146 return -EMSGSIZE; 2146 return -EMSGSIZE;
2147 2147
@@ -2301,12 +2301,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2301 rt->rt_flags |= RTCF_NOTIFY; 2301 rt->rt_flags |= RTCF_NOTIFY;
2302 2302
2303 err = rt_fill_info(net, dst, src, &fl4, skb, 2303 err = rt_fill_info(net, dst, src, &fl4, skb,
2304 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2304 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2305 RTM_NEWROUTE, 0, 0); 2305 RTM_NEWROUTE, 0, 0);
2306 if (err <= 0) 2306 if (err <= 0)
2307 goto errout_free; 2307 goto errout_free;
2308 2308
2309 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); 2309 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2310errout: 2310errout:
2311 return err; 2311 return err;
2312 2312
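
The expiry test in ipv4_mtu() above leans on time_after_eq(), which stays correct when jiffies wraps because it compares the signed difference of unsigned counters rather than the raw values. A 32-bit user-space rendition of the trick:

#include <stdint.h>
#include <stdio.h>

/* 32-bit rendition of the kernel's time_after_eq() */
static int time_after_eq32(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	uint32_t expires = 0xfffffff0u;	/* set shortly before the counter wraps */
	uint32_t now     = 0x00000010u;	/* sampled just after the wrap */

	/* A plain 'now >= expires' would claim the timer has not fired. */
	printf("naive: %d  wrap-safe: %d\n", now >= expires,
	       time_after_eq32(now, expires));
	return 0;
}
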
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e6..ba48e799b031 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
319 ireq->tstamp_ok = tcp_opt.saw_tstamp; 319 ireq->tstamp_ok = tcp_opt.saw_tstamp;
320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; 320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; 321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
322 treq->listener = NULL;
322 323
323 /* We throwed the options of the initial SYN away, so we hope 324 /* We throwed the options of the initial SYN away, so we hope
324 * the ACK carries the same options again (see RFC1122 4.2.3.8) 325 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3e78c79b5586..9205e492dc9d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -232,6 +232,45 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
232 return 0; 232 return 0;
233} 233}
234 234
235int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
236 size_t *lenp, loff_t *ppos)
237{
238 ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
239 struct tcp_fastopen_context *ctxt;
240 int ret;
241 u32 user_key[4] = { 0 }; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
242
243 tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
244 if (!tbl.data)
245 return -ENOMEM;
246
247 rcu_read_lock();
248 ctxt = rcu_dereference(tcp_fastopen_ctx);
249 if (ctxt)
250 memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
251 rcu_read_unlock();
252
253 snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
254 user_key[0], user_key[1], user_key[2], user_key[3]);
255 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
256
257 if (write && ret == 0) {
258 if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
259 user_key + 2, user_key + 3) != 4) {
260 ret = -EINVAL;
261 goto bad_key;
262 }
263 tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
264 }
265
266bad_key:
267 pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
268 user_key[0], user_key[1], user_key[2], user_key[3],
269 (char *)tbl.data, ret);
270 kfree(tbl.data);
271 return ret;
272}
273
235static struct ctl_table ipv4_table[] = { 274static struct ctl_table ipv4_table[] = {
236 { 275 {
237 .procname = "tcp_timestamps", 276 .procname = "tcp_timestamps",
@@ -386,6 +425,12 @@ static struct ctl_table ipv4_table[] = {
386 .proc_handler = proc_dointvec, 425 .proc_handler = proc_dointvec,
387 }, 426 },
388 { 427 {
428 .procname = "tcp_fastopen_key",
429 .mode = 0600,
430 .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
431 .proc_handler = proc_tcp_fastopen_key,
432 },
433 {
389 .procname = "tcp_tw_recycle", 434 .procname = "tcp_tw_recycle",
390 .data = &tcp_death_row.sysctl_tw_recycle, 435 .data = &tcp_death_row.sysctl_tw_recycle,
391 .maxlen = sizeof(int), 436 .maxlen = sizeof(int),
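
The new net.ipv4.tcp_fastopen_key sysctl exchanges the 16-byte key with user space as four dash-separated 32-bit hex words ("%08x-%08x-%08x-%08x" on read, "%x-%x-%x-%x" on write). A quick round trip of that encoding (the key value is made up):

#include <stdio.h>

int main(void)
{
	unsigned int key[4] = { 0x01234567, 0x89abcdef, 0xdeadbeef, 0xfeedface };
	unsigned int back[4];
	char buf[64];

	snprintf(buf, sizeof(buf), "%08x-%08x-%08x-%08x",
		 key[0], key[1], key[2], key[3]);
	printf("%s\n", buf);		/* what a read of the sysctl shows */

	if (sscanf(buf, "%x-%x-%x-%x",
		   &back[0], &back[1], &back[2], &back[3]) != 4)
		return 1;		/* the write path rejects this with -EINVAL */
	return back[0] != key[0];	/* 0 on a clean round trip */
}
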
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5f6419341821..f32c02e2a543 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
486 if (sk->sk_shutdown & RCV_SHUTDOWN) 486 if (sk->sk_shutdown & RCV_SHUTDOWN)
487 mask |= POLLIN | POLLRDNORM | POLLRDHUP; 487 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
488 488
489 /* Connected? */ 489 /* Connected or passive Fast Open socket? */
490 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { 490 if (sk->sk_state != TCP_SYN_SENT &&
491 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
491 int target = sock_rcvlowat(sk, 0, INT_MAX); 492 int target = sock_rcvlowat(sk, 0, INT_MAX);
492 493
493 if (tp->urg_seq == tp->copied_seq && 494 if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
840 ssize_t copied; 841 ssize_t copied;
841 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 842 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
842 843
843 /* Wait for a connection to finish. */ 844 /* Wait for a connection to finish. One exception is TCP Fast Open
844 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 845 * (passive side) where data is allowed to be sent before a connection
846 * is fully established.
847 */
848 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
849 !tcp_passive_fastopen(sk)) {
845 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 850 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
846 goto out_err; 851 goto out_err;
852 }
847 853
848 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 854 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
849 855
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1042 1048
1043 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1049 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1044 1050
1045 /* Wait for a connection to finish. */ 1051 /* Wait for a connection to finish. One exception is TCP Fast Open
1046 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 1052 * (passive side) where data is allowed to be sent before a connection
1053 * is fully established.
1054 */
1055 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1056 !tcp_passive_fastopen(sk)) {
1047 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 1057 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1048 goto do_error; 1058 goto do_error;
1059 }
1049 1060
1050 if (unlikely(tp->repair)) { 1061 if (unlikely(tp->repair)) {
1051 if (tp->repair_queue == TCP_RECV_QUEUE) { 1062 if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1139,78 +1150,43 @@ new_segment:
1139 if (err) 1150 if (err)
1140 goto do_fault; 1151 goto do_fault;
1141 } else { 1152 } else {
1142 bool merge = false; 1153 bool merge = true;
1143 int i = skb_shinfo(skb)->nr_frags; 1154 int i = skb_shinfo(skb)->nr_frags;
1144 struct page *page = sk->sk_sndmsg_page; 1155 struct page_frag *pfrag = sk_page_frag(sk);
1145 int off; 1156
1146 1157 if (!sk_page_frag_refill(sk, pfrag))
1147 if (page && page_count(page) == 1) 1158 goto wait_for_memory;
1148 sk->sk_sndmsg_off = 0; 1159
1149 1160 if (!skb_can_coalesce(skb, i, pfrag->page,
1150 off = sk->sk_sndmsg_off; 1161 pfrag->offset)) {
1151 1162 if (i == MAX_SKB_FRAGS || !sg) {
1152 if (skb_can_coalesce(skb, i, page, off) && 1163 tcp_mark_push(tp, skb);
1153 off != PAGE_SIZE) { 1164 goto new_segment;
1154 /* We can extend the last page
1155 * fragment. */
1156 merge = true;
1157 } else if (i == MAX_SKB_FRAGS || !sg) {
1158 /* Need to add new fragment and cannot
1159 * do this because interface is non-SG,
1160 * or because all the page slots are
1161 * busy. */
1162 tcp_mark_push(tp, skb);
1163 goto new_segment;
1164 } else if (page) {
1165 if (off == PAGE_SIZE) {
1166 put_page(page);
1167 sk->sk_sndmsg_page = page = NULL;
1168 off = 0;
1169 } 1165 }
1170 } else 1166 merge = false;
1171 off = 0; 1167 }
1172 1168
1173 if (copy > PAGE_SIZE - off) 1169 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1174 copy = PAGE_SIZE - off;
1175 1170
1176 if (!sk_wmem_schedule(sk, copy)) 1171 if (!sk_wmem_schedule(sk, copy))
1177 goto wait_for_memory; 1172 goto wait_for_memory;
1178 1173
1179 if (!page) {
1180 /* Allocate new cache page. */
1181 if (!(page = sk_stream_alloc_page(sk)))
1182 goto wait_for_memory;
1183 }
1184
1185 /* Time to copy data. We are close to
1186 * the end! */
1187 err = skb_copy_to_page_nocache(sk, from, skb, 1174 err = skb_copy_to_page_nocache(sk, from, skb,
1188 page, off, copy); 1175 pfrag->page,
1189 if (err) { 1176 pfrag->offset,
1190 /* If this page was new, give it to the 1177 copy);
1191 * socket so it does not get leaked. 1178 if (err)
1192 */
1193 if (!sk->sk_sndmsg_page) {
1194 sk->sk_sndmsg_page = page;
1195 sk->sk_sndmsg_off = 0;
1196 }
1197 goto do_error; 1179 goto do_error;
1198 }
1199 1180
1200 /* Update the skb. */ 1181 /* Update the skb. */
1201 if (merge) { 1182 if (merge) {
1202 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1183 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1203 } else { 1184 } else {
1204 skb_fill_page_desc(skb, i, page, off, copy); 1185 skb_fill_page_desc(skb, i, pfrag->page,
1205 if (sk->sk_sndmsg_page) { 1186 pfrag->offset, copy);
1206 get_page(page); 1187 get_page(pfrag->page);
1207 } else if (off + copy < PAGE_SIZE) {
1208 get_page(page);
1209 sk->sk_sndmsg_page = page;
1210 }
1211 } 1188 }
1212 1189 pfrag->offset += copy;
1213 sk->sk_sndmsg_off = off + copy;
1214 } 1190 }
1215 1191
1216 if (!copied) 1192 if (!copied)
@@ -2150,6 +2126,10 @@ void tcp_close(struct sock *sk, long timeout)
2150 * they look as CLOSING or LAST_ACK for Linux) 2126 * they look as CLOSING or LAST_ACK for Linux)
2151 * Probably, I missed some more holelets. 2127 * Probably, I missed some more holelets.
2152 * --ANK 2128 * --ANK
2129 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2130 * in a single packet! (May consider it later but will
2131 * probably need API support or TCP_CORK SYN-ACK until
2132 * data is written and socket is closed.)
2153 */ 2133 */
2154 tcp_send_fin(sk); 2134 tcp_send_fin(sk);
2155 } 2135 }
@@ -2221,8 +2201,16 @@ adjudge_to_death:
2221 } 2201 }
2222 } 2202 }
2223 2203
2224 if (sk->sk_state == TCP_CLOSE) 2204 if (sk->sk_state == TCP_CLOSE) {
2205 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2206 /* We could get here with a non-NULL req if the socket is
2207 * aborted (e.g., closed with unread data) before 3WHS
2208 * finishes.
2209 */
2210 if (req != NULL)
2211 reqsk_fastopen_remove(sk, req, false);
2225 inet_csk_destroy_sock(sk); 2212 inet_csk_destroy_sock(sk);
2213 }
2226 /* Otherwise, socket is reprieved until protocol close. */ 2214 /* Otherwise, socket is reprieved until protocol close. */
2227 2215
2228out: 2216out:
@@ -2308,6 +2296,13 @@ int tcp_disconnect(struct sock *sk, int flags)
2308} 2296}
2309EXPORT_SYMBOL(tcp_disconnect); 2297EXPORT_SYMBOL(tcp_disconnect);
2310 2298
2299void tcp_sock_destruct(struct sock *sk)
2300{
2301 inet_sock_destruct(sk);
2302
2303 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2304}
2305
2311static inline bool tcp_can_repair_sock(const struct sock *sk) 2306static inline bool tcp_can_repair_sock(const struct sock *sk)
2312{ 2307{
2313 return capable(CAP_NET_ADMIN) && 2308 return capable(CAP_NET_ADMIN) &&
@@ -2701,6 +2696,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2701 else 2696 else
2702 icsk->icsk_user_timeout = msecs_to_jiffies(val); 2697 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2703 break; 2698 break;
2699
2700 case TCP_FASTOPEN:
2701 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2702 TCPF_LISTEN)))
2703 err = fastopen_init_queue(sk, val);
2704 else
2705 err = -EINVAL;
2706 break;
2704 default: 2707 default:
2705 err = -ENOPROTOOPT; 2708 err = -ENOPROTOOPT;
2706 break; 2709 break;
@@ -3514,11 +3517,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
3514 3517
3515void tcp_done(struct sock *sk) 3518void tcp_done(struct sock *sk)
3516{ 3519{
3520 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3521
3517 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3522 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3518 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 3523 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3519 3524
3520 tcp_set_state(sk, TCP_CLOSE); 3525 tcp_set_state(sk, TCP_CLOSE);
3521 tcp_clear_xmit_timers(sk); 3526 tcp_clear_xmit_timers(sk);
3527 if (req != NULL)
3528 reqsk_fastopen_remove(sk, req, false);
3522 3529
3523 sk->sk_shutdown = SHUTDOWN_MASK; 3530 sk->sk_shutdown = SHUTDOWN_MASK;
3524 3531
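
On the passive side, the new TCP_FASTOPEN socket option arms a listener for Fast Open by sizing the pending-TFO queue; per the setsockopt hunk above it is accepted in the CLOSE or LISTEN state. A sketch of the expected usage, assuming headers new enough to define TCP_FASTOPEN (error handling trimmed, port arbitrary):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int qlen = 5;			/* max pending TFO requests */
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(8000);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;
	/* allocates the fastopen queue (fastopen_init_queue() above) */
	if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
		perror("TCP_FASTOPEN");
	if (listen(fd, 16) < 0)
		return 1;
	close(fd);
	return 0;
}

The active side pairs with this via sendto() with the companion MSG_FASTOPEN flag, so the SYN itself can carry data.
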
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c409d7..8f7ef0ad80e5 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,91 @@
1#include <linux/err.h>
1#include <linux/init.h> 2#include <linux/init.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/list.h>
5#include <linux/tcp.h>
6#include <linux/rcupdate.h>
7#include <linux/rculist.h>
8#include <net/inetpeer.h>
9#include <net/tcp.h>
3 10
4int sysctl_tcp_fastopen; 11int sysctl_tcp_fastopen __read_mostly;
12
13struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
14
15static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
16
17static void tcp_fastopen_ctx_free(struct rcu_head *head)
18{
19 struct tcp_fastopen_context *ctx =
20 container_of(head, struct tcp_fastopen_context, rcu);
21 crypto_free_cipher(ctx->tfm);
22 kfree(ctx);
23}
24
25int tcp_fastopen_reset_cipher(void *key, unsigned int len)
26{
27 int err;
28 struct tcp_fastopen_context *ctx, *octx;
29
30 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
31 if (!ctx)
32 return -ENOMEM;
33 ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
34
35 if (IS_ERR(ctx->tfm)) {
36 err = PTR_ERR(ctx->tfm);
37error: kfree(ctx);
38 pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
39 return err;
40 }
41 err = crypto_cipher_setkey(ctx->tfm, key, len);
42 if (err) {
43 pr_err("TCP: TFO cipher key error: %d\n", err);
44 crypto_free_cipher(ctx->tfm);
45 goto error;
46 }
47 memcpy(ctx->key, key, len);
48
49 spin_lock(&tcp_fastopen_ctx_lock);
50
51 octx = rcu_dereference_protected(tcp_fastopen_ctx,
52 lockdep_is_held(&tcp_fastopen_ctx_lock));
53 rcu_assign_pointer(tcp_fastopen_ctx, ctx);
54 spin_unlock(&tcp_fastopen_ctx_lock);
55
56 if (octx)
57 call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
58 return err;
59}
60
61/* Computes the fastopen cookie for the peer.
62 * The peer address is 128 bits long (padded with zeros for IPv4).
63 *
64 * The caller must check foc->len to determine if a valid cookie
65 * has been generated successfully.
66*/
67void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
68{
69 __be32 peer_addr[4] = { addr, 0, 0, 0 };
70 struct tcp_fastopen_context *ctx;
71
72 rcu_read_lock();
73 ctx = rcu_dereference(tcp_fastopen_ctx);
74 if (ctx) {
75 crypto_cipher_encrypt_one(ctx->tfm,
76 foc->val,
77 (__u8 *)peer_addr);
78 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
79 }
80 rcu_read_unlock();
81}
5 82
6static int __init tcp_fastopen_init(void) 83static int __init tcp_fastopen_init(void)
7{ 84{
85 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
86
87 get_random_bytes(key, sizeof(key));
88 tcp_fastopen_reset_cipher(key, sizeof(key));
8 return 0; 89 return 0;
9} 90}
10 91
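
tcp_fastopen_cookie_gen() above is a single AES block: the 16-byte secret encrypts the peer address, zero-padded to 128 bits, and the first TCP_FASTOPEN_COOKIE_SIZE (8) bytes of the ciphertext become the cookie. The same computation restated with OpenSSL's low-level AES primitive (a sketch for illustration; the kernel uses its own crypto API, and the key here is made up). Build with: cc demo.c -lcrypto

#include <openssl/aes.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char key[16] = "0123456789abcdef";	/* 16-byte secret */
	unsigned char in[16] = { 0 }, out[16];
	struct in_addr peer;
	AES_KEY aes;

	inet_pton(AF_INET, "192.0.2.1", &peer);
	memcpy(in, &peer, 4);		/* IPv4 address, zero-padded to 128 bits */

	AES_set_encrypt_key(key, 128, &aes);
	AES_encrypt(in, out, &aes);

	for (int i = 0; i < 8; i++)	/* cookie = leading bytes of the block */
		printf("%02x", out[i]);
	printf("\n");
	return 0;
}
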
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d377f4854cb8..432c36649db3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,7 +237,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
237 tcp_enter_quickack_mode((struct sock *)tp); 237 tcp_enter_quickack_mode((struct sock *)tp);
238 break; 238 break;
239 case INET_ECN_CE: 239 case INET_ECN_CE:
240 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 240 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
241 /* Better not delay acks, sender can have a very low cwnd */
242 tcp_enter_quickack_mode((struct sock *)tp);
243 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
244 }
241 /* fall through */ 245 /* fall through */
242 default: 246 default:
243 tp->ecn_flags |= TCP_ECN_SEEN; 247 tp->ecn_flags |= TCP_ECN_SEEN;
@@ -374,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
374/* 4. Try to fixup all. It is made immediately after connection enters 378/* 4. Try to fixup all. It is made immediately after connection enters
375 * established state. 379 * established state.
376 */ 380 */
377static void tcp_init_buffer_space(struct sock *sk) 381void tcp_init_buffer_space(struct sock *sk)
378{ 382{
379 struct tcp_sock *tp = tcp_sk(sk); 383 struct tcp_sock *tp = tcp_sk(sk);
380 int maxwin; 384 int maxwin;
@@ -739,29 +743,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
739 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 743 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
740} 744}
741 745
742/* Set slow start threshold and cwnd not falling to slow start */
743void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
744{
745 struct tcp_sock *tp = tcp_sk(sk);
746 const struct inet_connection_sock *icsk = inet_csk(sk);
747
748 tp->prior_ssthresh = 0;
749 tp->bytes_acked = 0;
750 if (icsk->icsk_ca_state < TCP_CA_CWR) {
751 tp->undo_marker = 0;
752 if (set_ssthresh)
753 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
754 tp->snd_cwnd = min(tp->snd_cwnd,
755 tcp_packets_in_flight(tp) + 1U);
756 tp->snd_cwnd_cnt = 0;
757 tp->high_seq = tp->snd_nxt;
758 tp->snd_cwnd_stamp = tcp_time_stamp;
759 TCP_ECN_queue_cwr(tp);
760
761 tcp_set_ca_state(sk, TCP_CA_CWR);
762 }
763}
764
765/* 746/*
766 * Packet counting of FACK is based on in-order assumptions, therefore TCP 747 * Packet counting of FACK is based on in-order assumptions, therefore TCP
767 * disables it when reordering is detected 748 * disables it when reordering is detected
@@ -2489,35 +2470,6 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
2489 tp->snd_cwnd_stamp = tcp_time_stamp; 2470 tp->snd_cwnd_stamp = tcp_time_stamp;
2490} 2471}
2491 2472
2492/* Lower bound on congestion window is slow start threshold
2493 * unless congestion avoidance choice decides to override it.
2494 */
2495static inline u32 tcp_cwnd_min(const struct sock *sk)
2496{
2497 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
2498
2499 return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
2500}
2501
2502/* Decrease cwnd each second ack. */
2503static void tcp_cwnd_down(struct sock *sk, int flag)
2504{
2505 struct tcp_sock *tp = tcp_sk(sk);
2506 int decr = tp->snd_cwnd_cnt + 1;
2507
2508 if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
2509 (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
2510 tp->snd_cwnd_cnt = decr & 1;
2511 decr >>= 1;
2512
2513 if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
2514 tp->snd_cwnd -= decr;
2515
2516 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
2517 tp->snd_cwnd_stamp = tcp_time_stamp;
2518 }
2519}
2520
2521/* Nothing was retransmitted or returned timestamp is less 2473/* Nothing was retransmitted or returned timestamp is less
2522 * than timestamp of the first retransmission. 2474 * than timestamp of the first retransmission.
2523 */ 2475 */
@@ -2719,24 +2671,80 @@ static bool tcp_try_undo_loss(struct sock *sk)
2719 return false; 2671 return false;
2720} 2672}
2721 2673
2722static inline void tcp_complete_cwr(struct sock *sk) 2674/* The cwnd reduction in CWR and Recovery uses the PRR algorithm
2675 * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
2676 * It computes the number of packets to send (sndcnt) based on packets newly
2677 * delivered:
2678 * 1) If the packets in flight is larger than ssthresh, PRR spreads the
2679 * cwnd reductions across a full RTT.
2680 * 2) If packets in flight is lower than ssthresh (such as due to excess
2681 * losses and/or application stalls), do not perform any further cwnd
2682 * reductions, but instead slow start up to ssthresh.
2683 */
2684static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2723{ 2685{
2724 struct tcp_sock *tp = tcp_sk(sk); 2686 struct tcp_sock *tp = tcp_sk(sk);
2725 2687
2726 /* Do not moderate cwnd if it's already undone in cwr or recovery. */ 2688 tp->high_seq = tp->snd_nxt;
2727 if (tp->undo_marker) { 2689 tp->bytes_acked = 0;
2728 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { 2690 tp->snd_cwnd_cnt = 0;
2729 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 2691 tp->prior_cwnd = tp->snd_cwnd;
2730 tp->snd_cwnd_stamp = tcp_time_stamp; 2692 tp->prr_delivered = 0;
2731 } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { 2693 tp->prr_out = 0;
2732 /* PRR algorithm. */ 2694 if (set_ssthresh)
2733 tp->snd_cwnd = tp->snd_ssthresh; 2695 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2734 tp->snd_cwnd_stamp = tcp_time_stamp; 2696 TCP_ECN_queue_cwr(tp);
2735 } 2697}
2698
2699static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
2700 int fast_rexmit)
2701{
2702 struct tcp_sock *tp = tcp_sk(sk);
2703 int sndcnt = 0;
2704 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2705
2706 tp->prr_delivered += newly_acked_sacked;
2707 if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2708 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2709 tp->prior_cwnd - 1;
2710 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2711 } else {
2712 sndcnt = min_t(int, delta,
2713 max_t(int, tp->prr_delivered - tp->prr_out,
2714 newly_acked_sacked) + 1);
2715 }
2716
2717 sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2718 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2719}
2720
2721static inline void tcp_end_cwnd_reduction(struct sock *sk)
2722{
2723 struct tcp_sock *tp = tcp_sk(sk);
2724
2725 /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
2726 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
2727 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
2728 tp->snd_cwnd = tp->snd_ssthresh;
2729 tp->snd_cwnd_stamp = tcp_time_stamp;
2736 } 2730 }
2737 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2731 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2738} 2732}
2739 2733
2734/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2735void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
2736{
2737 struct tcp_sock *tp = tcp_sk(sk);
2738
2739 tp->prior_ssthresh = 0;
2740 tp->bytes_acked = 0;
2741 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2742 tp->undo_marker = 0;
2743 tcp_init_cwnd_reduction(sk, set_ssthresh);
2744 tcp_set_ca_state(sk, TCP_CA_CWR);
2745 }
2746}
2747
2740static void tcp_try_keep_open(struct sock *sk) 2748static void tcp_try_keep_open(struct sock *sk)
2741{ 2749{
2742 struct tcp_sock *tp = tcp_sk(sk); 2750 struct tcp_sock *tp = tcp_sk(sk);
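
With numbers plugged in, the proportional branch of tcp_cwnd_reduction() above shrinks cwnd smoothly toward ssthresh: sndcnt = ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out, then cwnd = in_flight + sndcnt (the max against fast_rexmit is dropped here for brevity). A worked instance with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int prior_cwnd = 20;	/* cwnd when the reduction started */
	unsigned int ssthresh = 10;	/* target: halve over one RTT */
	unsigned int prr_delivered = 6;	/* newly delivered so far */
	unsigned int prr_out = 2;	/* sent during the reduction so far */
	unsigned int in_flight = 14;	/* > ssthresh: proportional phase */

	/* sndcnt = ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out */
	uint64_t dividend = (uint64_t)ssthresh * prr_delivered + prior_cwnd - 1;
	int sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;

	if (sndcnt < 0)
		sndcnt = 0;
	printf("cwnd = %u\n", in_flight + sndcnt);	/* 14 + 1 = 15 */
	return 0;
}
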
@@ -2751,7 +2759,7 @@ static void tcp_try_keep_open(struct sock *sk)
2751 } 2759 }
2752} 2760}
2753 2761
2754static void tcp_try_to_open(struct sock *sk, int flag) 2762static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
2755{ 2763{
2756 struct tcp_sock *tp = tcp_sk(sk); 2764 struct tcp_sock *tp = tcp_sk(sk);
2757 2765
@@ -2768,7 +2776,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
2768 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2776 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2769 tcp_moderate_cwnd(tp); 2777 tcp_moderate_cwnd(tp);
2770 } else { 2778 } else {
2771 tcp_cwnd_down(sk, flag); 2779 tcp_cwnd_reduction(sk, newly_acked_sacked, 0);
2772 } 2780 }
2773} 2781}
2774 2782
@@ -2850,38 +2858,6 @@ void tcp_simple_retransmit(struct sock *sk)
2850} 2858}
2851EXPORT_SYMBOL(tcp_simple_retransmit); 2859EXPORT_SYMBOL(tcp_simple_retransmit);
2852 2860
2853/* This function implements the PRR algorithm, specifically the PRR-SSRB
2854 * (proportional rate reduction with slow start reduction bound) as described in
2855 * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
2856 * It computes the number of packets to send (sndcnt) based on packets newly
2857 * delivered:
2858 * 1) If the packets in flight is larger than ssthresh, PRR spreads the
2859 * cwnd reductions across a full RTT.
2860 * 2) If packets in flight is lower than ssthresh (such as due to excess
2861 * losses and/or application stalls), do not perform any further cwnd
2862 * reductions, but instead slow start up to ssthresh.
2863 */
2864static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
2865 int fast_rexmit, int flag)
2866{
2867 struct tcp_sock *tp = tcp_sk(sk);
2868 int sndcnt = 0;
2869 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2870
2871 if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2872 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2873 tp->prior_cwnd - 1;
2874 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2875 } else {
2876 sndcnt = min_t(int, delta,
2877 max_t(int, tp->prr_delivered - tp->prr_out,
2878 newly_acked_sacked) + 1);
2879 }
2880
2881 sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2882 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2883}
2884
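For reference, the helper removed here (reintroduced above as tcp_cwnd_reduction()) boils down to a few lines of arithmetic. Below is a minimal user-space model of that computation; struct prr_state and the sample values are hypothetical stand-ins for the tcp_sock fields, not kernel code.

#include <stdio.h>

/* Hypothetical stand-in for the tcp_sock fields PRR reads. */
struct prr_state {
        unsigned int snd_ssthresh;  /* target cwnd after the reduction */
        unsigned int prior_cwnd;    /* cwnd when the reduction started */
        unsigned int prr_delivered; /* packets delivered during recovery */
        unsigned int prr_out;       /* packets sent during recovery */
};

static int prr_sndcnt(const struct prr_state *s, unsigned int in_flight,
                      int newly_acked_sacked, int fast_rexmit)
{
        int sndcnt;

        if (in_flight > s->snd_ssthresh) {
                /* Proportional phase: ceil(ssthresh * delivered / prior_cwnd)
                 * minus what was already sent, spreading the cut over an RTT.
                 */
                unsigned long long dividend =
                        (unsigned long long)s->snd_ssthresh * s->prr_delivered +
                        s->prior_cwnd - 1;

                sndcnt = (int)(dividend / s->prior_cwnd) - (int)s->prr_out;
        } else {
                /* Slow-start reduction bound: climb back toward ssthresh. */
                int delta = (int)(s->snd_ssthresh - in_flight);
                int catchup = (int)(s->prr_delivered - s->prr_out);

                if (catchup < newly_acked_sacked)
                        catchup = newly_acked_sacked;
                sndcnt = catchup + 1;
                if (sndcnt > delta)
                        sndcnt = delta;
        }
        if (sndcnt < (fast_rexmit ? 1 : 0))
                sndcnt = fast_rexmit ? 1 : 0;
        return sndcnt;
}

int main(void)
{
        struct prr_state s = { .snd_ssthresh = 5, .prior_cwnd = 10,
                               .prr_delivered = 4, .prr_out = 1 };

        /* 8 packets in flight, 2 newly delivered by this ACK:
         * ceil(5 * 4 / 10) - 1 = 1, so cwnd steps down to 8 + 1 = 9.
         */
        printf("sndcnt = %d\n", prr_sndcnt(&s, 8, 2, 0));
        return 0;
}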
2885static void tcp_enter_recovery(struct sock *sk, bool ece_ack) 2861static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2886{ 2862{
2887 struct tcp_sock *tp = tcp_sk(sk); 2863 struct tcp_sock *tp = tcp_sk(sk);
@@ -2894,7 +2870,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2894 2870
2895 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2871 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2896 2872
2897 tp->high_seq = tp->snd_nxt;
2898 tp->prior_ssthresh = 0; 2873 tp->prior_ssthresh = 0;
2899 tp->undo_marker = tp->snd_una; 2874 tp->undo_marker = tp->snd_una;
2900 tp->undo_retrans = tp->retrans_out; 2875 tp->undo_retrans = tp->retrans_out;
@@ -2902,15 +2877,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2902 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2877 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2903 if (!ece_ack) 2878 if (!ece_ack)
2904 tp->prior_ssthresh = tcp_current_ssthresh(sk); 2879 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2905 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); 2880 tcp_init_cwnd_reduction(sk, true);
2906 TCP_ECN_queue_cwr(tp);
2907 } 2881 }
2908
2909 tp->bytes_acked = 0;
2910 tp->snd_cwnd_cnt = 0;
2911 tp->prior_cwnd = tp->snd_cwnd;
2912 tp->prr_delivered = 0;
2913 tp->prr_out = 0;
2914 tcp_set_ca_state(sk, TCP_CA_Recovery); 2882 tcp_set_ca_state(sk, TCP_CA_Recovery);
2915} 2883}
2916 2884
@@ -2970,7 +2938,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2970 /* CWR must be held until something *above* high_seq 2938 /* CWR must be held until something *above* high_seq
2971 * is ACKed for the CWR bit to reach the receiver. */ 2939 * is ACKed for the CWR bit to reach the receiver. */
2972 if (tp->snd_una != tp->high_seq) { 2940 if (tp->snd_una != tp->high_seq) {
2973 tcp_complete_cwr(sk); 2941 tcp_end_cwnd_reduction(sk);
2974 tcp_set_ca_state(sk, TCP_CA_Open); 2942 tcp_set_ca_state(sk, TCP_CA_Open);
2975 } 2943 }
2976 break; 2944 break;
@@ -2980,7 +2948,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2980 tcp_reset_reno_sack(tp); 2948 tcp_reset_reno_sack(tp);
2981 if (tcp_try_undo_recovery(sk)) 2949 if (tcp_try_undo_recovery(sk))
2982 return; 2950 return;
2983 tcp_complete_cwr(sk); 2951 tcp_end_cwnd_reduction(sk);
2984 break; 2952 break;
2985 } 2953 }
2986 } 2954 }
@@ -3021,7 +2989,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3021 tcp_try_undo_dsack(sk); 2989 tcp_try_undo_dsack(sk);
3022 2990
3023 if (!tcp_time_to_recover(sk, flag)) { 2991 if (!tcp_time_to_recover(sk, flag)) {
3024 tcp_try_to_open(sk, flag); 2992 tcp_try_to_open(sk, flag, newly_acked_sacked);
3025 return; 2993 return;
3026 } 2994 }
3027 2995
@@ -3043,8 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3043 3011
3044 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) 3012 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
3045 tcp_update_scoreboard(sk, fast_rexmit); 3013 tcp_update_scoreboard(sk, fast_rexmit);
3046 tp->prr_delivered += newly_acked_sacked; 3014 tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
3047 tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
3048 tcp_xmit_retransmit_queue(sk); 3015 tcp_xmit_retransmit_queue(sk);
3049} 3016}
3050 3017
@@ -3123,6 +3090,12 @@ void tcp_rearm_rto(struct sock *sk)
3123{ 3090{
3124 struct tcp_sock *tp = tcp_sk(sk); 3091 struct tcp_sock *tp = tcp_sk(sk);
3125 3092
3093 /* If the retrans timer is currently being used by Fast Open
3094 * for SYN-ACK retransmission purposes, stay put.
3095 */
3096 if (tp->fastopen_rsk)
3097 return;
3098
3126 if (!tp->packets_out) { 3099 if (!tp->packets_out) {
3127 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 3100 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3128 } else { 3101 } else {
@@ -3384,7 +3357,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3384{ 3357{
3385 const struct tcp_sock *tp = tcp_sk(sk); 3358 const struct tcp_sock *tp = tcp_sk(sk);
3386 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3359 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
3387 !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); 3360 !tcp_in_cwnd_reduction(sk);
3388} 3361}
3389 3362
3390/* Check that window update is acceptable. 3363/* Check that window update is acceptable.
@@ -3452,9 +3425,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
3452} 3425}
3453 3426
3454/* A conservative spurious RTO response algorithm: reduce cwnd using 3427/* A conservative spurious RTO response algorithm: reduce cwnd using
3455 * rate halving and continue in congestion avoidance. 3428 * PRR and continue in congestion avoidance.
3456 */ 3429 */
3457static void tcp_ratehalving_spur_to_response(struct sock *sk) 3430static void tcp_cwr_spur_to_response(struct sock *sk)
3458{ 3431{
3459 tcp_enter_cwr(sk, 0); 3432 tcp_enter_cwr(sk, 0);
3460} 3433}
@@ -3462,7 +3435,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk)
3462static void tcp_undo_spur_to_response(struct sock *sk, int flag) 3435static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3463{ 3436{
3464 if (flag & FLAG_ECE) 3437 if (flag & FLAG_ECE)
3465 tcp_ratehalving_spur_to_response(sk); 3438 tcp_cwr_spur_to_response(sk);
3466 else 3439 else
3467 tcp_undo_cwr(sk, true); 3440 tcp_undo_cwr(sk, true);
3468} 3441}
@@ -3569,7 +3542,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3569 tcp_conservative_spur_to_response(tp); 3542 tcp_conservative_spur_to_response(tp);
3570 break; 3543 break;
3571 default: 3544 default:
3572 tcp_ratehalving_spur_to_response(sk); 3545 tcp_cwr_spur_to_response(sk);
3573 break; 3546 break;
3574 } 3547 }
3575 tp->frto_counter = 0; 3548 tp->frto_counter = 0;
@@ -4034,7 +4007,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
4034} 4007}
4035 4008
4036/* When we get a reset we do this. */ 4009/* When we get a reset we do this. */
4037static void tcp_reset(struct sock *sk) 4010void tcp_reset(struct sock *sk)
4038{ 4011{
4039 /* We want the right error as BSD sees it (and indeed as we do). */ 4012 /* We want the right error as BSD sees it (and indeed as we do). */
4040 switch (sk->sk_state) { 4013 switch (sk->sk_state) {
@@ -5740,7 +5713,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5740 5713
5741 TCP_ECN_rcv_synack(tp, th); 5714 TCP_ECN_rcv_synack(tp, th);
5742 5715
5743 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 5716 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5744 tcp_ack(sk, skb, FLAG_SLOWPATH); 5717 tcp_ack(sk, skb, FLAG_SLOWPATH);
5745 5718
5746 /* Ok.. it's good. Set up sequence numbers and 5719 /* Ok.. it's good. Set up sequence numbers and
@@ -5753,7 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5753 * never scaled. 5726 * never scaled.
5754 */ 5727 */
5755 tp->snd_wnd = ntohs(th->window); 5728 tp->snd_wnd = ntohs(th->window);
5756 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5757 5729
5758 if (!tp->rx_opt.wscale_ok) { 5730 if (!tp->rx_opt.wscale_ok) {
5759 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5731 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5891,7 +5863,9 @@ discard:
5891 tcp_send_synack(sk); 5863 tcp_send_synack(sk);
5892#if 0 5864#if 0
5893 /* Note, we could accept data and URG from this segment. 5865 /* Note, we could accept data and URG from this segment.
5894 * There are no obstacles to doing this. 5866 * There are no obstacles to doing this (except that we must
5867 * either change tcp_recvmsg() to prevent it from returning data
5868 * before the 3WHS completes per RFC793, or employ TCP Fast Open).
5895 * 5869 *
5896 * However, if we ignore data in ACKless segments sometimes, 5870 * However, if we ignore data in ACKless segments sometimes,
5897 * we have no reasons to accept it sometimes. 5871 * we have no reasons to accept it sometimes.
@@ -5931,6 +5905,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5931{ 5905{
5932 struct tcp_sock *tp = tcp_sk(sk); 5906 struct tcp_sock *tp = tcp_sk(sk);
5933 struct inet_connection_sock *icsk = inet_csk(sk); 5907 struct inet_connection_sock *icsk = inet_csk(sk);
5908 struct request_sock *req;
5934 int queued = 0; 5909 int queued = 0;
5935 5910
5936 tp->rx_opt.saw_tstamp = 0; 5911 tp->rx_opt.saw_tstamp = 0;
@@ -5986,6 +5961,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5986 return 0; 5961 return 0;
5987 } 5962 }
5988 5963
5964 req = tp->fastopen_rsk;
5965 if (req != NULL) {
5966 BUG_ON(sk->sk_state != TCP_SYN_RECV &&
5967 sk->sk_state != TCP_FIN_WAIT1);
5968
5969 if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
5970 goto discard;
5971 }
5989 if (!tcp_validate_incoming(sk, skb, th, 0)) 5972 if (!tcp_validate_incoming(sk, skb, th, 0))
5990 return 0; 5973 return 0;
5991 5974
@@ -5996,7 +5979,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5996 switch (sk->sk_state) { 5979 switch (sk->sk_state) {
5997 case TCP_SYN_RECV: 5980 case TCP_SYN_RECV:
5998 if (acceptable) { 5981 if (acceptable) {
5999 tp->copied_seq = tp->rcv_nxt; 5982 /* Once we leave TCP_SYN_RECV, we no longer
5983 * need req so release it.
5984 */
5985 if (req) {
5986 tcp_synack_rtt_meas(sk, req);
5987 tp->total_retrans = req->retrans;
5988
5989 reqsk_fastopen_remove(sk, req, false);
5990 } else {
5991 /* Make sure socket is routed, for
5992 * correct metrics.
5993 */
5994 icsk->icsk_af_ops->rebuild_header(sk);
5995 tcp_init_congestion_control(sk);
5996
5997 tcp_mtup_init(sk);
5998 tcp_init_buffer_space(sk);
5999 tp->copied_seq = tp->rcv_nxt;
6000 }
6000 smp_mb(); 6001 smp_mb();
6001 tcp_set_state(sk, TCP_ESTABLISHED); 6002 tcp_set_state(sk, TCP_ESTABLISHED);
6002 sk->sk_state_change(sk); 6003 sk->sk_state_change(sk);
@@ -6018,23 +6019,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6018 if (tp->rx_opt.tstamp_ok) 6019 if (tp->rx_opt.tstamp_ok)
6019 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 6020 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6020 6021
6021 /* Make sure socket is routed, for 6022 if (req) {
6022 * correct metrics. 6023 /* Re-arm the timer because data may
6023 */ 6024 * have been sent out. This is similar
6024 icsk->icsk_af_ops->rebuild_header(sk); 6025 * to the regular data transmission case
6025 6026 * when new data has just been ack'ed.
6026 tcp_init_metrics(sk); 6027 *
6027 6028 * (TFO) - we could try to be more
6028 tcp_init_congestion_control(sk); 6029 * aggressive and retransmit any data
6030 * sooner based on when it was sent
6031 * out.
6032 */
6033 tcp_rearm_rto(sk);
6034 } else
6035 tcp_init_metrics(sk);
6029 6036
6030 /* Prevent spurious tcp_cwnd_restart() on 6037 /* Prevent spurious tcp_cwnd_restart() on
6031 * first data packet. 6038 * first data packet.
6032 */ 6039 */
6033 tp->lsndtime = tcp_time_stamp; 6040 tp->lsndtime = tcp_time_stamp;
6034 6041
6035 tcp_mtup_init(sk);
6036 tcp_initialize_rcv_mss(sk); 6042 tcp_initialize_rcv_mss(sk);
6037 tcp_init_buffer_space(sk);
6038 tcp_fast_path_on(tp); 6043 tcp_fast_path_on(tp);
6039 } else { 6044 } else {
6040 return 1; 6045 return 1;
@@ -6042,6 +6047,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6042 break; 6047 break;
6043 6048
6044 case TCP_FIN_WAIT1: 6049 case TCP_FIN_WAIT1:
6050 /* If we enter the TCP_FIN_WAIT1 state and we are a
6051 * Fast Open socket and this is the first acceptable
6052 * ACK we have received, this would have acknowledged
6053 * our SYNACK so stop the SYNACK timer.
6054 */
6055 if (acceptable && req != NULL) {
6056 /* We no longer need the request sock. */
6057 reqsk_fastopen_remove(sk, req, false);
6058 tcp_rearm_rto(sk);
6059 }
6045 if (tp->snd_una == tp->write_seq) { 6060 if (tp->snd_una == tp->write_seq) {
6046 struct dst_entry *dst; 6061 struct dst_entry *dst;
6047 6062
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index be23a0b7b89e..75735c9a6a9d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
352 const int code = icmp_hdr(icmp_skb)->code; 352 const int code = icmp_hdr(icmp_skb)->code;
353 struct sock *sk; 353 struct sock *sk;
354 struct sk_buff *skb; 354 struct sk_buff *skb;
355 struct request_sock *req;
355 __u32 seq; 356 __u32 seq;
356 __u32 remaining; 357 __u32 remaining;
357 int err; 358 int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
394 395
395 icsk = inet_csk(sk); 396 icsk = inet_csk(sk);
396 tp = tcp_sk(sk); 397 tp = tcp_sk(sk);
398 req = tp->fastopen_rsk;
397 seq = ntohl(th->seq); 399 seq = ntohl(th->seq);
398 if (sk->sk_state != TCP_LISTEN && 400 if (sk->sk_state != TCP_LISTEN &&
399 !between(seq, tp->snd_una, tp->snd_nxt)) { 401 !between(seq, tp->snd_una, tp->snd_nxt) &&
402 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
403 /* For a Fast Open socket, allow seq to be snt_isn. */
400 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 404 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
401 goto out; 405 goto out;
402 } 406 }
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
435 !icsk->icsk_backoff) 439 !icsk->icsk_backoff)
436 break; 440 break;
437 441
442 /* XXX (TFO) - revisit the following logic for TFO */
443
438 if (sock_owned_by_user(sk)) 444 if (sock_owned_by_user(sk))
439 break; 445 break;
440 446
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
466 goto out; 472 goto out;
467 } 473 }
468 474
475 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
476 * than following the TCP_SYN_RECV case and closing the socket,
477 * we ignore the ICMP error and keep trying like a fully established
478 * socket. Is this the right thing to do?
479 */
480 if (req && req->sk == NULL)
481 goto out;
482
469 switch (sk->sk_state) { 483 switch (sk->sk_state) {
470 struct request_sock *req, **prev; 484 struct request_sock *req, **prev;
471 case TCP_LISTEN: 485 case TCP_LISTEN:
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
498 512
499 case TCP_SYN_SENT: 513 case TCP_SYN_SENT:
500 case TCP_SYN_RECV: /* Cannot happen. 514 case TCP_SYN_RECV: /* Cannot happen.
501 It can f.e. if SYNs crossed. 515 It can f.e. if SYNs crossed,
516 or Fast Open.
502 */ 517 */
503 if (!sock_owned_by_user(sk)) { 518 if (!sock_owned_by_user(sk)) {
504 sk->sk_err = err; 519 sk->sk_err = err;
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
809static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, 824static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
810 struct request_sock *req) 825 struct request_sock *req)
811{ 826{
812 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, 827 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
813 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, 828 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
829 */
830 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
831 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
832 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
814 req->ts_recent, 833 req->ts_recent,
815 0, 834 0,
816 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 835 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -839,7 +858,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
839 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 858 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
840 return -1; 859 return -1;
841 860
842 skb = tcp_make_synack(sk, dst, req, rvp); 861 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
843 862
844 if (skb) { 863 if (skb) {
845 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 864 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -849,6 +868,8 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
849 ireq->rmt_addr, 868 ireq->rmt_addr,
850 ireq->opt); 869 ireq->opt);
851 err = net_xmit_eval(err); 870 err = net_xmit_eval(err);
871 if (!tcp_rsk(req)->snt_synack && !err)
872 tcp_rsk(req)->snt_synack = tcp_time_stamp;
852 } 873 }
853 874
854 return err; 875 return err;
@@ -904,8 +925,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
904/* 925/*
905 * Save and compile IPv4 options into the request_sock if needed. 926 * Save and compile IPv4 options into the request_sock if needed.
906 */ 927 */
907static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, 928static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
908 struct sk_buff *skb)
909{ 929{
910 const struct ip_options *opt = &(IPCB(skb)->opt); 930 const struct ip_options *opt = &(IPCB(skb)->opt);
911 struct ip_options_rcu *dopt = NULL; 931 struct ip_options_rcu *dopt = NULL;
@@ -1272,6 +1292,182 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1272}; 1292};
1273#endif 1293#endif
1274 1294
1295static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1296 struct request_sock *req,
1297 struct tcp_fastopen_cookie *foc,
1298 struct tcp_fastopen_cookie *valid_foc)
1299{
1300 bool skip_cookie = false;
1301 struct fastopen_queue *fastopenq;
1302
1303 if (likely(!fastopen_cookie_present(foc))) {
1304 /* See include/net/tcp.h for the meaning of these knobs */
1305 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1306 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1307 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1308 skip_cookie = true; /* no cookie to validate */
1309 else
1310 return false;
1311 }
1312 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1313 /* A FO option is present; bump the counter. */
1314 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1315
1316 /* Make sure the listener has enabled fastopen, and we don't
1317 * exceed the max # of pending TFO requests allowed before trying
1318 * to validating the cookie in order to avoid burning CPU cycles
1319 * unnecessarily.
1320 *
1321 * XXX (TFO) - The implication of checking the max_qlen before
1322 * processing a cookie request is that clients can't differentiate
1323 * between qlen overflow causing Fast Open to be disabled
1324 * temporarily vs a server not supporting Fast Open at all.
1325 */
1326 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1327 fastopenq == NULL || fastopenq->max_qlen == 0)
1328 return false;
1329
1330 if (fastopenq->qlen >= fastopenq->max_qlen) {
1331 struct request_sock *req1;
1332 spin_lock(&fastopenq->lock);
1333 req1 = fastopenq->rskq_rst_head;
1334 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1335 spin_unlock(&fastopenq->lock);
1336 NET_INC_STATS_BH(sock_net(sk),
1337 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1338 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL */
1339 foc->len = -1;
1340 return false;
1341 }
1342 fastopenq->rskq_rst_head = req1->dl_next;
1343 fastopenq->qlen--;
1344 spin_unlock(&fastopenq->lock);
1345 reqsk_free(req1);
1346 }
1347 if (skip_cookie) {
1348 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1349 return true;
1350 }
1351 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1352 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1353 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1354 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1355 memcmp(&foc->val[0], &valid_foc->val[0],
1356 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1357 return false;
1358 valid_foc->len = -1;
1359 }
1360 /* Acknowledge the data received from the peer. */
1361 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1362 return true;
1363 } else if (foc->len == 0) { /* Client requesting a cookie */
1364 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1365 NET_INC_STATS_BH(sock_net(sk),
1366 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1367 } else {
1368 /* Client sent a cookie with wrong size. Treat it
1369 * the same as invalid and return a valid one.
1370 */
1371 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1372 }
1373 return false;
1374}
1375
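Note the gating above: unless the listener has a fastopenq with a nonzero max_qlen (and the server bit is set in the tcp_fastopen sysctl), every cookie request is refused before any cookie crypto is done. A plausible user-space counterpart is sketched below; tfo_listener() is a hypothetical helper and TCP_FASTOPEN is the socket option added by this series.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN 23 /* socket option added by this series */
#endif

/* Hypothetical helper: create a TFO-enabled IPv4 listener on @port. */
static int tfo_listener(uint16_t port)
{
        struct sockaddr_in a;
        int qlen = 16; /* becomes fastopenq->max_qlen on the listener */
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        memset(&a, 0, sizeof(a));
        a.sin_family = AF_INET;
        a.sin_addr.s_addr = htonl(INADDR_ANY);
        a.sin_port = htons(port);
        if (bind(fd, (struct sockaddr *)&a, sizeof(a)) < 0 ||
            /* Without this, tcp_fastopen_check() fails its max_qlen test. */
            setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0 ||
            listen(fd, 128) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}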
1376static int tcp_v4_conn_req_fastopen(struct sock *sk,
1377 struct sk_buff *skb,
1378 struct sk_buff *skb_synack,
1379 struct request_sock *req,
1380 struct request_values *rvp)
1381{
1382 struct tcp_sock *tp = tcp_sk(sk);
1383 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1384 const struct inet_request_sock *ireq = inet_rsk(req);
1385 struct sock *child;
1386 int err;
1387
1388 req->retrans = 0;
1389 req->sk = NULL;
1390
1391 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1392 if (child == NULL) {
1393 NET_INC_STATS_BH(sock_net(sk),
1394 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1395 kfree_skb(skb_synack);
1396 return -1;
1397 }
1398 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1399 ireq->rmt_addr, ireq->opt);
1400 err = net_xmit_eval(err);
1401 if (!err)
1402 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1403 /* XXX (TFO) - is it ok to ignore error and continue? */
1404
1405 spin_lock(&queue->fastopenq->lock);
1406 queue->fastopenq->qlen++;
1407 spin_unlock(&queue->fastopenq->lock);
1408
1409 /* Initialize the child socket. Have to fix some values to take
1410 * into account that the child is a Fast Open socket and is created
1411 * only out of the bits carried in the SYN packet.
1412 */
1413 tp = tcp_sk(child);
1414
1415 tp->fastopen_rsk = req;
1416 /* Do a hold on the listener sk so that if the listener is being
1417 * closed, the child that has been accepted can live on and still
1418 * access listen_lock.
1419 */
1420 sock_hold(sk);
1421 tcp_rsk(req)->listener = sk;
1422
1423 /* RFC1323: The window in SYN & SYN/ACK segments is never
1424 * scaled. So correct it appropriately.
1425 */
1426 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1427
1428 /* Activate the retrans timer so that SYNACK can be retransmitted.
1429 * The request socket is not added to the SYN table of the parent
1430 * because it's been added to the accept queue directly.
1431 */
1432 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1433 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1434
1435 /* Add the child socket directly into the accept queue */
1436 inet_csk_reqsk_queue_add(sk, req, child);
1437
1438 /* Now finish processing the fastopen child socket. */
1439 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1440 tcp_init_congestion_control(child);
1441 tcp_mtup_init(child);
1442 tcp_init_buffer_space(child);
1443 tcp_init_metrics(child);
1444
1445 /* Queue the data carried in the SYN packet. We need to first
1446 * bump skb's refcnt because the caller will attempt to free it.
1447 *
1448 * XXX (TFO) - we honor a zero-payload TFO request for now.
1449 * (Any reason not to?)
1450 */
1451 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1452 /* Don't queue the skb if there is no payload in SYN.
1453 * XXX (TFO) - How about SYN+FIN?
1454 */
1455 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1456 } else {
1457 skb = skb_get(skb);
1458 skb_dst_drop(skb);
1459 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1460 skb_set_owner_r(skb, child);
1461 __skb_queue_tail(&child->sk_receive_queue, skb);
1462 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1463 }
1464 sk->sk_data_ready(sk, 0);
1465 bh_unlock_sock(child);
1466 sock_put(child);
1467 WARN_ON(req->sk == NULL);
1468 return 0;
1469}
1470
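The payload queued onto child->sk_receive_queue above is whatever the client carried in its SYN. For illustration, a client-side sketch using the MSG_FASTOPEN sendto() flag (the client half of TFO, not part of this patch; tfo_send() is a hypothetical helper):

#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000 /* send data in the SYN */
#endif

/* connect() is implied: sendto() with MSG_FASTOPEN performs the
 * handshake and carries @buf in the SYN (plus a cookie, if cached).
 */
static ssize_t tfo_send(int fd, const void *buf, size_t len,
                        const struct sockaddr_in *dst)
{
        return sendto(fd, buf, len, MSG_FASTOPEN,
                      (const struct sockaddr *)dst, sizeof(*dst));
}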
1275int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1471int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1276{ 1472{
1277 struct tcp_extend_values tmp_ext; 1473 struct tcp_extend_values tmp_ext;
@@ -1285,6 +1481,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1285 __be32 daddr = ip_hdr(skb)->daddr; 1481 __be32 daddr = ip_hdr(skb)->daddr;
1286 __u32 isn = TCP_SKB_CB(skb)->when; 1482 __u32 isn = TCP_SKB_CB(skb)->when;
1287 bool want_cookie = false; 1483 bool want_cookie = false;
1484 struct flowi4 fl4;
1485 struct tcp_fastopen_cookie foc = { .len = -1 };
1486 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1487 struct sk_buff *skb_synack;
1488 int do_fastopen;
1288 1489
1289 /* Never answer to SYNs send to broadcast or multicast */ 1490 /* Never answer to SYNs send to broadcast or multicast */
1290 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1491 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1319,7 +1520,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1319 tcp_clear_options(&tmp_opt); 1520 tcp_clear_options(&tmp_opt);
1320 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1521 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1321 tmp_opt.user_mss = tp->rx_opt.user_mss; 1522 tmp_opt.user_mss = tp->rx_opt.user_mss;
1322 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 1523 tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1524 want_cookie ? NULL : &foc);
1323 1525
1324 if (tmp_opt.cookie_plus > 0 && 1526 if (tmp_opt.cookie_plus > 0 &&
1325 tmp_opt.saw_tstamp && 1527 tmp_opt.saw_tstamp &&
@@ -1365,7 +1567,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1365 ireq->loc_addr = daddr; 1567 ireq->loc_addr = daddr;
1366 ireq->rmt_addr = saddr; 1568 ireq->rmt_addr = saddr;
1367 ireq->no_srccheck = inet_sk(sk)->transparent; 1569 ireq->no_srccheck = inet_sk(sk)->transparent;
1368 ireq->opt = tcp_v4_save_options(sk, skb); 1570 ireq->opt = tcp_v4_save_options(skb);
1369 1571
1370 if (security_inet_conn_request(sk, skb, req)) 1572 if (security_inet_conn_request(sk, skb, req))
1371 goto drop_and_free; 1573 goto drop_and_free;
@@ -1377,8 +1579,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1377 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1579 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1378 req->cookie_ts = tmp_opt.tstamp_ok; 1580 req->cookie_ts = tmp_opt.tstamp_ok;
1379 } else if (!isn) { 1581 } else if (!isn) {
1380 struct flowi4 fl4;
1381
1382 /* VJ's idea. We save last timestamp seen 1582 /* VJ's idea. We save last timestamp seen
1383 * from the destination in peer table, when entering 1583 * from the destination in peer table, when entering
1384 * state TIME-WAIT, and check against it before 1584 * state TIME-WAIT, and check against it before
@@ -1417,16 +1617,54 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1417 isn = tcp_v4_init_sequence(skb); 1617 isn = tcp_v4_init_sequence(skb);
1418 } 1618 }
1419 tcp_rsk(req)->snt_isn = isn; 1619 tcp_rsk(req)->snt_isn = isn;
1420 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1421 1620
1422 if (tcp_v4_send_synack(sk, dst, req, 1621 if (dst == NULL) {
1423 (struct request_values *)&tmp_ext, 1622 dst = inet_csk_route_req(sk, &fl4, req);
1424 skb_get_queue_mapping(skb), 1623 if (dst == NULL)
1425 want_cookie) || 1624 goto drop_and_free;
1426 want_cookie) 1625 }
1626 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1627
1628 /* We don't call tcp_v4_send_synack() directly because we need
1629 * to make sure a child socket can be created successfully before
1630 * sending back synack!
1631 *
1632 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1633 * (or better yet, call tcp_send_synack() in the child context
1634 * directly, but will have to fix a bunch of other code first)
1635 * after syn_recv_sock() except one will need to first fix the
1636 * latter to remove its dependency on the current implementation
1637 * of tcp_v4_send_synack()->tcp_select_initial_window().
1638 */
1639 skb_synack = tcp_make_synack(sk, dst, req,
1640 (struct request_values *)&tmp_ext,
1641 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1642
1643 if (skb_synack) {
1644 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1645 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1646 } else
1647 goto drop_and_free;
1648
1649 if (likely(!do_fastopen)) {
1650 int err;
1651 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1652 ireq->rmt_addr, ireq->opt);
1653 err = net_xmit_eval(err);
1654 if (err || want_cookie)
1655 goto drop_and_free;
1656
1657 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1658 tcp_rsk(req)->listener = NULL;
1659 /* Add the request_sock to the SYN table */
1660 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1661 if (fastopen_cookie_present(&foc) && foc.len != 0)
1662 NET_INC_STATS_BH(sock_net(sk),
1663 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1664 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1665 (struct request_values *)&tmp_ext))
1427 goto drop_and_free; 1666 goto drop_and_free;
1428 1667
1429 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1430 return 0; 1668 return 0;
1431 1669
1432drop_and_release: 1670drop_and_release:
@@ -1500,9 +1738,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1500 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1738 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1501 1739
1502 tcp_initialize_rcv_mss(newsk); 1740 tcp_initialize_rcv_mss(newsk);
1503 if (tcp_rsk(req)->snt_synack) 1741 tcp_synack_rtt_meas(newsk, req);
1504 tcp_valid_rtt_meas(newsk,
1505 tcp_time_stamp - tcp_rsk(req)->snt_synack);
1506 newtp->total_retrans = req->retrans; 1742 newtp->total_retrans = req->retrans;
1507 1743
1508#ifdef CONFIG_TCP_MD5SIG 1744#ifdef CONFIG_TCP_MD5SIG
@@ -1554,7 +1790,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1554 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1790 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1555 iph->saddr, iph->daddr); 1791 iph->saddr, iph->daddr);
1556 if (req) 1792 if (req)
1557 return tcp_check_req(sk, skb, req, prev); 1793 return tcp_check_req(sk, skb, req, prev, false);
1558 1794
1559 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1795 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1560 th->source, iph->daddr, th->dest, inet_iif(skb)); 1796 th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1963,20 +2199,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
1963 if (inet_csk(sk)->icsk_bind_hash) 2199 if (inet_csk(sk)->icsk_bind_hash)
1964 inet_put_port(sk); 2200 inet_put_port(sk);
1965 2201
1966 /*
1967 * If sendmsg cached page exists, toss it.
1968 */
1969 if (sk->sk_sndmsg_page) {
1970 __free_page(sk->sk_sndmsg_page);
1971 sk->sk_sndmsg_page = NULL;
1972 }
1973
1974 /* TCP Cookie Transactions */ 2202 /* TCP Cookie Transactions */
1975 if (tp->cookie_values != NULL) { 2203 if (tp->cookie_values != NULL) {
1976 kref_put(&tp->cookie_values->kref, 2204 kref_put(&tp->cookie_values->kref,
1977 tcp_cookie_values_release); 2205 tcp_cookie_values_release);
1978 tp->cookie_values = NULL; 2206 tp->cookie_values = NULL;
1979 } 2207 }
2208 BUG_ON(tp->fastopen_rsk != NULL);
1980 2209
1981 /* If socket is aborted during connect operation */ 2210 /* If socket is aborted during connect operation */
1982 tcp_free_fastopen_req(tp); 2211 tcp_free_fastopen_req(tp);
@@ -2396,7 +2625,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2396 struct seq_file *f, int i, kuid_t uid, int *len) 2625 struct seq_file *f, int i, kuid_t uid, int *len)
2397{ 2626{
2398 const struct inet_request_sock *ireq = inet_rsk(req); 2627 const struct inet_request_sock *ireq = inet_rsk(req);
2399 int ttd = req->expires - jiffies; 2628 long delta = req->expires - jiffies;
2400 2629
2401 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2630 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2402 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", 2631 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
@@ -2408,7 +2637,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2408 TCP_SYN_RECV, 2637 TCP_SYN_RECV,
2409 0, 0, /* could print option size, but that is af dependent. */ 2638 0, 0, /* could print option size, but that is af dependent. */
2410 1, /* timers active (only the expire timer) */ 2639 1, /* timers active (only the expire timer) */
2411 jiffies_to_clock_t(ttd), 2640 jiffies_delta_to_clock_t(delta),
2412 req->retrans, 2641 req->retrans,
2413 from_kuid_munged(seq_user_ns(f), uid), 2642 from_kuid_munged(seq_user_ns(f), uid),
2414 0, /* non standard timer */ 2643 0, /* non standard timer */
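The switch from int ttd to long delta matters because req->expires - jiffies goes negative once the request has expired, and jiffies_delta_to_clock_t() clamps that case instead of emitting a bogus huge value. A rough user-space model of the behaviour (the HZ and USER_HZ values are illustrative; the real helper does a proper jiffies-to-clock_t conversion):

#include <stdio.h>

#define HZ      1000 /* hypothetical kernel tick rate */
#define USER_HZ 100  /* clock_t rate exposed to user space */

/* Model of jiffies_delta_to_clock_t(): clamp expired (negative)
 * deltas to zero, then scale kernel ticks to USER_HZ ticks.
 */
static long delta_to_clock_t(long delta)
{
        if (delta < 0)
                delta = 0;
        return delta / (HZ / USER_HZ);
}

int main(void)
{
        printf("%ld\n", delta_to_clock_t(-42)); /* expired: prints 0 */
        printf("%ld\n", delta_to_clock_t(500)); /* 0.5 s -> 50 ticks */
        return 0;
}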
@@ -2425,6 +2654,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2425 const struct tcp_sock *tp = tcp_sk(sk); 2654 const struct tcp_sock *tp = tcp_sk(sk);
2426 const struct inet_connection_sock *icsk = inet_csk(sk); 2655 const struct inet_connection_sock *icsk = inet_csk(sk);
2427 const struct inet_sock *inet = inet_sk(sk); 2656 const struct inet_sock *inet = inet_sk(sk);
2657 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2428 __be32 dest = inet->inet_daddr; 2658 __be32 dest = inet->inet_daddr;
2429 __be32 src = inet->inet_rcv_saddr; 2659 __be32 src = inet->inet_rcv_saddr;
2430 __u16 destp = ntohs(inet->inet_dport); 2660 __u16 destp = ntohs(inet->inet_dport);
@@ -2459,7 +2689,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2459 tp->write_seq - tp->snd_una, 2689 tp->write_seq - tp->snd_una,
2460 rx_queue, 2690 rx_queue,
2461 timer_active, 2691 timer_active,
2462 jiffies_to_clock_t(timer_expires - jiffies), 2692 jiffies_delta_to_clock_t(timer_expires - jiffies),
2463 icsk->icsk_retransmits, 2693 icsk->icsk_retransmits,
2464 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2694 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2465 icsk->icsk_probes_out, 2695 icsk->icsk_probes_out,
@@ -2469,7 +2699,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2469 jiffies_to_clock_t(icsk->icsk_ack.ato), 2699 jiffies_to_clock_t(icsk->icsk_ack.ato),
2470 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2700 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2471 tp->snd_cwnd, 2701 tp->snd_cwnd,
2472 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, 2702 sk->sk_state == TCP_LISTEN ?
2703 (fastopenq ? fastopenq->max_qlen : 0) :
2704 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2473 len); 2705 len);
2474} 2706}
2475 2707
@@ -2478,10 +2710,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2478{ 2710{
2479 __be32 dest, src; 2711 __be32 dest, src;
2480 __u16 destp, srcp; 2712 __u16 destp, srcp;
2481 int ttd = tw->tw_ttd - jiffies; 2713 long delta = tw->tw_ttd - jiffies;
2482
2483 if (ttd < 0)
2484 ttd = 0;
2485 2714
2486 dest = tw->tw_daddr; 2715 dest = tw->tw_daddr;
2487 src = tw->tw_rcv_saddr; 2716 src = tw->tw_rcv_saddr;
@@ -2491,7 +2720,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2491 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2720 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2492 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", 2721 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2493 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2722 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2494 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, 2723 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2495 atomic_read(&tw->tw_refcnt), tw, len); 2724 atomic_read(&tw->tw_refcnt), tw, len);
2496} 2725}
2497 2726
@@ -2574,6 +2803,8 @@ void tcp4_proc_exit(void)
2574struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2803struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2575{ 2804{
2576 const struct iphdr *iph = skb_gro_network_header(skb); 2805 const struct iphdr *iph = skb_gro_network_header(skb);
2806 __wsum wsum;
2807 __sum16 sum;
2577 2808
2578 switch (skb->ip_summed) { 2809 switch (skb->ip_summed) {
2579 case CHECKSUM_COMPLETE: 2810 case CHECKSUM_COMPLETE:
@@ -2582,11 +2813,22 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2582 skb->ip_summed = CHECKSUM_UNNECESSARY; 2813 skb->ip_summed = CHECKSUM_UNNECESSARY;
2583 break; 2814 break;
2584 } 2815 }
2585 2816flush:
2586 /* fall through */
2587 case CHECKSUM_NONE:
2588 NAPI_GRO_CB(skb)->flush = 1; 2817 NAPI_GRO_CB(skb)->flush = 1;
2589 return NULL; 2818 return NULL;
2819
2820 case CHECKSUM_NONE:
2821 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2822 skb_gro_len(skb), IPPROTO_TCP, 0);
2823 sum = csum_fold(skb_checksum(skb,
2824 skb_gro_offset(skb),
2825 skb_gro_len(skb),
2826 wsum));
2827 if (sum)
2828 goto flush;
2829
2830 skb->ip_summed = CHECKSUM_UNNECESSARY;
2831 break;
2590 } 2832 }
2591 2833
2592 return tcp_gro_receive(head, skb); 2834 return tcp_gro_receive(head, skb);
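In the CHECKSUM_NONE branch above, the stack now verifies the TCP checksum in software before letting GRO aggregate the segment: csum_tcpudp_nofold() seeds a pseudo-header sum, skb_checksum() accumulates the segment, and csum_fold() reduces the result. The fold is plain ones'-complement arithmetic; a user-space sketch (a model, not the kernel implementation):

#include <stdint.h>

/* Fold a 32-bit ones'-complement accumulator to 16 bits and invert,
 * as csum_fold() does; a result of 0 means the checksum verified.
 */
static uint16_t csum_fold32(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16); /* fold the high half */
        sum = (sum & 0xffff) + (sum >> 16); /* absorb any new carry */
        return (uint16_t)~sum;
}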
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0abe67bb4d3a..4c752a6e0bcd 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/tcp.h> 9#include <linux/tcp.h>
10#include <linux/hash.h> 10#include <linux/hash.h>
11#include <linux/tcp_metrics.h>
11 12
12#include <net/inet_connection_sock.h> 13#include <net/inet_connection_sock.h>
13#include <net/net_namespace.h> 14#include <net/net_namespace.h>
@@ -17,20 +18,10 @@
17#include <net/ipv6.h> 18#include <net/ipv6.h>
18#include <net/dst.h> 19#include <net/dst.h>
19#include <net/tcp.h> 20#include <net/tcp.h>
21#include <net/genetlink.h>
20 22
21int sysctl_tcp_nometrics_save __read_mostly; 23int sysctl_tcp_nometrics_save __read_mostly;
22 24
23enum tcp_metric_index {
24 TCP_METRIC_RTT,
25 TCP_METRIC_RTTVAR,
26 TCP_METRIC_SSTHRESH,
27 TCP_METRIC_CWND,
28 TCP_METRIC_REORDERING,
29
30 /* Always last. */
31 TCP_METRIC_MAX,
32};
33
34struct tcp_fastopen_metrics { 25struct tcp_fastopen_metrics {
35 u16 mss; 26 u16 mss;
36 u16 syn_loss:10; /* Recurring Fast Open SYN losses */ 27 u16 syn_loss:10; /* Recurring Fast Open SYN losses */
@@ -45,8 +36,10 @@ struct tcp_metrics_block {
45 u32 tcpm_ts; 36 u32 tcpm_ts;
46 u32 tcpm_ts_stamp; 37 u32 tcpm_ts_stamp;
47 u32 tcpm_lock; 38 u32 tcpm_lock;
48 u32 tcpm_vals[TCP_METRIC_MAX]; 39 u32 tcpm_vals[TCP_METRIC_MAX + 1];
49 struct tcp_fastopen_metrics tcpm_fastopen; 40 struct tcp_fastopen_metrics tcpm_fastopen;
41
42 struct rcu_head rcu_head;
50}; 43};
51 44
52static bool tcp_metric_locked(struct tcp_metrics_block *tm, 45static bool tcp_metric_locked(struct tcp_metrics_block *tm,
@@ -690,6 +683,325 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
690 rcu_read_unlock(); 683 rcu_read_unlock();
691} 684}
692 685
686static struct genl_family tcp_metrics_nl_family = {
687 .id = GENL_ID_GENERATE,
688 .hdrsize = 0,
689 .name = TCP_METRICS_GENL_NAME,
690 .version = TCP_METRICS_GENL_VERSION,
691 .maxattr = TCP_METRICS_ATTR_MAX,
692 .netnsok = true,
693};
694
695static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
696 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
697 [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
698 .len = sizeof(struct in6_addr), },
699 /* Following attributes are not received for GET/DEL,
700 * we keep them for reference
701 */
702#if 0
703 [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
704 [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
705 [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
706 [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
707 [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
708 [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
709 [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
710 [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
711 .len = TCP_FASTOPEN_COOKIE_MAX, },
712#endif
713};
714
715/* Add attributes, caller cancels its header on failure */
716static int tcp_metrics_fill_info(struct sk_buff *msg,
717 struct tcp_metrics_block *tm)
718{
719 struct nlattr *nest;
720 int i;
721
722 switch (tm->tcpm_addr.family) {
723 case AF_INET:
724 if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
725 tm->tcpm_addr.addr.a4) < 0)
726 goto nla_put_failure;
727 break;
728 case AF_INET6:
729 if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
730 tm->tcpm_addr.addr.a6) < 0)
731 goto nla_put_failure;
732 break;
733 default:
734 return -EAFNOSUPPORT;
735 }
736
737 if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
738 jiffies - tm->tcpm_stamp) < 0)
739 goto nla_put_failure;
740 if (tm->tcpm_ts_stamp) {
741 if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
742 (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
743 goto nla_put_failure;
744 if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
745 tm->tcpm_ts) < 0)
746 goto nla_put_failure;
747 }
748
749 {
750 int n = 0;
751
752 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
753 if (!nest)
754 goto nla_put_failure;
755 for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
756 if (!tm->tcpm_vals[i])
757 continue;
758 if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
759 goto nla_put_failure;
760 n++;
761 }
762 if (n)
763 nla_nest_end(msg, nest);
764 else
765 nla_nest_cancel(msg, nest);
766 }
767
768 {
769 struct tcp_fastopen_metrics tfom_copy[1], *tfom;
770 unsigned int seq;
771
772 do {
773 seq = read_seqbegin(&fastopen_seqlock);
774 tfom_copy[0] = tm->tcpm_fastopen;
775 } while (read_seqretry(&fastopen_seqlock, seq));
776
777 tfom = tfom_copy;
778 if (tfom->mss &&
779 nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
780 tfom->mss) < 0)
781 goto nla_put_failure;
782 if (tfom->syn_loss &&
783 (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
784 tfom->syn_loss) < 0 ||
785 nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
786 jiffies - tfom->last_syn_loss) < 0))
787 goto nla_put_failure;
788 if (tfom->cookie.len > 0 &&
789 nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
790 tfom->cookie.len, tfom->cookie.val) < 0)
791 goto nla_put_failure;
792 }
793
794 return 0;
795
796nla_put_failure:
797 return -EMSGSIZE;
798}
799
800static int tcp_metrics_dump_info(struct sk_buff *skb,
801 struct netlink_callback *cb,
802 struct tcp_metrics_block *tm)
803{
804 void *hdr;
805
806 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
807 &tcp_metrics_nl_family, NLM_F_MULTI,
808 TCP_METRICS_CMD_GET);
809 if (!hdr)
810 return -EMSGSIZE;
811
812 if (tcp_metrics_fill_info(skb, tm) < 0)
813 goto nla_put_failure;
814
815 return genlmsg_end(skb, hdr);
816
817nla_put_failure:
818 genlmsg_cancel(skb, hdr);
819 return -EMSGSIZE;
820}
821
822static int tcp_metrics_nl_dump(struct sk_buff *skb,
823 struct netlink_callback *cb)
824{
825 struct net *net = sock_net(skb->sk);
826 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
827 unsigned int row, s_row = cb->args[0];
828 int s_col = cb->args[1], col = s_col;
829
830 for (row = s_row; row < max_rows; row++, s_col = 0) {
831 struct tcp_metrics_block *tm;
832 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
833
834 rcu_read_lock();
835 for (col = 0, tm = rcu_dereference(hb->chain); tm;
836 tm = rcu_dereference(tm->tcpm_next), col++) {
837 if (col < s_col)
838 continue;
839 if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
840 rcu_read_unlock();
841 goto done;
842 }
843 }
844 rcu_read_unlock();
845 }
846
847done:
848 cb->args[0] = row;
849 cb->args[1] = col;
850 return skb->len;
851}
852
853static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
854 unsigned int *hash, int optional)
855{
856 struct nlattr *a;
857
858 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
859 if (a) {
860 addr->family = AF_INET;
861 addr->addr.a4 = nla_get_be32(a);
862 *hash = (__force unsigned int) addr->addr.a4;
863 return 0;
864 }
865 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
866 if (a) {
867 if (nla_len(a) != sizeof(struct in6_addr))
868 return -EINVAL;
869 addr->family = AF_INET6;
870 memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
871 *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
872 return 0;
873 }
874 return optional ? 1 : -EAFNOSUPPORT;
875}
876
877static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
878{
879 struct tcp_metrics_block *tm;
880 struct inetpeer_addr addr;
881 unsigned int hash;
882 struct sk_buff *msg;
883 struct net *net = genl_info_net(info);
884 void *reply;
885 int ret;
886
887 ret = parse_nl_addr(info, &addr, &hash, 0);
888 if (ret < 0)
889 return ret;
890
891 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
892 if (!msg)
893 return -ENOMEM;
894
895 reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
896 info->genlhdr->cmd);
897 if (!reply)
898 goto nla_put_failure;
899
900 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
901 ret = -ESRCH;
902 rcu_read_lock();
903 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
904 tm = rcu_dereference(tm->tcpm_next)) {
905 if (addr_same(&tm->tcpm_addr, &addr)) {
906 ret = tcp_metrics_fill_info(msg, tm);
907 break;
908 }
909 }
910 rcu_read_unlock();
911 if (ret < 0)
912 goto out_free;
913
914 genlmsg_end(msg, reply);
915 return genlmsg_reply(msg, info);
916
917nla_put_failure:
918 ret = -EMSGSIZE;
919
920out_free:
921 nlmsg_free(msg);
922 return ret;
923}
924
925#define deref_locked_genl(p) \
926 rcu_dereference_protected(p, lockdep_genl_is_held() && \
927 lockdep_is_held(&tcp_metrics_lock))
928
929#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
930
931static int tcp_metrics_flush_all(struct net *net)
932{
933 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
934 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
935 struct tcp_metrics_block *tm;
936 unsigned int row;
937
938 for (row = 0; row < max_rows; row++, hb++) {
939 spin_lock_bh(&tcp_metrics_lock);
940 tm = deref_locked_genl(hb->chain);
941 if (tm)
942 hb->chain = NULL;
943 spin_unlock_bh(&tcp_metrics_lock);
944 while (tm) {
945 struct tcp_metrics_block *next;
946
947 next = deref_genl(tm->tcpm_next);
948 kfree_rcu(tm, rcu_head);
949 tm = next;
950 }
951 }
952 return 0;
953}
954
955static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
956{
957 struct tcpm_hash_bucket *hb;
958 struct tcp_metrics_block *tm;
959 struct tcp_metrics_block __rcu **pp;
960 struct inetpeer_addr addr;
961 unsigned int hash;
962 struct net *net = genl_info_net(info);
963 int ret;
964
965 ret = parse_nl_addr(info, &addr, &hash, 1);
966 if (ret < 0)
967 return ret;
968 if (ret > 0)
969 return tcp_metrics_flush_all(net);
970
971 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
972 hb = net->ipv4.tcp_metrics_hash + hash;
973 pp = &hb->chain;
974 spin_lock_bh(&tcp_metrics_lock);
975 for (tm = deref_locked_genl(*pp); tm;
976 pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
977 if (addr_same(&tm->tcpm_addr, &addr)) {
978 *pp = tm->tcpm_next;
979 break;
980 }
981 }
982 spin_unlock_bh(&tcp_metrics_lock);
983 if (!tm)
984 return -ESRCH;
985 kfree_rcu(tm, rcu_head);
986 return 0;
987}
988
989static struct genl_ops tcp_metrics_nl_ops[] = {
990 {
991 .cmd = TCP_METRICS_CMD_GET,
992 .doit = tcp_metrics_nl_cmd_get,
993 .dumpit = tcp_metrics_nl_dump,
994 .policy = tcp_metrics_nl_policy,
995 .flags = GENL_ADMIN_PERM,
996 },
997 {
998 .cmd = TCP_METRICS_CMD_DEL,
999 .doit = tcp_metrics_nl_cmd_del,
1000 .policy = tcp_metrics_nl_policy,
1001 .flags = GENL_ADMIN_PERM,
1002 },
1003};
1004
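With the family registered, the metrics cache becomes reachable from user space over generic netlink. A rough libnl-3 client issuing TCP_METRICS_CMD_GET for one IPv4 destination might look like the sketch below; it is an untested illustration, reply parsing is omitted, and tcp_metrics_get() is a hypothetical helper.

#include <stdint.h>
#include <linux/tcp_metrics.h>
#include <netlink/genl/ctrl.h>
#include <netlink/genl/genl.h>
#include <netlink/netlink.h>

/* @daddr is an IPv4 address already in network byte order. */
static int tcp_metrics_get(uint32_t daddr)
{
        struct nl_sock *sk = nl_socket_alloc();
        struct nl_msg *msg = NULL;
        int family, err = -1;

        if (!sk || genl_connect(sk) < 0)
                goto out;
        family = genl_ctrl_resolve(sk, TCP_METRICS_GENL_NAME);
        if (family < 0)
                goto out;
        msg = nlmsg_alloc();
        if (!msg)
                goto out;
        if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
                         TCP_METRICS_CMD_GET, TCP_METRICS_GENL_VERSION))
                goto out;
        if (nla_put_u32(msg, TCP_METRICS_ATTR_ADDR_IPV4, daddr) < 0)
                goto out;
        err = nl_send_auto(sk, msg); /* reply would arrive via nl_recvmsgs() */
out:
        nlmsg_free(msg);
        nl_socket_free(sk);
        return err < 0 ? -1 : 0;
}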
693static unsigned int tcpmhash_entries; 1005static unsigned int tcpmhash_entries;
694static int __init set_tcpmhash_entries(char *str) 1006static int __init set_tcpmhash_entries(char *str)
695{ 1007{
@@ -753,5 +1065,21 @@ static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
753 1065
754void __init tcp_metrics_init(void) 1066void __init tcp_metrics_init(void)
755{ 1067{
756 register_pernet_subsys(&tcp_net_metrics_ops); 1068 int ret;
1069
1070 ret = register_pernet_subsys(&tcp_net_metrics_ops);
1071 if (ret < 0)
1072 goto cleanup;
1073 ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
1074 tcp_metrics_nl_ops,
1075 ARRAY_SIZE(tcp_metrics_nl_ops));
1076 if (ret < 0)
1077 goto cleanup_subsys;
1078 return;
1079
1080cleanup_subsys:
1081 unregister_pernet_subsys(&tcp_net_metrics_ops);
1082
1083cleanup:
1084 return;
757} 1085}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6ff7f10dce9d..27536ba16c9d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -85,6 +85,8 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
85 * spinlock it. I do not want! Well, probability of misbehaviour 85 * spinlock it. I do not want! Well, probability of misbehaviour
86 * is ridiculously low and, seems, we could use some mb() tricks 86 * is ridiculously low and, seems, we could use some mb() tricks
87 * to avoid misread sequence numbers, states etc. --ANK 87 * to avoid misread sequence numbers, states etc. --ANK
88 *
89 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
88 */ 90 */
89enum tcp_tw_status 91enum tcp_tw_status
90tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 92tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
@@ -507,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 509 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
508 newtp->rx_opt.mss_clamp = req->mss; 510 newtp->rx_opt.mss_clamp = req->mss;
509 TCP_ECN_openreq_child(newtp, req); 511 TCP_ECN_openreq_child(newtp, req);
512 newtp->fastopen_rsk = NULL;
510 513
511 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 514 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
512 } 515 }
@@ -515,13 +518,20 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
515EXPORT_SYMBOL(tcp_create_openreq_child); 518EXPORT_SYMBOL(tcp_create_openreq_child);
516 519
517/* 520/*
518 * Process an incoming packet for SYN_RECV sockets represented 521 * Process an incoming packet for SYN_RECV sockets represented as a
519 * as a request_sock. 522 * request_sock. Normally sk is the listener socket but for TFO it
523 * points to the child socket.
524 *
525 * XXX (TFO) - The current impl contains a special check for ack
526 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
527 *
528 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
520 */ 529 */
521 530
522struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 531struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
523 struct request_sock *req, 532 struct request_sock *req,
524 struct request_sock **prev) 533 struct request_sock **prev,
534 bool fastopen)
525{ 535{
526 struct tcp_options_received tmp_opt; 536 struct tcp_options_received tmp_opt;
527 const u8 *hash_location; 537 const u8 *hash_location;
@@ -530,6 +540,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
530 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 540 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
531 bool paws_reject = false; 541 bool paws_reject = false;
532 542
543 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
544
533 tmp_opt.saw_tstamp = 0; 545 tmp_opt.saw_tstamp = 0;
534 if (th->doff > (sizeof(struct tcphdr)>>2)) { 546 if (th->doff > (sizeof(struct tcphdr)>>2)) {
535 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 547 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +577,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
565 * 577 *
566 * Enforce "SYN-ACK" according to figure 8, figure 6 578 * Enforce "SYN-ACK" according to figure 8, figure 6
567 * of RFC793, fixed by RFC1122. 579 * of RFC793, fixed by RFC1122.
580 *
581 * Note that even if there is new data in the SYN packet
582 * they will be thrown away too.
568 */ 583 */
569 req->rsk_ops->rtx_syn_ack(sk, req, NULL); 584 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
570 return NULL; 585 return NULL;
@@ -622,9 +637,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
622 * sent (the segment carries an unacceptable ACK) ... 637 * sent (the segment carries an unacceptable ACK) ...
623 * a reset is sent." 638 * a reset is sent."
624 * 639 *
625 * Invalid ACK: reset will be sent by listening socket 640 * Invalid ACK: reset will be sent by listening socket.
641 * Note that the ACK validity check for a Fast Open socket is done
642 * elsewhere and is checked directly against the child socket rather
643 * than req because user data may have been sent out.
626 */ 644 */
627 if ((flg & TCP_FLAG_ACK) && 645 if ((flg & TCP_FLAG_ACK) && !fastopen &&
628 (TCP_SKB_CB(skb)->ack_seq != 646 (TCP_SKB_CB(skb)->ack_seq !=
629 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 647 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
630 return sk; 648 return sk;
@@ -637,7 +655,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
637 /* RFC793: "first check sequence number". */ 655 /* RFC793: "first check sequence number". */
638 656
639 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 657 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
640 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 658 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
641 /* Out of window: send ACK and drop. */ 659 /* Out of window: send ACK and drop. */
642 if (!(flg & TCP_FLAG_RST)) 660 if (!(flg & TCP_FLAG_RST))
643 req->rsk_ops->send_ack(sk, skb, req); 661 req->rsk_ops->send_ack(sk, skb, req);
@@ -648,7 +666,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
648 666
649 /* In sequence, PAWS is OK. */ 667 /* In sequence, PAWS is OK. */
650 668
651 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 669 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
652 req->ts_recent = tmp_opt.rcv_tsval; 670 req->ts_recent = tmp_opt.rcv_tsval;
653 671
654 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 672 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +685,25 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 685
668 /* ACK sequence verified above, just make sure ACK is 686 /* ACK sequence verified above, just make sure ACK is
669 * set. If ACK not set, just silently drop the packet. 687 * set. If ACK not set, just silently drop the packet.
688 *
689 * XXX (TFO) - if we ever allow "data after SYN", the
690 * following check needs to be removed.
670 */ 691 */
671 if (!(flg & TCP_FLAG_ACK)) 692 if (!(flg & TCP_FLAG_ACK))
672 return NULL; 693 return NULL;
673 694
695 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
696 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
697 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
698 else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
699 tcp_rsk(req)->snt_synack = 0;
700
701 /* For Fast Open no more processing is needed (sk is the
702 * child socket).
703 */
704 if (fastopen)
705 return sk;
706
674 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 707 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
675 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 708 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
676 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 709 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -678,10 +711,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
678 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); 711 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
679 return NULL; 712 return NULL;
680 } 713 }
681 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
682 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
683 else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
684 tcp_rsk(req)->snt_synack = 0;
685 714
686 /* OK, ACK is valid, create big socket and 715 /* OK, ACK is valid, create big socket and
687 * feed this segment to it. It will repeat all 716 * feed this segment to it. It will repeat all
@@ -706,11 +735,21 @@ listen_overflow:
706 } 735 }
707 736
708embryonic_reset: 737embryonic_reset:
709 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 738 if (!(flg & TCP_FLAG_RST)) {
710 if (!(flg & TCP_FLAG_RST)) 739 /* Received a bad SYN pkt - for TFO we try not to reset
740 * the local connection unless it's really necessary, to
741 * avoid becoming vulnerable to an outside attack aimed at
742 * resetting legit local connections.
743 */
711 req->rsk_ops->send_reset(sk, skb); 744 req->rsk_ops->send_reset(sk, skb);
712 745 } else if (fastopen) { /* received a valid RST pkt */
713 inet_csk_reqsk_queue_drop(sk, req, prev); 746 reqsk_fastopen_remove(sk, req, true);
747 tcp_reset(sk);
748 }
749 if (!fastopen) {
750 inet_csk_reqsk_queue_drop(sk, req, prev);
751 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
752 }
714 return NULL; 753 return NULL;
715} 754}
716EXPORT_SYMBOL(tcp_check_req); 755EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +758,12 @@ EXPORT_SYMBOL(tcp_check_req);
719 * Queue segment on the new socket if the new socket is active, 758 * Queue segment on the new socket if the new socket is active,
720 * otherwise we just shortcircuit this and continue with 759 * otherwise we just shortcircuit this and continue with
721 * the new socket. 760 * the new socket.
761 *
762 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
763 * when entering. But other states are possible due to a race condition
764 * where after __inet_lookup_established() fails but before the listener
765 * locked is obtained, other packets cause the same connection to
766 * be created.
722 */ 767 */
723 768
724int tcp_child_process(struct sock *parent, struct sock *child, 769int tcp_child_process(struct sock *parent, struct sock *child,
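The ts_recent update above now compares the incoming sequence number against tcp_rsk(req)->rcv_nxt instead of rcv_isn + 1, so a Fast Open request whose SYN carried data (advancing rcv_nxt past the ISN) still refreshes the PAWS timestamp. The after() test it relies on is ordinary wraparound-safe serial-number arithmetic; a minimal userspace sketch of the same comparison (helper name and demo values are ours, not the kernel's):

	#include <stdint.h>
	#include <stdio.h>

	/* Wraparound-safe "seq1 strictly after seq2", in the style of the
	 * kernel's after()/before() macros: the signed 32-bit difference
	 * treats the sequence space as a circle. */
	static int seq_after(uint32_t seq1, uint32_t seq2)
	{
		return (int32_t)(seq1 - seq2) > 0;
	}

	int main(void)
	{
		printf("%d\n", seq_after(10, 5));          /* 1 */
		printf("%d\n", seq_after(5, 0xfffffff0u)); /* 1: wrapped */
		printf("%d\n", seq_after(0xfffffff0u, 5)); /* 0 */
		return 0;
	}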
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d04632673a9e..cfe6ffe1c177 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
 				   unsigned int mss, struct sk_buff *skb,
 				   struct tcp_out_options *opts,
 				   struct tcp_md5sig_key **md5,
-				   struct tcp_extend_values *xvp)
+				   struct tcp_extend_values *xvp,
+				   struct tcp_fastopen_cookie *foc)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
-
+	if (foc != NULL) {
+		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = foc;
+			remaining -= need;
+		}
+	}
 	/* Similar rationale to tcp_syn_options() applies here, too.
 	 * If the <SYN> options fit, the same options should fit now!
 	 */
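The foc != NULL branch above charges the experimental Fast Open option against whatever TCP option space remains, rounding its length up to a 32-bit boundary via (need + 3) & ~3U and silently omitting the cookie if it no longer fits. A self-contained sketch of that accounting (the 40-byte option budget and 4-byte experimental-option header are our reading of the macros, not taken from this diff):

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_TCP_OPTION_SPACE      40 /* 60-byte max header - 20 fixed */
	#define TCPOLEN_EXP_FASTOPEN_BASE  4 /* kind, len, 2-byte experiment ID */

	/* Bytes consumed by an experimental Fast Open option carrying
	 * cookie_len cookie bytes, padded to a 32-bit boundary as in
	 * tcp_synack_options(); 0 if it no longer fits in `remaining`. */
	static unsigned int fastopen_opt_space(unsigned int remaining,
					       unsigned int cookie_len)
	{
		uint32_t need = TCPOLEN_EXP_FASTOPEN_BASE + cookie_len;

		need = (need + 3) & ~3U; /* align to 32 bits */
		return remaining >= need ? need : 0;
	}

	int main(void)
	{
		/* An 8-byte cookie needs 12 bytes (4 + 8, already aligned). */
		printf("%u\n", fastopen_opt_space(MAX_TCP_OPTION_SPACE, 8));
		return 0;
	}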
@@ -2028,10 +2037,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		if (push_one)
 			break;
 	}
-	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
-		tp->prr_out += sent_pkts;
 
 	if (likely(sent_pkts)) {
+		if (tcp_in_cwnd_reduction(sk))
+			tp->prr_out += sent_pkts;
 		tcp_cwnd_validate(sk);
 		return false;
 	}
@@ -2533,7 +2542,7 @@ begin_fwd:
 		}
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
+		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
 
 		if (skb == tcp_write_queue_head(sk))
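This hunk and the tcp_write_xmit() one above swap the explicit TCP_CA_Recovery comparison for tcp_in_cwnd_reduction(), so the proportional-rate-reduction counter prr_out is also maintained while the sender is in CWR. A rough standalone model of the predicate (our simplification of the inline helper, not the kernel's exact definition):

	#include <stdio.h>

	/* Congestion-avoidance machine states, mirroring the kernel's
	 * tcp_ca_state values. */
	enum tcp_ca_state { TCP_CA_Open, TCP_CA_Disorder, TCP_CA_CWR,
			    TCP_CA_Recovery, TCP_CA_Loss };

	/* PRR bookkeeping runs whenever cwnd is being reduced: in loss
	 * Recovery, as before, but now also in CWR (e.g. after an ECN mark). */
	static int in_cwnd_reduction(enum tcp_ca_state state)
	{
		return state == TCP_CA_CWR || state == TCP_CA_Recovery;
	}

	int main(void)
	{
		printf("CWR: %d, Recovery: %d, Open: %d\n",
		       in_cwnd_reduction(TCP_CA_CWR),
		       in_cwnd_reduction(TCP_CA_Recovery),
		       in_cwnd_reduction(TCP_CA_Open));
		return 0;
	}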
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
  */
 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
-				struct request_values *rvp)
+				struct request_values *rvp,
+				struct tcp_fastopen_cookie *foc)
 {
 	struct tcp_out_options opts;
 	struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 #endif
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	tcp_header_size = tcp_synack_options(sk, req, mss,
-					     skb, &opts, &md5, xvp)
+					     skb, &opts, &md5, xvp, foc)
 			+ sizeof(*th);
 
 	skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	}
 
 	th->seq = htonl(TCP_SKB_CB(skb)->seq);
-	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
+	/* XXX data is queued and acked as is. No buffer/window check */
+	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rcv_wnd, 65535U));
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b774a03bd1dc..fc04711e80c8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -305,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk)
 }
 
 /*
+ * Timer for Fast Open socket to retransmit SYNACK. Note that the
+ * sk here is the child socket, not the parent (listener) socket.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int max_retries = icsk->icsk_syn_retries ? :
+	    sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+	struct request_sock *req;
+
+	req = tcp_sk(sk)->fastopen_rsk;
+	req->rsk_ops->syn_ack_timeout(sk, req);
+
+	if (req->retrans >= max_retries) {
+		tcp_write_err(sk);
+		return;
+	}
+	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+	 * returned from rtx_syn_ack() to make it more persistent like
+	 * regular retransmit because if the child socket has been accepted
+	 * it's not good to give up too easily.
+	 */
+	req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+	req->retrans++;
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+			  TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
+}
+
+/*
  * The TCP retransmit timer.
  */
 
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
 		tcp_resume_early_retransmit(sk);
 		return;
 	}
-
+	if (tp->fastopen_rsk) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		       sk->sk_state != TCP_FIN_WAIT1);
+		tcp_fastopen_synack_timer(sk);
+		/* Before we receive ACK to our SYN-ACK don't retransmit
+		 * anything else (e.g., data or FIN segments).
+		 */
+		return;
+	}
 	if (!tp->packets_out)
 		goto out;
 
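The rearm at the bottom of tcp_fastopen_synack_timer() backs off exponentially, TCP_TIMEOUT_INIT << req->retrans, and inet_csk_reset_xmit_timer() clamps the result at TCP_RTO_MAX; note the extra retry allowed over plain SYN-ACKs. A toy sketch of the resulting schedule (seconds instead of jiffies; the 1 s initial RTO and 120 s cap are assumed values, not read from this diff):

	#include <stdio.h>

	#define TCP_TIMEOUT_INIT   1 /* seconds; the kernel stores jiffies */
	#define TCP_RTO_MAX      120 /* seconds */

	int main(void)
	{
		/* Timeout armed after the Nth SYN-ACK retransmission. */
		for (int retrans = 1; retrans <= 7; retrans++) {
			unsigned int t = TCP_TIMEOUT_INIT << retrans;

			if (t > TCP_RTO_MAX)
				t = TCP_RTO_MAX;
			printf("retrans %d: rearm in %us\n", retrans, t);
		}
		return 0;
	}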
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index d2f336ea82ca..505b30ad9182 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -26,7 +26,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 
 	return inet_sk_diag_fill(sk, NULL, skb, req,
 			sk_user_ns(NETLINK_CB(cb->skb).ssk),
-			NETLINK_CB(cb->skb).pid,
+			NETLINK_CB(cb->skb).portid,
 			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
 
@@ -72,14 +72,14 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 
 	err = inet_sk_diag_fill(sk, NULL, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).ssk),
-			   NETLINK_CB(in_skb).pid,
+			   NETLINK_CB(in_skb).portid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;