Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 8
-rw-r--r--  net/ipv4/af_inet.c | 4
-rw-r--r--  net/ipv4/ah4.c | 6
-rw-r--r--  net/ipv4/arp.c | 26
-rw-r--r--  net/ipv4/devinet.c | 66
-rw-r--r--  net/ipv4/fib_frontend.c | 10
-rw-r--r--  net/ipv4/fib_rules.c | 16
-rw-r--r--  net/ipv4/fib_semantics.c | 47
-rw-r--r--  net/ipv4/fib_trie.c | 2
-rw-r--r--  net/ipv4/icmp.c | 20
-rw-r--r--  net/ipv4/igmp.c | 18
-rw-r--r--  net/ipv4/inet_connection_sock.c | 29
-rw-r--r--  net/ipv4/inet_diag.c | 6
-rw-r--r--  net/ipv4/inet_hashtables.c | 2
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 6
-rw-r--r--  net/ipv4/ip_forward.c | 4
-rw-r--r--  net/ipv4/ip_fragment.c | 45
-rw-r--r--  net/ipv4/ip_gre.c | 102
-rw-r--r--  net/ipv4/ip_input.c | 13
-rw-r--r--  net/ipv4/ip_options.c | 32
-rw-r--r--  net/ipv4/ip_output.c | 4
-rw-r--r--  net/ipv4/ip_sockglue.c | 19
-rw-r--r--  net/ipv4/ipconfig.c | 20
-rw-r--r--  net/ipv4/ipip.c | 57
-rw-r--r--  net/ipv4/ipmr.c | 12
-rw-r--r--  net/ipv4/netfilter.c | 12
-rw-r--r--  net/ipv4/netfilter/Makefile | 3
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 7
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 639
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 5
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 3
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 19
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 12
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 26
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 8
-rw-r--r--  net/ipv4/ping.c | 10
-rw-r--r--  net/ipv4/raw.c | 2
-rw-r--r--  net/ipv4/route.c | 162
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 26
-rw-r--r--  net/ipv4/tcp.c | 340
-rw-r--r--  net/ipv4/tcp_cong.c | 6
-rw-r--r--  net/ipv4/tcp_hybla.c | 10
-rw-r--r--  net/ipv4/tcp_input.c | 635
-rw-r--r--  net/ipv4/tcp_ipv4.c | 118
-rw-r--r--  net/ipv4/tcp_minisocks.c | 25
-rw-r--r--  net/ipv4/tcp_output.c | 154
-rw-r--r--  net/ipv4/tcp_probe.c | 4
-rw-r--r--  net/ipv4/tcp_timer.c | 5
-rw-r--r--  net/ipv4/udp.c | 28
-rw-r--r--  net/ipv4/udp_diag.c | 9
-rw-r--r--  net/ipv4/udp_impl.h | 2
-rw-r--r--  net/ipv4/xfrm4_policy.c | 6
53 files changed, 1323 insertions(+), 1529 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index d183262943d9..20f1cb5c8aba 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -262,8 +262,8 @@ config ARPD
262 bool "IP: ARP daemon support" 262 bool "IP: ARP daemon support"
263 ---help--- 263 ---help---
264 The kernel maintains an internal cache which maps IP addresses to 264 The kernel maintains an internal cache which maps IP addresses to
265 hardware addresses on the local network, so that Ethernet/Token Ring/ 265 hardware addresses on the local network, so that Ethernet
266 etc. frames are sent to the proper address on the physical networking 266 frames are sent to the proper address on the physical networking
267 layer. Normally, kernel uses the ARP protocol to resolve these 267 layer. Normally, kernel uses the ARP protocol to resolve these
268 mappings. 268 mappings.
269 269
@@ -312,7 +312,7 @@ config SYN_COOKIES
312 312
313config INET_AH 313config INET_AH
314 tristate "IP: AH transformation" 314 tristate "IP: AH transformation"
315 select XFRM 315 select XFRM_ALGO
316 select CRYPTO 316 select CRYPTO
317 select CRYPTO_HMAC 317 select CRYPTO_HMAC
318 select CRYPTO_MD5 318 select CRYPTO_MD5
@@ -324,7 +324,7 @@ config INET_AH
324 324
325config INET_ESP 325config INET_ESP
326 tristate "IP: ESP transformation" 326 tristate "IP: ESP transformation"
327 select XFRM 327 select XFRM_ALGO
328 select CRYPTO 328 select CRYPTO
329 select CRYPTO_AUTHENC 329 select CRYPTO_AUTHENC
330 select CRYPTO_HMAC 330 select CRYPTO_HMAC
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 10e3751466b5..c8f7aee587d1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -350,7 +350,7 @@ lookup_protocol:
350 err = 0; 350 err = 0;
351 sk->sk_no_check = answer_no_check; 351 sk->sk_no_check = answer_no_check;
352 if (INET_PROTOSW_REUSE & answer_flags) 352 if (INET_PROTOSW_REUSE & answer_flags)
353 sk->sk_reuse = 1; 353 sk->sk_reuse = SK_CAN_REUSE;
354 354
355 inet = inet_sk(sk); 355 inet = inet_sk(sk);
356 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; 356 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
@@ -541,7 +541,7 @@ out:
541} 541}
542EXPORT_SYMBOL(inet_bind); 542EXPORT_SYMBOL(inet_bind);
543 543
544int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, 544int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
545 int addr_len, int flags) 545 int addr_len, int flags)
546{ 546{
547 struct sock *sk = sock->sk; 547 struct sock *sk = sock->sk;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index fd508b526014..e8f2617ecd47 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -77,7 +77,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
77 77
78static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) 78static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
79{ 79{
80 unsigned char * optptr = (unsigned char*)(iph+1); 80 unsigned char *optptr = (unsigned char *)(iph+1);
81 int l = iph->ihl*4 - sizeof(struct iphdr); 81 int l = iph->ihl*4 - sizeof(struct iphdr);
82 int optlen; 82 int optlen;
83 83
@@ -406,8 +406,8 @@ static void ah4_err(struct sk_buff *skb, u32 info)
406 ah->spi, IPPROTO_AH, AF_INET); 406 ah->spi, IPPROTO_AH, AF_INET);
407 if (!x) 407 if (!x)
408 return; 408 return;
409 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", 409 pr_debug("pmtu discovery on SA AH/%08x/%08x\n",
410 ntohl(ah->spi), ntohl(iph->daddr)); 410 ntohl(ah->spi), ntohl(iph->daddr));
411 xfrm_state_put(x); 411 xfrm_state_put(x);
412} 412}
413 413
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 18d9b81ecb1a..cda37be02f8d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -73,6 +73,8 @@
73 * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. 73 * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
74 */ 74 */
75 75
76#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
77
76#include <linux/module.h> 78#include <linux/module.h>
77#include <linux/types.h> 79#include <linux/types.h>
78#include <linux/string.h> 80#include <linux/string.h>
@@ -89,7 +91,6 @@
89#include <linux/etherdevice.h> 91#include <linux/etherdevice.h>
90#include <linux/fddidevice.h> 92#include <linux/fddidevice.h>
91#include <linux/if_arp.h> 93#include <linux/if_arp.h>
92#include <linux/trdevice.h>
93#include <linux/skbuff.h> 94#include <linux/skbuff.h>
94#include <linux/proc_fs.h> 95#include <linux/proc_fs.h>
95#include <linux/seq_file.h> 96#include <linux/seq_file.h>
@@ -193,9 +194,6 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
193 case ARPHRD_IEEE802: 194 case ARPHRD_IEEE802:
194 ip_eth_mc_map(addr, haddr); 195 ip_eth_mc_map(addr, haddr);
195 return 0; 196 return 0;
196 case ARPHRD_IEEE802_TR:
197 ip_tr_mc_map(addr, haddr);
198 return 0;
199 case ARPHRD_INFINIBAND: 197 case ARPHRD_INFINIBAND:
200 ip_ib_mc_map(addr, dev->broadcast, haddr); 198 ip_ib_mc_map(addr, dev->broadcast, haddr);
201 return 0; 199 return 0;
@@ -364,8 +362,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
364 probes -= neigh->parms->ucast_probes; 362 probes -= neigh->parms->ucast_probes;
365 if (probes < 0) { 363 if (probes < 0) {
366 if (!(neigh->nud_state & NUD_VALID)) 364 if (!(neigh->nud_state & NUD_VALID))
367 printk(KERN_DEBUG 365 pr_debug("trying to ucast probe in NUD_INVALID\n");
368 "trying to ucast probe in NUD_INVALID\n");
369 dst_ha = neigh->ha; 366 dst_ha = neigh->ha;
370 read_lock_bh(&neigh->lock); 367 read_lock_bh(&neigh->lock);
371 } else { 368 } else {
@@ -452,7 +449,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr,
452{ 449{
453 switch (addr_hint) { 450 switch (addr_hint) {
454 case RTN_LOCAL: 451 case RTN_LOCAL:
455 printk(KERN_DEBUG "ARP: arp called for own IP address\n"); 452 pr_debug("arp called for own IP address\n");
456 memcpy(haddr, dev->dev_addr, dev->addr_len); 453 memcpy(haddr, dev->dev_addr, dev->addr_len);
457 return 1; 454 return 1;
458 case RTN_MULTICAST: 455 case RTN_MULTICAST:
@@ -473,7 +470,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
473 struct neighbour *n; 470 struct neighbour *n;
474 471
475 if (!skb_dst(skb)) { 472 if (!skb_dst(skb)) {
476 printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); 473 pr_debug("arp_find is called with dst==NULL\n");
477 kfree_skb(skb); 474 kfree_skb(skb);
478 return 1; 475 return 1;
479 } 476 }
@@ -648,12 +645,6 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
648 arp->ar_pro = htons(ETH_P_IP); 645 arp->ar_pro = htons(ETH_P_IP);
649 break; 646 break;
650#endif 647#endif
651#if IS_ENABLED(CONFIG_TR)
652 case ARPHRD_IEEE802_TR:
653 arp->ar_hrd = htons(ARPHRD_IEEE802);
654 arp->ar_pro = htons(ETH_P_IP);
655 break;
656#endif
657 } 648 }
658 649
659 arp->ar_hln = dev->addr_len; 650 arp->ar_hln = dev->addr_len;
@@ -751,11 +742,10 @@ static int arp_process(struct sk_buff *skb)
751 goto out; 742 goto out;
752 break; 743 break;
753 case ARPHRD_ETHER: 744 case ARPHRD_ETHER:
754 case ARPHRD_IEEE802_TR:
755 case ARPHRD_FDDI: 745 case ARPHRD_FDDI:
756 case ARPHRD_IEEE802: 746 case ARPHRD_IEEE802:
757 /* 747 /*
758 * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802 748 * ETHERNET, and Fibre Channel (which are IEEE 802
759 * devices, according to RFC 2625) devices will accept ARP 749 * devices, according to RFC 2625) devices will accept ARP
760 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). 750 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
761 * This is the case also of FDDI, where the RFC 1390 says that 751 * This is the case also of FDDI, where the RFC 1390 says that
@@ -1059,7 +1049,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1059 neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); 1049 neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
1060 err = PTR_ERR(neigh); 1050 err = PTR_ERR(neigh);
1061 if (!IS_ERR(neigh)) { 1051 if (!IS_ERR(neigh)) {
1062 unsigned state = NUD_STALE; 1052 unsigned int state = NUD_STALE;
1063 if (r->arp_flags & ATF_PERM) 1053 if (r->arp_flags & ATF_PERM)
1064 state = NUD_PERMANENT; 1054 state = NUD_PERMANENT;
1065 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? 1055 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
@@ -1071,7 +1061,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1071 return err; 1061 return err;
1072} 1062}
1073 1063
1074static unsigned arp_state_to_flags(struct neighbour *neigh) 1064static unsigned int arp_state_to_flags(struct neighbour *neigh)
1075{ 1065{
1076 if (neigh->nud_state&NUD_PERMANENT) 1066 if (neigh->nud_state&NUD_PERMANENT)
1077 return ATF_PERM | ATF_COM; 1067 return ATF_PERM | ATF_COM;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6e447ff94dfa..10e15a144e95 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -217,8 +217,7 @@ void in_dev_finish_destroy(struct in_device *idev)
217 WARN_ON(idev->ifa_list); 217 WARN_ON(idev->ifa_list);
218 WARN_ON(idev->mc_list); 218 WARN_ON(idev->mc_list);
219#ifdef NET_REFCNT_DEBUG 219#ifdef NET_REFCNT_DEBUG
220 printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", 220 pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
221 idev, dev ? dev->name : "NIL");
222#endif 221#endif
223 dev_put(dev); 222 dev_put(dev);
224 if (!idev->dead) 223 if (!idev->dead)
@@ -1125,7 +1124,7 @@ skip:
1125 } 1124 }
1126} 1125}
1127 1126
1128static inline bool inetdev_valid_mtu(unsigned mtu) 1127static inline bool inetdev_valid_mtu(unsigned int mtu)
1129{ 1128{
1130 return mtu >= 68; 1129 return mtu >= 68;
1131} 1130}
@@ -1174,7 +1173,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1174 1173
1175 switch (event) { 1174 switch (event) {
1176 case NETDEV_REGISTER: 1175 case NETDEV_REGISTER:
1177 printk(KERN_DEBUG "inetdev_event: bug\n"); 1176 pr_debug("%s: bug\n", __func__);
1178 RCU_INIT_POINTER(dev->ip_ptr, NULL); 1177 RCU_INIT_POINTER(dev->ip_ptr, NULL);
1179 break; 1178 break;
1180 case NETDEV_UP: 1179 case NETDEV_UP:
@@ -1266,17 +1265,15 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1266 ifm->ifa_scope = ifa->ifa_scope; 1265 ifm->ifa_scope = ifa->ifa_scope;
1267 ifm->ifa_index = ifa->ifa_dev->dev->ifindex; 1266 ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
1268 1267
1269 if (ifa->ifa_address) 1268 if ((ifa->ifa_address &&
1270 NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address); 1269 nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
1271 1270 (ifa->ifa_local &&
1272 if (ifa->ifa_local) 1271 nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) ||
1273 NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local); 1272 (ifa->ifa_broadcast &&
1274 1273 nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
1275 if (ifa->ifa_broadcast) 1274 (ifa->ifa_label[0] &&
1276 NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); 1275 nla_put_string(skb, IFA_LABEL, ifa->ifa_label)))
1277 1276 goto nla_put_failure;
1278 if (ifa->ifa_label[0])
1279 NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
1280 1277
1281 return nlmsg_end(skb, nlh); 1278 return nlmsg_end(skb, nlh);
1282 1279
@@ -1587,7 +1584,6 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1587static struct devinet_sysctl_table { 1584static struct devinet_sysctl_table {
1588 struct ctl_table_header *sysctl_header; 1585 struct ctl_table_header *sysctl_header;
1589 struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; 1586 struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
1590 char *dev_name;
1591} devinet_sysctl = { 1587} devinet_sysctl = {
1592 .devinet_vars = { 1588 .devinet_vars = {
1593 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", 1589 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -1629,16 +1625,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1629{ 1625{
1630 int i; 1626 int i;
1631 struct devinet_sysctl_table *t; 1627 struct devinet_sysctl_table *t;
1632 1628 char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
1633#define DEVINET_CTL_PATH_DEV 3
1634
1635 struct ctl_path devinet_ctl_path[] = {
1636 { .procname = "net", },
1637 { .procname = "ipv4", },
1638 { .procname = "conf", },
1639 { /* to be set */ },
1640 { },
1641 };
1642 1629
1643 t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); 1630 t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
1644 if (!t) 1631 if (!t)
@@ -1650,27 +1637,15 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1650 t->devinet_vars[i].extra2 = net; 1637 t->devinet_vars[i].extra2 = net;
1651 } 1638 }
1652 1639
1653 /* 1640 snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
1654 * Make a copy of dev_name, because '.procname' is regarded as const
1655 * by sysctl and we wouldn't want anyone to change it under our feet
1656 * (see SIOCSIFNAME).
1657 */
1658 t->dev_name = kstrdup(dev_name, GFP_KERNEL);
1659 if (!t->dev_name)
1660 goto free;
1661
1662 devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
1663 1641
1664 t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, 1642 t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
1665 t->devinet_vars);
1666 if (!t->sysctl_header) 1643 if (!t->sysctl_header)
1667 goto free_procname; 1644 goto free;
1668 1645
1669 p->sysctl = t; 1646 p->sysctl = t;
1670 return 0; 1647 return 0;
1671 1648
1672free_procname:
1673 kfree(t->dev_name);
1674free: 1649free:
1675 kfree(t); 1650 kfree(t);
1676out: 1651out:
@@ -1686,7 +1661,6 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
1686 1661
1687 cnf->sysctl = NULL; 1662 cnf->sysctl = NULL;
1688 unregister_net_sysctl_table(t->sysctl_header); 1663 unregister_net_sysctl_table(t->sysctl_header);
1689 kfree(t->dev_name);
1690 kfree(t); 1664 kfree(t);
1691} 1665}
1692 1666
@@ -1716,12 +1690,6 @@ static struct ctl_table ctl_forward_entry[] = {
1716 }, 1690 },
1717 { }, 1691 { },
1718}; 1692};
1719
1720static __net_initdata struct ctl_path net_ipv4_path[] = {
1721 { .procname = "net", },
1722 { .procname = "ipv4", },
1723 { },
1724};
1725#endif 1693#endif
1726 1694
1727static __net_init int devinet_init_net(struct net *net) 1695static __net_init int devinet_init_net(struct net *net)
@@ -1767,7 +1735,7 @@ static __net_init int devinet_init_net(struct net *net)
1767 goto err_reg_dflt; 1735 goto err_reg_dflt;
1768 1736
1769 err = -ENOMEM; 1737 err = -ENOMEM;
1770 forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl); 1738 forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
1771 if (forw_hdr == NULL) 1739 if (forw_hdr == NULL)
1772 goto err_reg_ctl; 1740 goto err_reg_ctl;
1773 net->ipv4.forw_hdr = forw_hdr; 1741 net->ipv4.forw_hdr = forw_hdr;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cbe3a68507cf..3854411fa37c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -136,13 +136,13 @@ static void fib_flush(struct net *net)
136 * Find address type as if only "dev" was present in the system. If 136 * Find address type as if only "dev" was present in the system. If
137 * on_dev is NULL then all interfaces are taken into consideration. 137 * on_dev is NULL then all interfaces are taken into consideration.
138 */ 138 */
139static inline unsigned __inet_dev_addr_type(struct net *net, 139static inline unsigned int __inet_dev_addr_type(struct net *net,
140 const struct net_device *dev, 140 const struct net_device *dev,
141 __be32 addr) 141 __be32 addr)
142{ 142{
143 struct flowi4 fl4 = { .daddr = addr }; 143 struct flowi4 fl4 = { .daddr = addr };
144 struct fib_result res; 144 struct fib_result res;
145 unsigned ret = RTN_BROADCAST; 145 unsigned int ret = RTN_BROADCAST;
146 struct fib_table *local_table; 146 struct fib_table *local_table;
147 147
148 if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) 148 if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
@@ -740,7 +740,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
740#define BRD_OK 2 740#define BRD_OK 2
741#define BRD0_OK 4 741#define BRD0_OK 4
742#define BRD1_OK 8 742#define BRD1_OK 8
743 unsigned ok = 0; 743 unsigned int ok = 0;
744 int subnet = 0; /* Primary network */ 744 int subnet = 0; /* Primary network */
745 int gone = 1; /* Address is missing */ 745 int gone = 1; /* Address is missing */
746 int same_prefsrc = 0; /* Another primary with same IP */ 746 int same_prefsrc = 0; /* Another primary with same IP */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 799fc790b3cf..2d043f71ef70 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -221,15 +221,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
221 frh->src_len = rule4->src_len; 221 frh->src_len = rule4->src_len;
222 frh->tos = rule4->tos; 222 frh->tos = rule4->tos;
223 223
224 if (rule4->dst_len) 224 if ((rule4->dst_len &&
225 NLA_PUT_BE32(skb, FRA_DST, rule4->dst); 225 nla_put_be32(skb, FRA_DST, rule4->dst)) ||
226 226 (rule4->src_len &&
227 if (rule4->src_len) 227 nla_put_be32(skb, FRA_SRC, rule4->src)))
228 NLA_PUT_BE32(skb, FRA_SRC, rule4->src); 228 goto nla_put_failure;
229
230#ifdef CONFIG_IP_ROUTE_CLASSID 229#ifdef CONFIG_IP_ROUTE_CLASSID
231 if (rule4->tclassid) 230 if (rule4->tclassid &&
232 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); 231 nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
232 goto nla_put_failure;
233#endif 233#endif
234 return 0; 234 return 0;
235 235
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5063fa38ac7b..a8bdf7405433 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -931,33 +931,36 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
931 rtm->rtm_table = tb_id; 931 rtm->rtm_table = tb_id;
932 else 932 else
933 rtm->rtm_table = RT_TABLE_COMPAT; 933 rtm->rtm_table = RT_TABLE_COMPAT;
934 NLA_PUT_U32(skb, RTA_TABLE, tb_id); 934 if (nla_put_u32(skb, RTA_TABLE, tb_id))
935 goto nla_put_failure;
935 rtm->rtm_type = type; 936 rtm->rtm_type = type;
936 rtm->rtm_flags = fi->fib_flags; 937 rtm->rtm_flags = fi->fib_flags;
937 rtm->rtm_scope = fi->fib_scope; 938 rtm->rtm_scope = fi->fib_scope;
938 rtm->rtm_protocol = fi->fib_protocol; 939 rtm->rtm_protocol = fi->fib_protocol;
939 940
940 if (rtm->rtm_dst_len) 941 if (rtm->rtm_dst_len &&
941 NLA_PUT_BE32(skb, RTA_DST, dst); 942 nla_put_be32(skb, RTA_DST, dst))
942 943 goto nla_put_failure;
943 if (fi->fib_priority) 944 if (fi->fib_priority &&
944 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority); 945 nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
945 946 goto nla_put_failure;
946 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) 947 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
947 goto nla_put_failure; 948 goto nla_put_failure;
948 949
949 if (fi->fib_prefsrc) 950 if (fi->fib_prefsrc &&
950 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc); 951 nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc))
951 952 goto nla_put_failure;
952 if (fi->fib_nhs == 1) { 953 if (fi->fib_nhs == 1) {
953 if (fi->fib_nh->nh_gw) 954 if (fi->fib_nh->nh_gw &&
954 NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw); 955 nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
955 956 goto nla_put_failure;
956 if (fi->fib_nh->nh_oif) 957 if (fi->fib_nh->nh_oif &&
957 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); 958 nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
959 goto nla_put_failure;
958#ifdef CONFIG_IP_ROUTE_CLASSID 960#ifdef CONFIG_IP_ROUTE_CLASSID
959 if (fi->fib_nh[0].nh_tclassid) 961 if (fi->fib_nh[0].nh_tclassid &&
960 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); 962 nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
963 goto nla_put_failure;
961#endif 964#endif
962 } 965 }
963#ifdef CONFIG_IP_ROUTE_MULTIPATH 966#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -978,11 +981,13 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
978 rtnh->rtnh_hops = nh->nh_weight - 1; 981 rtnh->rtnh_hops = nh->nh_weight - 1;
979 rtnh->rtnh_ifindex = nh->nh_oif; 982 rtnh->rtnh_ifindex = nh->nh_oif;
980 983
981 if (nh->nh_gw) 984 if (nh->nh_gw &&
982 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); 985 nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw))
986 goto nla_put_failure;
983#ifdef CONFIG_IP_ROUTE_CLASSID 987#ifdef CONFIG_IP_ROUTE_CLASSID
984 if (nh->nh_tclassid) 988 if (nh->nh_tclassid &&
985 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); 989 nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
990 goto nla_put_failure;
986#endif 991#endif
987 /* length of rtnetlink header + attributes */ 992 /* length of rtnetlink header + attributes */
988 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; 993 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index bce36f1a37b4..30b88d7b4bd6 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1370,6 +1370,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1370 1370
1371 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) 1371 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1372 continue; 1372 continue;
1373 if (fi->fib_dead)
1374 continue;
1373 if (fa->fa_info->fib_scope < flp->flowi4_scope) 1375 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1374 continue; 1376 continue;
1375 fib_alias_accessed(fa); 1377 fib_alias_accessed(fa);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2cb2bf845641..c75efbdc71cb 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -713,11 +713,10 @@ static void icmp_unreach(struct sk_buff *skb)
713 713
714 if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && 714 if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
715 inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { 715 inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
716 if (net_ratelimit()) 716 net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
717 pr_warn("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", 717 &ip_hdr(skb)->saddr,
718 &ip_hdr(skb)->saddr, 718 icmph->type, icmph->code,
719 icmph->type, icmph->code, 719 &iph->daddr, skb->dev->name);
720 &iph->daddr, skb->dev->name);
721 goto out; 720 goto out;
722 } 721 }
723 722
@@ -906,8 +905,7 @@ out_err:
906static void icmp_address(struct sk_buff *skb) 905static void icmp_address(struct sk_buff *skb)
907{ 906{
908#if 0 907#if 0
909 if (net_ratelimit()) 908 net_dbg_ratelimited("a guy asks for address mask. Who is it?\n");
910 printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
911#endif 909#endif
912} 910}
913 911
@@ -943,10 +941,10 @@ static void icmp_address_reply(struct sk_buff *skb)
943 inet_ifa_match(ip_hdr(skb)->saddr, ifa)) 941 inet_ifa_match(ip_hdr(skb)->saddr, ifa))
944 break; 942 break;
945 } 943 }
946 if (!ifa && net_ratelimit()) { 944 if (!ifa)
947 pr_info("Wrong address mask %pI4 from %s/%pI4\n", 945 net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n",
948 mp, dev->name, &ip_hdr(skb)->saddr); 946 mp,
949 } 947 dev->name, &ip_hdr(skb)->saddr);
950 } 948 }
951} 949}
952 950
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5dfecfd7d5e9..6699f23e6f55 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -344,10 +344,10 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
344 pip->protocol = IPPROTO_IGMP; 344 pip->protocol = IPPROTO_IGMP;
345 pip->tot_len = 0; /* filled in later */ 345 pip->tot_len = 0; /* filled in later */
346 ip_select_ident(pip, &rt->dst, NULL); 346 ip_select_ident(pip, &rt->dst, NULL);
347 ((u8*)&pip[1])[0] = IPOPT_RA; 347 ((u8 *)&pip[1])[0] = IPOPT_RA;
348 ((u8*)&pip[1])[1] = 4; 348 ((u8 *)&pip[1])[1] = 4;
349 ((u8*)&pip[1])[2] = 0; 349 ((u8 *)&pip[1])[2] = 0;
350 ((u8*)&pip[1])[3] = 0; 350 ((u8 *)&pip[1])[3] = 0;
351 351
352 skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; 352 skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
353 skb_put(skb, sizeof(*pig)); 353 skb_put(skb, sizeof(*pig));
@@ -688,10 +688,10 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
688 iph->saddr = fl4.saddr; 688 iph->saddr = fl4.saddr;
689 iph->protocol = IPPROTO_IGMP; 689 iph->protocol = IPPROTO_IGMP;
690 ip_select_ident(iph, &rt->dst, NULL); 690 ip_select_ident(iph, &rt->dst, NULL);
691 ((u8*)&iph[1])[0] = IPOPT_RA; 691 ((u8 *)&iph[1])[0] = IPOPT_RA;
692 ((u8*)&iph[1])[1] = 4; 692 ((u8 *)&iph[1])[1] = 4;
693 ((u8*)&iph[1])[2] = 0; 693 ((u8 *)&iph[1])[2] = 0;
694 ((u8*)&iph[1])[3] = 0; 694 ((u8 *)&iph[1])[3] = 0;
695 695
696 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 696 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
697 ih->type = type; 697 ih->type = type;
@@ -774,7 +774,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
774 if (psf->sf_count[MCAST_INCLUDE] || 774 if (psf->sf_count[MCAST_INCLUDE] ||
775 pmc->sfcount[MCAST_EXCLUDE] != 775 pmc->sfcount[MCAST_EXCLUDE] !=
776 psf->sf_count[MCAST_EXCLUDE]) 776 psf->sf_count[MCAST_EXCLUDE])
777 continue; 777 break;
778 if (srcs[i] == psf->sf_inaddr) { 778 if (srcs[i] == psf->sf_inaddr) {
779 scount++; 779 scount++;
780 break; 780 break;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19d66cefd7d3..95e61596e605 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -42,7 +42,8 @@ EXPORT_SYMBOL(sysctl_local_reserved_ports);
42 42
43void inet_get_local_port_range(int *low, int *high) 43void inet_get_local_port_range(int *low, int *high)
44{ 44{
45 unsigned seq; 45 unsigned int seq;
46
46 do { 47 do {
47 seq = read_seqbegin(&sysctl_local_ports.lock); 48 seq = read_seqbegin(&sysctl_local_ports.lock);
48 49
@@ -53,7 +54,7 @@ void inet_get_local_port_range(int *low, int *high)
53EXPORT_SYMBOL(inet_get_local_port_range); 54EXPORT_SYMBOL(inet_get_local_port_range);
54 55
55int inet_csk_bind_conflict(const struct sock *sk, 56int inet_csk_bind_conflict(const struct sock *sk,
56 const struct inet_bind_bucket *tb) 57 const struct inet_bind_bucket *tb, bool relax)
57{ 58{
58 struct sock *sk2; 59 struct sock *sk2;
59 struct hlist_node *node; 60 struct hlist_node *node;
@@ -79,6 +80,14 @@ int inet_csk_bind_conflict(const struct sock *sk,
79 sk2_rcv_saddr == sk_rcv_saddr(sk)) 80 sk2_rcv_saddr == sk_rcv_saddr(sk))
80 break; 81 break;
81 } 82 }
83 if (!relax && reuse && sk2->sk_reuse &&
84 sk2->sk_state != TCP_LISTEN) {
85 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
86
87 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
88 sk2_rcv_saddr == sk_rcv_saddr(sk))
89 break;
90 }
82 } 91 }
83 } 92 }
84 return node != NULL; 93 return node != NULL;
@@ -122,12 +131,13 @@ again:
122 (tb->num_owners < smallest_size || smallest_size == -1)) { 131 (tb->num_owners < smallest_size || smallest_size == -1)) {
123 smallest_size = tb->num_owners; 132 smallest_size = tb->num_owners;
124 smallest_rover = rover; 133 smallest_rover = rover;
125 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { 134 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
135 !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
126 snum = smallest_rover; 136 snum = smallest_rover;
127 goto tb_found; 137 goto tb_found;
128 } 138 }
129 } 139 }
130 if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { 140 if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
131 snum = rover; 141 snum = rover;
132 goto tb_found; 142 goto tb_found;
133 } 143 }
@@ -172,18 +182,22 @@ have_snum:
172 goto tb_not_found; 182 goto tb_not_found;
173tb_found: 183tb_found:
174 if (!hlist_empty(&tb->owners)) { 184 if (!hlist_empty(&tb->owners)) {
185 if (sk->sk_reuse == SK_FORCE_REUSE)
186 goto success;
187
175 if (tb->fastreuse > 0 && 188 if (tb->fastreuse > 0 &&
176 sk->sk_reuse && sk->sk_state != TCP_LISTEN && 189 sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
177 smallest_size == -1) { 190 smallest_size == -1) {
178 goto success; 191 goto success;
179 } else { 192 } else {
180 ret = 1; 193 ret = 1;
181 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { 194 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
182 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && 195 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
183 smallest_size != -1 && --attempts >= 0) { 196 smallest_size != -1 && --attempts >= 0) {
184 spin_unlock(&head->lock); 197 spin_unlock(&head->lock);
185 goto again; 198 goto again;
186 } 199 }
200
187 goto fail_unlock; 201 goto fail_unlock;
188 } 202 }
189 } 203 }
@@ -514,7 +528,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
514 528
515 /* Normally all the openreqs are young and become mature 529 /* Normally all the openreqs are young and become mature
516 * (i.e. converted to established socket) for first timeout. 530 * (i.e. converted to established socket) for first timeout.
517 * If synack was not acknowledged for 3 seconds, it means 531 * If synack was not acknowledged for 1 second, it means
518 * one of the following things: synack was lost, ack was lost, 532 * one of the following things: synack was lost, ack was lost,
519 * rtt is high or nobody planned to ack (i.e. synflood). 533 * rtt is high or nobody planned to ack (i.e. synflood).
520 * When server is a bit loaded, queue is populated with old 534 * When server is a bit loaded, queue is populated with old
@@ -555,8 +569,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
555 syn_ack_recalc(req, thresh, max_retries, 569 syn_ack_recalc(req, thresh, max_retries,
556 queue->rskq_defer_accept, 570 queue->rskq_defer_accept,
557 &expire, &resend); 571 &expire, &resend);
558 if (req->rsk_ops->syn_ack_timeout) 572 req->rsk_ops->syn_ack_timeout(parent, req);
559 req->rsk_ops->syn_ack_timeout(parent, req);
560 if (!expire && 573 if (!expire &&
561 (!resend || 574 (!resend ||
562 !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || 575 !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8d25a1c557eb..46d1e7199a8c 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -141,7 +141,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
141 goto rtattr_failure; 141 goto rtattr_failure;
142 142
143 if (icsk == NULL) { 143 if (icsk == NULL) {
144 r->idiag_rqueue = r->idiag_wqueue = 0; 144 handler->idiag_get_info(sk, r, NULL);
145 goto out; 145 goto out;
146 } 146 }
147 147
@@ -999,12 +999,12 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
999 return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h)); 999 return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h));
1000} 1000}
1001 1001
1002static struct sock_diag_handler inet_diag_handler = { 1002static const struct sock_diag_handler inet_diag_handler = {
1003 .family = AF_INET, 1003 .family = AF_INET,
1004 .dump = inet_diag_handler_dump, 1004 .dump = inet_diag_handler_dump,
1005}; 1005};
1006 1006
1007static struct sock_diag_handler inet6_diag_handler = { 1007static const struct sock_diag_handler inet6_diag_handler = {
1008 .family = AF_INET6, 1008 .family = AF_INET6,
1009 .dump = inet_diag_handler_dump, 1009 .dump = inet_diag_handler_dump,
1010}; 1010};
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 984ec656b03b..7880af970208 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -217,7 +217,7 @@ begin:
217} 217}
218EXPORT_SYMBOL_GPL(__inet_lookup_listener); 218EXPORT_SYMBOL_GPL(__inet_lookup_listener);
219 219
220struct sock * __inet_lookup_established(struct net *net, 220struct sock *__inet_lookup_established(struct net *net,
221 struct inet_hashinfo *hashinfo, 221 struct inet_hashinfo *hashinfo,
222 const __be32 saddr, const __be16 sport, 222 const __be32 saddr, const __be16 sport,
223 const __be32 daddr, const u16 hnum, 223 const __be32 daddr, const u16 hnum,
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 89168c6351ff..2784db3155fb 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -89,8 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
89 89
90#ifdef SOCK_REFCNT_DEBUG 90#ifdef SOCK_REFCNT_DEBUG
91 if (atomic_read(&tw->tw_refcnt) != 1) { 91 if (atomic_read(&tw->tw_refcnt) != 1) {
92 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", 92 pr_debug("%s timewait_sock %p refcnt=%d\n",
93 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); 93 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
94 } 94 }
95#endif 95#endif
96 while (refcnt) { 96 while (refcnt) {
@@ -263,7 +263,7 @@ rescan:
263void inet_twdr_hangman(unsigned long data) 263void inet_twdr_hangman(unsigned long data)
264{ 264{
265 struct inet_timewait_death_row *twdr; 265 struct inet_timewait_death_row *twdr;
266 int unsigned need_timer; 266 unsigned int need_timer;
267 267
268 twdr = (struct inet_timewait_death_row *)data; 268 twdr = (struct inet_timewait_death_row *)data;
269 spin_lock(&twdr->death_lock); 269 spin_lock(&twdr->death_lock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 29a07b6c7168..e5c44fc586ab 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -41,7 +41,7 @@
41 41
42static int ip_forward_finish(struct sk_buff *skb) 42static int ip_forward_finish(struct sk_buff *skb)
43{ 43{
44 struct ip_options * opt = &(IPCB(skb)->opt); 44 struct ip_options *opt = &(IPCB(skb)->opt);
45 45
46 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 46 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
47 47
@@ -55,7 +55,7 @@ int ip_forward(struct sk_buff *skb)
55{ 55{
56 struct iphdr *iph; /* Our header */ 56 struct iphdr *iph; /* Our header */
57 struct rtable *rt; /* Route we use */ 57 struct rtable *rt; /* Route we use */
58 struct ip_options * opt = &(IPCB(skb)->opt); 58 struct ip_options *opt = &(IPCB(skb)->opt);
59 59
60 if (skb_warn_if_lro(skb)) 60 if (skb_warn_if_lro(skb))
61 goto drop; 61 goto drop;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 3727e234c884..9dbd3dd6022d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -148,17 +148,17 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
148 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); 148 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
149} 149}
150 150
151static int ip4_frag_match(struct inet_frag_queue *q, void *a) 151static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
152{ 152{
153 struct ipq *qp; 153 struct ipq *qp;
154 struct ip4_create_arg *arg = a; 154 struct ip4_create_arg *arg = a;
155 155
156 qp = container_of(q, struct ipq, q); 156 qp = container_of(q, struct ipq, q);
157 return qp->id == arg->iph->id && 157 return qp->id == arg->iph->id &&
158 qp->saddr == arg->iph->saddr && 158 qp->saddr == arg->iph->saddr &&
159 qp->daddr == arg->iph->daddr && 159 qp->daddr == arg->iph->daddr &&
160 qp->protocol == arg->iph->protocol && 160 qp->protocol == arg->iph->protocol &&
161 qp->user == arg->user; 161 qp->user == arg->user;
162} 162}
163 163
164/* Memory Tracking Functions. */ 164/* Memory Tracking Functions. */
@@ -545,6 +545,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
545 int len; 545 int len;
546 int ihlen; 546 int ihlen;
547 int err; 547 int err;
548 int sum_truesize;
548 u8 ecn; 549 u8 ecn;
549 550
550 ipq_kill(qp); 551 ipq_kill(qp);
@@ -569,7 +570,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
569 skb_morph(head, qp->q.fragments); 570 skb_morph(head, qp->q.fragments);
570 head->next = qp->q.fragments->next; 571 head->next = qp->q.fragments->next;
571 572
572 kfree_skb(qp->q.fragments); 573 consume_skb(qp->q.fragments);
573 qp->q.fragments = head; 574 qp->q.fragments = head;
574 } 575 }
575 576
@@ -611,19 +612,32 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
611 atomic_add(clone->truesize, &qp->q.net->mem); 612 atomic_add(clone->truesize, &qp->q.net->mem);
612 } 613 }
613 614
614 skb_shinfo(head)->frag_list = head->next;
615 skb_push(head, head->data - skb_network_header(head)); 615 skb_push(head, head->data - skb_network_header(head));
616 616
617 for (fp=head->next; fp; fp = fp->next) { 617 sum_truesize = head->truesize;
618 head->data_len += fp->len; 618 for (fp = head->next; fp;) {
619 head->len += fp->len; 619 bool headstolen;
620 int delta;
621 struct sk_buff *next = fp->next;
622
623 sum_truesize += fp->truesize;
620 if (head->ip_summed != fp->ip_summed) 624 if (head->ip_summed != fp->ip_summed)
621 head->ip_summed = CHECKSUM_NONE; 625 head->ip_summed = CHECKSUM_NONE;
622 else if (head->ip_summed == CHECKSUM_COMPLETE) 626 else if (head->ip_summed == CHECKSUM_COMPLETE)
623 head->csum = csum_add(head->csum, fp->csum); 627 head->csum = csum_add(head->csum, fp->csum);
624 head->truesize += fp->truesize; 628
629 if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
630 kfree_skb_partial(fp, headstolen);
631 } else {
632 if (!skb_shinfo(head)->frag_list)
633 skb_shinfo(head)->frag_list = fp;
634 head->data_len += fp->len;
635 head->len += fp->len;
636 head->truesize += fp->truesize;
637 }
638 fp = next;
625 } 639 }
626 atomic_sub(head->truesize, &qp->q.net->mem); 640 atomic_sub(sum_truesize, &qp->q.net->mem);
627 641
628 head->next = NULL; 642 head->next = NULL;
629 head->dev = dev; 643 head->dev = dev;
@@ -644,8 +658,7 @@ out_nomem:
644 err = -ENOMEM; 658 err = -ENOMEM;
645 goto out_fail; 659 goto out_fail;
646out_oversize: 660out_oversize:
647 if (net_ratelimit()) 661 net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
648 pr_info("Oversized IP packet from %pI4\n", &qp->saddr);
649out_fail: 662out_fail:
650 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); 663 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
651 return err; 664 return err;
@@ -782,7 +795,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
782 table[2].data = &net->ipv4.frags.timeout; 795 table[2].data = &net->ipv4.frags.timeout;
783 } 796 }
784 797
785 hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); 798 hdr = register_net_sysctl(net, "net/ipv4", table);
786 if (hdr == NULL) 799 if (hdr == NULL)
787 goto err_reg; 800 goto err_reg;
788 801
@@ -807,7 +820,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
807 820
808static void ip4_frags_ctl_register(void) 821static void ip4_frags_ctl_register(void)
809{ 822{
810 register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); 823 register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
811} 824}
812#else 825#else
813static inline int ip4_frags_ns_ctl_register(struct net *net) 826static inline int ip4_frags_ns_ctl_register(struct net *net)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index b57532d4742c..f49047b79609 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -169,37 +169,56 @@ struct ipgre_net {
169 169
170/* often modified stats are per cpu, other are shared (netdev->stats) */ 170/* often modified stats are per cpu, other are shared (netdev->stats) */
171struct pcpu_tstats { 171struct pcpu_tstats {
172 unsigned long rx_packets; 172 u64 rx_packets;
173 unsigned long rx_bytes; 173 u64 rx_bytes;
174 unsigned long tx_packets; 174 u64 tx_packets;
175 unsigned long tx_bytes; 175 u64 tx_bytes;
176} __attribute__((aligned(4*sizeof(unsigned long)))); 176 struct u64_stats_sync syncp;
177};
177 178
178static struct net_device_stats *ipgre_get_stats(struct net_device *dev) 179static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
180 struct rtnl_link_stats64 *tot)
179{ 181{
180 struct pcpu_tstats sum = { 0 };
181 int i; 182 int i;
182 183
183 for_each_possible_cpu(i) { 184 for_each_possible_cpu(i) {
184 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); 185 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
185 186 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
186 sum.rx_packets += tstats->rx_packets; 187 unsigned int start;
187 sum.rx_bytes += tstats->rx_bytes; 188
188 sum.tx_packets += tstats->tx_packets; 189 do {
189 sum.tx_bytes += tstats->tx_bytes; 190 start = u64_stats_fetch_begin_bh(&tstats->syncp);
191 rx_packets = tstats->rx_packets;
192 tx_packets = tstats->tx_packets;
193 rx_bytes = tstats->rx_bytes;
194 tx_bytes = tstats->tx_bytes;
195 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
196
197 tot->rx_packets += rx_packets;
198 tot->tx_packets += tx_packets;
199 tot->rx_bytes += rx_bytes;
200 tot->tx_bytes += tx_bytes;
190 } 201 }
191 dev->stats.rx_packets = sum.rx_packets; 202
192 dev->stats.rx_bytes = sum.rx_bytes; 203 tot->multicast = dev->stats.multicast;
193 dev->stats.tx_packets = sum.tx_packets; 204 tot->rx_crc_errors = dev->stats.rx_crc_errors;
194 dev->stats.tx_bytes = sum.tx_bytes; 205 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
195 return &dev->stats; 206 tot->rx_length_errors = dev->stats.rx_length_errors;
207 tot->rx_errors = dev->stats.rx_errors;
208 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
209 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
210 tot->tx_dropped = dev->stats.tx_dropped;
211 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
212 tot->tx_errors = dev->stats.tx_errors;
213
214 return tot;
196} 215}
197 216
198/* Given src, dst and key, find appropriate for input tunnel. */ 217/* Given src, dst and key, find appropriate for input tunnel. */
199 218
200static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, 219static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
201 __be32 remote, __be32 local, 220 __be32 remote, __be32 local,
202 __be32 key, __be16 gre_proto) 221 __be32 key, __be16 gre_proto)
203{ 222{
204 struct net *net = dev_net(dev); 223 struct net *net = dev_net(dev);
205 int link = dev->ifindex; 224 int link = dev->ifindex;
@@ -464,7 +483,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
464 */ 483 */
465 484
466 const struct iphdr *iph = (const struct iphdr *)skb->data; 485 const struct iphdr *iph = (const struct iphdr *)skb->data;
467 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); 486 __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
468 int grehlen = (iph->ihl<<2) + 4; 487 int grehlen = (iph->ihl<<2) + 4;
469 const int type = icmp_hdr(skb)->type; 488 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code; 489 const int code = icmp_hdr(skb)->code;
@@ -574,7 +593,7 @@ static int ipgre_rcv(struct sk_buff *skb)
574 593
575 iph = ip_hdr(skb); 594 iph = ip_hdr(skb);
576 h = skb->data; 595 h = skb->data;
577 flags = *(__be16*)h; 596 flags = *(__be16 *)h;
578 597
579 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { 598 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
580 /* - Version must be 0. 599 /* - Version must be 0.
@@ -598,11 +617,11 @@ static int ipgre_rcv(struct sk_buff *skb)
598 offset += 4; 617 offset += 4;
599 } 618 }
600 if (flags&GRE_KEY) { 619 if (flags&GRE_KEY) {
601 key = *(__be32*)(h + offset); 620 key = *(__be32 *)(h + offset);
602 offset += 4; 621 offset += 4;
603 } 622 }
604 if (flags&GRE_SEQ) { 623 if (flags&GRE_SEQ) {
605 seqno = ntohl(*(__be32*)(h + offset)); 624 seqno = ntohl(*(__be32 *)(h + offset));
606 offset += 4; 625 offset += 4;
607 } 626 }
608 } 627 }
@@ -672,8 +691,10 @@ static int ipgre_rcv(struct sk_buff *skb)
672 } 691 }
673 692
674 tstats = this_cpu_ptr(tunnel->dev->tstats); 693 tstats = this_cpu_ptr(tunnel->dev->tstats);
694 u64_stats_update_begin(&tstats->syncp);
675 tstats->rx_packets++; 695 tstats->rx_packets++;
676 tstats->rx_bytes += skb->len; 696 tstats->rx_bytes += skb->len;
697 u64_stats_update_end(&tstats->syncp);
677 698
678 __skb_tunnel_rx(skb, tunnel->dev); 699 __skb_tunnel_rx(skb, tunnel->dev);
679 700
@@ -900,7 +921,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
900 htons(ETH_P_TEB) : skb->protocol; 921 htons(ETH_P_TEB) : skb->protocol;
901 922
902 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { 923 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
903 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); 924 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
904 925
905 if (tunnel->parms.o_flags&GRE_SEQ) { 926 if (tunnel->parms.o_flags&GRE_SEQ) {
906 ++tunnel->o_seqno; 927 ++tunnel->o_seqno;
@@ -913,7 +934,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
913 } 934 }
914 if (tunnel->parms.o_flags&GRE_CSUM) { 935 if (tunnel->parms.o_flags&GRE_CSUM) {
915 *ptr = 0; 936 *ptr = 0;
916 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); 937 *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
917 } 938 }
918 } 939 }
919 940
@@ -1169,7 +1190,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1169{ 1190{
1170 struct ip_tunnel *t = netdev_priv(dev); 1191 struct ip_tunnel *t = netdev_priv(dev);
1171 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1192 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1172 __be16 *p = (__be16*)(iph+1); 1193 __be16 *p = (__be16 *)(iph+1);
1173 1194
1174 memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); 1195 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1175 p[0] = t->parms.o_flags; 1196 p[0] = t->parms.o_flags;
@@ -1253,7 +1274,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
1253 .ndo_start_xmit = ipgre_tunnel_xmit, 1274 .ndo_start_xmit = ipgre_tunnel_xmit,
1254 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1275 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1255 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1276 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1256 .ndo_get_stats = ipgre_get_stats, 1277 .ndo_get_stats64 = ipgre_get_stats64,
1257}; 1278};
1258 1279
1259static void ipgre_dev_free(struct net_device *dev) 1280static void ipgre_dev_free(struct net_device *dev)
@@ -1507,7 +1528,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1507 .ndo_set_mac_address = eth_mac_addr, 1528 .ndo_set_mac_address = eth_mac_addr,
1508 .ndo_validate_addr = eth_validate_addr, 1529 .ndo_validate_addr = eth_validate_addr,
1509 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1530 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1510 .ndo_get_stats = ipgre_get_stats, 1531 .ndo_get_stats64 = ipgre_get_stats64,
1511}; 1532};
1512 1533
1513static void ipgre_tap_setup(struct net_device *dev) 1534static void ipgre_tap_setup(struct net_device *dev)
@@ -1654,17 +1675,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1654 struct ip_tunnel *t = netdev_priv(dev); 1675 struct ip_tunnel *t = netdev_priv(dev);
1655 struct ip_tunnel_parm *p = &t->parms; 1676 struct ip_tunnel_parm *p = &t->parms;
1656 1677
1657 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); 1678 if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1658 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); 1679 nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1659 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); 1680 nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1660 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); 1681 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1661 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); 1682 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1662 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr); 1683 nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1663 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr); 1684 nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1664 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); 1685 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1665 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); 1686 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1666 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); 1687 nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1667 1688 !!(p->iph.frag_off & htons(IP_DF))))
1689 goto nla_put_failure;
1668 return 0; 1690 return 0;
1669 1691
1670nla_put_failure: 1692nla_put_failure:
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 26eccc5bab1c..8590144ca330 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -210,9 +210,8 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
210 int ret; 210 int ret;
211 211
212 if (!net_eq(net, &init_net) && !ipprot->netns_ok) { 212 if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
213 if (net_ratelimit()) 213 net_info_ratelimited("%s: proto %d isn't netns-ready\n",
214 printk("%s: proto %d isn't netns-ready\n", 214 __func__, protocol);
215 __func__, protocol);
216 kfree_skb(skb); 215 kfree_skb(skb);
217 goto out; 216 goto out;
218 } 217 }
@@ -298,10 +297,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
298 297
299 if (in_dev) { 298 if (in_dev) {
300 if (!IN_DEV_SOURCE_ROUTE(in_dev)) { 299 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
301 if (IN_DEV_LOG_MARTIANS(in_dev) && 300 if (IN_DEV_LOG_MARTIANS(in_dev))
302 net_ratelimit()) 301 net_info_ratelimited("source route option %pI4 -> %pI4\n",
303 pr_info("source route option %pI4 -> %pI4\n", 302 &iph->saddr,
304 &iph->saddr, &iph->daddr); 303 &iph->daddr);
305 goto drop; 304 goto drop;
306 } 305 }
307 } 306 }
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index a0d0d9d9b870..708b99494e23 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -210,10 +210,10 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
210 * Simple and stupid 8), but the most efficient way. 210 * Simple and stupid 8), but the most efficient way.
211 */ 211 */
212 212
213void ip_options_fragment(struct sk_buff * skb) 213void ip_options_fragment(struct sk_buff *skb)
214{ 214{
215 unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); 215 unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
216 struct ip_options * opt = &(IPCB(skb)->opt); 216 struct ip_options *opt = &(IPCB(skb)->opt);
217 int l = opt->optlen; 217 int l = opt->optlen;
218 int optlen; 218 int optlen;
219 219
@@ -248,13 +248,13 @@ void ip_options_fragment(struct sk_buff * skb)
248 */ 248 */
249 249
250int ip_options_compile(struct net *net, 250int ip_options_compile(struct net *net,
251 struct ip_options * opt, struct sk_buff * skb) 251 struct ip_options *opt, struct sk_buff *skb)
252{ 252{
253 int l; 253 int l;
254 unsigned char * iph; 254 unsigned char *iph;
255 unsigned char * optptr; 255 unsigned char *optptr;
256 int optlen; 256 int optlen;
257 unsigned char * pp_ptr = NULL; 257 unsigned char *pp_ptr = NULL;
258 struct rtable *rt = NULL; 258 struct rtable *rt = NULL;
259 259
260 if (skb != NULL) { 260 if (skb != NULL) {
@@ -413,7 +413,7 @@ int ip_options_compile(struct net *net,
413 opt->is_changed = 1; 413 opt->is_changed = 1;
414 } 414 }
415 } else { 415 } else {
416 unsigned overflow = optptr[3]>>4; 416 unsigned int overflow = optptr[3]>>4;
417 if (overflow == 15) { 417 if (overflow == 15) {
418 pp_ptr = optptr + 3; 418 pp_ptr = optptr + 3;
419 goto error; 419 goto error;
@@ -473,20 +473,20 @@ EXPORT_SYMBOL(ip_options_compile);
473 * Undo all the changes done by ip_options_compile(). 473 * Undo all the changes done by ip_options_compile().
474 */ 474 */
475 475
476void ip_options_undo(struct ip_options * opt) 476void ip_options_undo(struct ip_options *opt)
477{ 477{
478 if (opt->srr) { 478 if (opt->srr) {
479 unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr); 479 unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
480 memmove(optptr+7, optptr+3, optptr[1]-7); 480 memmove(optptr+7, optptr+3, optptr[1]-7);
481 memcpy(optptr+3, &opt->faddr, 4); 481 memcpy(optptr+3, &opt->faddr, 4);
482 } 482 }
483 if (opt->rr_needaddr) { 483 if (opt->rr_needaddr) {
484 unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr); 484 unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
485 optptr[2] -= 4; 485 optptr[2] -= 4;
486 memset(&optptr[optptr[2]-1], 0, 4); 486 memset(&optptr[optptr[2]-1], 0, 4);
487 } 487 }
488 if (opt->ts) { 488 if (opt->ts) {
489 unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr); 489 unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
490 if (opt->ts_needtime) { 490 if (opt->ts_needtime) {
491 optptr[2] -= 4; 491 optptr[2] -= 4;
492 memset(&optptr[optptr[2]-1], 0, 4); 492 memset(&optptr[optptr[2]-1], 0, 4);
@@ -549,8 +549,8 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp,
549 549
550void ip_forward_options(struct sk_buff *skb) 550void ip_forward_options(struct sk_buff *skb)
551{ 551{
552 struct ip_options * opt = &(IPCB(skb)->opt); 552 struct ip_options *opt = &(IPCB(skb)->opt);
553 unsigned char * optptr; 553 unsigned char *optptr;
554 struct rtable *rt = skb_rtable(skb); 554 struct rtable *rt = skb_rtable(skb);
555 unsigned char *raw = skb_network_header(skb); 555 unsigned char *raw = skb_network_header(skb);
556 556
@@ -578,8 +578,10 @@ void ip_forward_options(struct sk_buff *skb)
578 ip_hdr(skb)->daddr = opt->nexthop; 578 ip_hdr(skb)->daddr = opt->nexthop;
579 ip_rt_get_source(&optptr[srrptr-1], skb, rt); 579 ip_rt_get_source(&optptr[srrptr-1], skb, rt);
580 optptr[2] = srrptr+4; 580 optptr[2] = srrptr+4;
581 } else if (net_ratelimit()) 581 } else {
582 pr_crit("%s(): Argh! Destination lost!\n", __func__); 582 net_crit_ratelimited("%s(): Argh! Destination lost!\n",
583 __func__);
584 }
583 if (opt->ts_needaddr) { 585 if (opt->ts_needaddr) {
584 optptr = raw + opt->ts; 586 optptr = raw + opt->ts;
585 ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); 587 ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4910176d24ed..451f97c42eb4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -214,8 +214,8 @@ static inline int ip_finish_output2(struct sk_buff *skb)
214 } 214 }
215 rcu_read_unlock(); 215 rcu_read_unlock();
216 216
217 if (net_ratelimit()) 217 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
218 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); 218 __func__);
219 kfree_skb(skb); 219 kfree_skb(skb);
220 return -EINVAL; 220 return -EINVAL;
221} 221}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 2fd0fba77124..0d11f234d615 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -90,7 +90,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
90static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) 90static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
91{ 91{
92 unsigned char optbuf[sizeof(struct ip_options) + 40]; 92 unsigned char optbuf[sizeof(struct ip_options) + 40];
93 struct ip_options * opt = (struct ip_options *)optbuf; 93 struct ip_options *opt = (struct ip_options *)optbuf;
94 94
95 if (IPCB(skb)->opt.optlen == 0) 95 if (IPCB(skb)->opt.optlen == 0)
96 return; 96 return;
@@ -147,7 +147,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
147void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) 147void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
148{ 148{
149 struct inet_sock *inet = inet_sk(skb->sk); 149 struct inet_sock *inet = inet_sk(skb->sk);
150 unsigned flags = inet->cmsg_flags; 150 unsigned int flags = inet->cmsg_flags;
151 151
152 /* Ordered by supposed usage frequency */ 152 /* Ordered by supposed usage frequency */
153 if (flags & 1) 153 if (flags & 1)
@@ -673,10 +673,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
673 break; 673 break;
674 } else { 674 } else {
675 memset(&mreq, 0, sizeof(mreq)); 675 memset(&mreq, 0, sizeof(mreq));
676 if (optlen >= sizeof(struct in_addr) && 676 if (optlen >= sizeof(struct ip_mreq)) {
677 copy_from_user(&mreq.imr_address, optval, 677 if (copy_from_user(&mreq, optval,
678 sizeof(struct in_addr))) 678 sizeof(struct ip_mreq)))
679 break; 679 break;
680 } else if (optlen >= sizeof(struct in_addr)) {
681 if (copy_from_user(&mreq.imr_address, optval,
682 sizeof(struct in_addr)))
683 break;
684 }
680 } 685 }
681 686
682 if (!mreq.imr_ifindex) { 687 if (!mreq.imr_ifindex) {
@@ -1094,7 +1099,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
1094 */ 1099 */
1095 1100
1096static int do_ip_getsockopt(struct sock *sk, int level, int optname, 1101static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1097 char __user *optval, int __user *optlen, unsigned flags) 1102 char __user *optval, int __user *optlen, unsigned int flags)
1098{ 1103{
1099 struct inet_sock *inet = inet_sk(sk); 1104 struct inet_sock *inet = inet_sk(sk);
1100 int val; 1105 int val;
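
Note: the do_ip_setsockopt() hunk above adjusts how IP_MULTICAST_IF copies its argument when the caller passes something smaller than a struct ip_mreqn: a full struct ip_mreq is now accepted in addition to a bare struct in_addr. For reference, a minimal userspace sketch of the documented preferred form of this option (struct ip_mreqn with an explicit ifindex, handled earlier in the same switch case rather than in the visible hunk); the helper name and the absence of error handling are illustrative only.

#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

/* Select the egress interface for multicast sends.  Passing a struct
 * ip_mreqn lets the kernel use imr_ifindex directly; callers that pass
 * only a struct ip_mreq or struct in_addr fall into the optlen branching
 * shown in the hunk above. */
static int set_mcast_if(int sock, const char *ifname)
{
	struct ip_mreqn mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.imr_ifindex = if_nametoindex(ifname);

	return setsockopt(sock, IPPROTO_IP, IP_MULTICAST_IF,
			  &mreq, sizeof(mreq));
}
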
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 92ac7e7363a0..67e8a6b086ea 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -808,8 +808,6 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
808 b->op = BOOTP_REQUEST; 808 b->op = BOOTP_REQUEST;
809 if (dev->type < 256) /* check for false types */ 809 if (dev->type < 256) /* check for false types */
810 b->htype = dev->type; 810 b->htype = dev->type;
811 else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
812 b->htype = ARPHRD_IEEE802;
813 else if (dev->type == ARPHRD_FDDI) 811 else if (dev->type == ARPHRD_FDDI)
814 b->htype = ARPHRD_ETHER; 812 b->htype = ARPHRD_ETHER;
815 else { 813 else {
@@ -955,8 +953,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
955 953
956 /* Fragments are not supported */ 954 /* Fragments are not supported */
957 if (ip_is_fragment(h)) { 955 if (ip_is_fragment(h)) {
958 if (net_ratelimit()) 956 net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");
959 pr_err("DHCP/BOOTP: Ignoring fragmented reply\n");
960 goto drop; 957 goto drop;
961 } 958 }
962 959
@@ -1004,16 +1001,14 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1004 /* Is it a reply to our BOOTP request? */ 1001 /* Is it a reply to our BOOTP request? */
1005 if (b->op != BOOTP_REPLY || 1002 if (b->op != BOOTP_REPLY ||
1006 b->xid != d->xid) { 1003 b->xid != d->xid) {
1007 if (net_ratelimit()) 1004 net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
1008 pr_err("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", 1005 b->op, b->xid);
1009 b->op, b->xid);
1010 goto drop_unlock; 1006 goto drop_unlock;
1011 } 1007 }
1012 1008
1013 /* Is it a reply for the device we are configuring? */ 1009 /* Is it a reply for the device we are configuring? */
1014 if (b->xid != ic_dev_xid) { 1010 if (b->xid != ic_dev_xid) {
1015 if (net_ratelimit()) 1011 net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
1016 pr_err("DHCP/BOOTP: Ignoring delayed packet\n");
1017 goto drop_unlock; 1012 goto drop_unlock;
1018 } 1013 }
1019 1014
@@ -1198,7 +1193,7 @@ static int __init ic_dynamic(void)
1198 d = ic_first_dev; 1193 d = ic_first_dev;
1199 retries = CONF_SEND_RETRIES; 1194 retries = CONF_SEND_RETRIES;
1200 get_random_bytes(&timeout, sizeof(timeout)); 1195 get_random_bytes(&timeout, sizeof(timeout));
1201 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); 1196 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
1202 for (;;) { 1197 for (;;) {
1203 /* Track the device we are configuring */ 1198 /* Track the device we are configuring */
1204 ic_dev_xid = d->xid; 1199 ic_dev_xid = d->xid;
@@ -1626,11 +1621,13 @@ static int __init ip_auto_config_setup(char *addrs)
1626 1621
1627 return 1; 1622 return 1;
1628} 1623}
1624__setup("ip=", ip_auto_config_setup);
1629 1625
1630static int __init nfsaddrs_config_setup(char *addrs) 1626static int __init nfsaddrs_config_setup(char *addrs)
1631{ 1627{
1632 return ip_auto_config_setup(addrs); 1628 return ip_auto_config_setup(addrs);
1633} 1629}
1630__setup("nfsaddrs=", nfsaddrs_config_setup);
1634 1631
1635static int __init vendor_class_identifier_setup(char *addrs) 1632static int __init vendor_class_identifier_setup(char *addrs)
1636{ 1633{
@@ -1641,7 +1638,4 @@ static int __init vendor_class_identifier_setup(char *addrs)
1641 vendor_class_identifier); 1638 vendor_class_identifier);
1642 return 1; 1639 return 1;
1643} 1640}
1644
1645__setup("ip=", ip_auto_config_setup);
1646__setup("nfsaddrs=", nfsaddrs_config_setup);
1647__setup("dhcpclass=", vendor_class_identifier_setup); 1641__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ae1413e3f2f8..2d0f99bf61b3 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -144,33 +144,48 @@ static void ipip_dev_free(struct net_device *dev);
144 144
145/* often modified stats are per cpu, other are shared (netdev->stats) */ 145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats { 146struct pcpu_tstats {
147 unsigned long rx_packets; 147 u64 rx_packets;
148 unsigned long rx_bytes; 148 u64 rx_bytes;
149 unsigned long tx_packets; 149 u64 tx_packets;
150 unsigned long tx_bytes; 150 u64 tx_bytes;
151} __attribute__((aligned(4*sizeof(unsigned long)))); 151 struct u64_stats_sync syncp;
152};
152 153
153static struct net_device_stats *ipip_get_stats(struct net_device *dev) 154static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
155 struct rtnl_link_stats64 *tot)
154{ 156{
155 struct pcpu_tstats sum = { 0 };
156 int i; 157 int i;
157 158
158 for_each_possible_cpu(i) { 159 for_each_possible_cpu(i) {
159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); 160 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
160 161 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
161 sum.rx_packets += tstats->rx_packets; 162 unsigned int start;
162 sum.rx_bytes += tstats->rx_bytes; 163
163 sum.tx_packets += tstats->tx_packets; 164 do {
164 sum.tx_bytes += tstats->tx_bytes; 165 start = u64_stats_fetch_begin_bh(&tstats->syncp);
166 rx_packets = tstats->rx_packets;
167 tx_packets = tstats->tx_packets;
168 rx_bytes = tstats->rx_bytes;
169 tx_bytes = tstats->tx_bytes;
170 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
171
172 tot->rx_packets += rx_packets;
173 tot->tx_packets += tx_packets;
174 tot->rx_bytes += rx_bytes;
175 tot->tx_bytes += tx_bytes;
165 } 176 }
166 dev->stats.rx_packets = sum.rx_packets; 177
167 dev->stats.rx_bytes = sum.rx_bytes; 178 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
168 dev->stats.tx_packets = sum.tx_packets; 179 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
169 dev->stats.tx_bytes = sum.tx_bytes; 180 tot->tx_dropped = dev->stats.tx_dropped;
170 return &dev->stats; 181 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
182 tot->tx_errors = dev->stats.tx_errors;
183 tot->collisions = dev->stats.collisions;
184
185 return tot;
171} 186}
172 187
173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, 188static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
174 __be32 remote, __be32 local) 189 __be32 remote, __be32 local)
175{ 190{
176 unsigned int h0 = HASH(remote); 191 unsigned int h0 = HASH(remote);
@@ -245,7 +260,7 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
245 rcu_assign_pointer(*tp, t); 260 rcu_assign_pointer(*tp, t);
246} 261}
247 262
248static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 263static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
249 struct ip_tunnel_parm *parms, int create) 264 struct ip_tunnel_parm *parms, int create)
250{ 265{
251 __be32 remote = parms->iph.daddr; 266 __be32 remote = parms->iph.daddr;
@@ -404,8 +419,10 @@ static int ipip_rcv(struct sk_buff *skb)
404 skb->pkt_type = PACKET_HOST; 419 skb->pkt_type = PACKET_HOST;
405 420
406 tstats = this_cpu_ptr(tunnel->dev->tstats); 421 tstats = this_cpu_ptr(tunnel->dev->tstats);
422 u64_stats_update_begin(&tstats->syncp);
407 tstats->rx_packets++; 423 tstats->rx_packets++;
408 tstats->rx_bytes += skb->len; 424 tstats->rx_bytes += skb->len;
425 u64_stats_update_end(&tstats->syncp);
409 426
410 __skb_tunnel_rx(skb, tunnel->dev); 427 __skb_tunnel_rx(skb, tunnel->dev);
411 428
@@ -730,7 +747,7 @@ static const struct net_device_ops ipip_netdev_ops = {
730 .ndo_start_xmit = ipip_tunnel_xmit, 747 .ndo_start_xmit = ipip_tunnel_xmit,
731 .ndo_do_ioctl = ipip_tunnel_ioctl, 748 .ndo_do_ioctl = ipip_tunnel_ioctl,
732 .ndo_change_mtu = ipip_tunnel_change_mtu, 749 .ndo_change_mtu = ipip_tunnel_change_mtu,
733 .ndo_get_stats = ipip_get_stats, 750 .ndo_get_stats64 = ipip_get_stats64,
734}; 751};
735 752
736static void ipip_dev_free(struct net_device *dev) 753static void ipip_dev_free(struct net_device *dev)
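
Note: the ipip.c hunk above widens the per-cpu tunnel counters to u64 and protects them with a u64_stats_sync, so 32-bit hosts can read consistent 64-bit totals without taking a lock. A minimal sketch of the writer/reader pairing, using the _bh fetch variants that appear in the hunk; the two helper functions are illustrative, not part of the patch.

#include <linux/percpu.h>
#include <linux/u64_stats_sync.h>

struct pcpu_tstats {
	u64 rx_packets;
	u64 rx_bytes;
	struct u64_stats_sync syncp;
};

/* Writer side: runs on the local CPU in softirq context. */
static void count_rx(struct pcpu_tstats *tstats, unsigned int len)
{
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += len;
	u64_stats_update_end(&tstats->syncp);
}

/* Reader side: retry until a consistent 64-bit snapshot is observed. */
static void read_rx(const struct pcpu_tstats *tstats, u64 *pkts, u64 *bytes)
{
	unsigned int start;

	do {
		start = u64_stats_fetch_begin_bh(&tstats->syncp);
		*pkts = tstats->rx_packets;
		*bytes = tstats->rx_bytes;
	} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
}
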
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 960fbfc3e976..a9e519ad6db5 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -949,8 +949,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
949 ret = sock_queue_rcv_skb(mroute_sk, skb); 949 ret = sock_queue_rcv_skb(mroute_sk, skb);
950 rcu_read_unlock(); 950 rcu_read_unlock();
951 if (ret < 0) { 951 if (ret < 0) {
952 if (net_ratelimit()) 952 net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
953 pr_warn("mroute: pending queue full, dropping entries\n");
954 kfree_skb(skb); 953 kfree_skb(skb);
955 } 954 }
956 955
@@ -2119,15 +2118,16 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2119 rtm->rtm_src_len = 32; 2118 rtm->rtm_src_len = 32;
2120 rtm->rtm_tos = 0; 2119 rtm->rtm_tos = 0;
2121 rtm->rtm_table = mrt->id; 2120 rtm->rtm_table = mrt->id;
2122 NLA_PUT_U32(skb, RTA_TABLE, mrt->id); 2121 if (nla_put_u32(skb, RTA_TABLE, mrt->id))
2122 goto nla_put_failure;
2123 rtm->rtm_type = RTN_MULTICAST; 2123 rtm->rtm_type = RTN_MULTICAST;
2124 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 2124 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2125 rtm->rtm_protocol = RTPROT_UNSPEC; 2125 rtm->rtm_protocol = RTPROT_UNSPEC;
2126 rtm->rtm_flags = 0; 2126 rtm->rtm_flags = 0;
2127 2127
2128 NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin); 2128 if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) ||
2129 NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp); 2129 nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp))
2130 2130 goto nla_put_failure;
2131 if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0) 2131 if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2132 goto nla_put_failure; 2132 goto nla_put_failure;
2133 2133
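
Note: the ipmr.c hunk above follows a pattern applied to several files below: the NLA_PUT_* macros, which hid a goto to nla_put_failure inside the macro body, are replaced with explicit nla_put_*() calls whose return values are checked. A minimal sketch of the resulting idiom; the surrounding function and its arguments are hypothetical.

#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>
#include <net/netlink.h>

/* Hypothetical attribute-fill helper showing the checked nla_put_*() idiom. */
static int fill_route_attrs(struct sk_buff *skb, u32 table, __be32 origin,
			    __be32 group)
{
	if (nla_put_u32(skb, RTA_TABLE, table) ||
	    nla_put_be32(skb, RTA_SRC, origin) ||
	    nla_put_be32(skb, RTA_DST, group))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
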
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4f47e064e262..ed1b36783192 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -12,7 +12,7 @@
12#include <net/netfilter/nf_queue.h> 12#include <net/netfilter/nf_queue.h>
13 13
14/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ 14/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
15int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) 15int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
16{ 16{
17 struct net *net = dev_net(skb_dst(skb)->dev); 17 struct net *net = dev_net(skb_dst(skb)->dev);
18 const struct iphdr *iph = ip_hdr(skb); 18 const struct iphdr *iph = ip_hdr(skb);
@@ -237,13 +237,3 @@ static void ipv4_netfilter_fini(void)
237 237
238module_init(ipv4_netfilter_init); 238module_init(ipv4_netfilter_init);
239module_exit(ipv4_netfilter_fini); 239module_exit(ipv4_netfilter_fini);
240
241#ifdef CONFIG_SYSCTL
242struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
243 { .procname = "net", },
244 { .procname = "ipv4", },
245 { .procname = "netfilter", },
246 { }
247};
248EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
249#endif /* CONFIG_SYSCTL */
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 240b68469a7a..c20674dc9452 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -66,6 +66,3 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
66 66
67# just filtering instance of ARP tables for now 67# just filtering instance of ARP tables for now
68obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o 68obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
69
70obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
71
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index fd7a3f68917f..97e61eadf580 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -221,9 +221,8 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
221static unsigned int 221static unsigned int
222arpt_error(struct sk_buff *skb, const struct xt_action_param *par) 222arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
223{ 223{
224 if (net_ratelimit()) 224 net_err_ratelimited("arp_tables: error: '%s'\n",
225 pr_err("arp_tables: error: '%s'\n", 225 (const char *)par->targinfo);
226 (const char *)par->targinfo);
227 226
228 return NF_DROP; 227 return NF_DROP;
229} 228}
@@ -303,7 +302,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
303 if (v < 0) { 302 if (v < 0) {
304 /* Pop from stack? */ 303 /* Pop from stack? */
305 if (v != XT_RETURN) { 304 if (v != XT_RETURN) {
306 verdict = (unsigned)(-v) - 1; 305 verdict = (unsigned int)(-v) - 1;
307 break; 306 break;
308 } 307 }
309 e = back; 308 e = back;
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
deleted file mode 100644
index 94d45e1f8882..000000000000
--- a/net/ipv4/netfilter/ip_queue.c
+++ /dev/null
@@ -1,639 +0,0 @@
1/*
2 * This is a module which is used for queueing IPv4 packets and
3 * communicating with userspace via netlink.
4 *
5 * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
6 * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/init.h>
15#include <linux/ip.h>
16#include <linux/notifier.h>
17#include <linux/netdevice.h>
18#include <linux/netfilter.h>
19#include <linux/netfilter_ipv4/ip_queue.h>
20#include <linux/netfilter_ipv4/ip_tables.h>
21#include <linux/netlink.h>
22#include <linux/spinlock.h>
23#include <linux/sysctl.h>
24#include <linux/proc_fs.h>
25#include <linux/seq_file.h>
26#include <linux/security.h>
27#include <linux/net.h>
28#include <linux/mutex.h>
29#include <linux/slab.h>
30#include <net/net_namespace.h>
31#include <net/sock.h>
32#include <net/route.h>
33#include <net/netfilter/nf_queue.h>
34#include <net/ip.h>
35
36#define IPQ_QMAX_DEFAULT 1024
37#define IPQ_PROC_FS_NAME "ip_queue"
38#define NET_IPQ_QMAX 2088
39#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
40
41typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
42
43static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
44static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
45static DEFINE_SPINLOCK(queue_lock);
46static int peer_pid __read_mostly;
47static unsigned int copy_range __read_mostly;
48static unsigned int queue_total;
49static unsigned int queue_dropped = 0;
50static unsigned int queue_user_dropped = 0;
51static struct sock *ipqnl __read_mostly;
52static LIST_HEAD(queue_list);
53static DEFINE_MUTEX(ipqnl_mutex);
54
55static inline void
56__ipq_enqueue_entry(struct nf_queue_entry *entry)
57{
58 list_add_tail(&entry->list, &queue_list);
59 queue_total++;
60}
61
62static inline int
63__ipq_set_mode(unsigned char mode, unsigned int range)
64{
65 int status = 0;
66
67 switch(mode) {
68 case IPQ_COPY_NONE:
69 case IPQ_COPY_META:
70 copy_mode = mode;
71 copy_range = 0;
72 break;
73
74 case IPQ_COPY_PACKET:
75 if (range > 0xFFFF)
76 range = 0xFFFF;
77 copy_range = range;
78 copy_mode = mode;
79 break;
80
81 default:
82 status = -EINVAL;
83
84 }
85 return status;
86}
87
88static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
89
90static inline void
91__ipq_reset(void)
92{
93 peer_pid = 0;
94 net_disable_timestamp();
95 __ipq_set_mode(IPQ_COPY_NONE, 0);
96 __ipq_flush(NULL, 0);
97}
98
99static struct nf_queue_entry *
100ipq_find_dequeue_entry(unsigned long id)
101{
102 struct nf_queue_entry *entry = NULL, *i;
103
104 spin_lock_bh(&queue_lock);
105
106 list_for_each_entry(i, &queue_list, list) {
107 if ((unsigned long)i == id) {
108 entry = i;
109 break;
110 }
111 }
112
113 if (entry) {
114 list_del(&entry->list);
115 queue_total--;
116 }
117
118 spin_unlock_bh(&queue_lock);
119 return entry;
120}
121
122static void
123__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
124{
125 struct nf_queue_entry *entry, *next;
126
127 list_for_each_entry_safe(entry, next, &queue_list, list) {
128 if (!cmpfn || cmpfn(entry, data)) {
129 list_del(&entry->list);
130 queue_total--;
131 nf_reinject(entry, NF_DROP);
132 }
133 }
134}
135
136static void
137ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
138{
139 spin_lock_bh(&queue_lock);
140 __ipq_flush(cmpfn, data);
141 spin_unlock_bh(&queue_lock);
142}
143
144static struct sk_buff *
145ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
146{
147 sk_buff_data_t old_tail;
148 size_t size = 0;
149 size_t data_len = 0;
150 struct sk_buff *skb;
151 struct ipq_packet_msg *pmsg;
152 struct nlmsghdr *nlh;
153 struct timeval tv;
154
155 switch (ACCESS_ONCE(copy_mode)) {
156 case IPQ_COPY_META:
157 case IPQ_COPY_NONE:
158 size = NLMSG_SPACE(sizeof(*pmsg));
159 break;
160
161 case IPQ_COPY_PACKET:
162 if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
163 (*errp = skb_checksum_help(entry->skb)))
164 return NULL;
165
166 data_len = ACCESS_ONCE(copy_range);
167 if (data_len == 0 || data_len > entry->skb->len)
168 data_len = entry->skb->len;
169
170 size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
171 break;
172
173 default:
174 *errp = -EINVAL;
175 return NULL;
176 }
177
178 skb = alloc_skb(size, GFP_ATOMIC);
179 if (!skb)
180 goto nlmsg_failure;
181
182 old_tail = skb->tail;
183 nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
184 pmsg = NLMSG_DATA(nlh);
185 memset(pmsg, 0, sizeof(*pmsg));
186
187 pmsg->packet_id = (unsigned long )entry;
188 pmsg->data_len = data_len;
189 tv = ktime_to_timeval(entry->skb->tstamp);
190 pmsg->timestamp_sec = tv.tv_sec;
191 pmsg->timestamp_usec = tv.tv_usec;
192 pmsg->mark = entry->skb->mark;
193 pmsg->hook = entry->hook;
194 pmsg->hw_protocol = entry->skb->protocol;
195
196 if (entry->indev)
197 strcpy(pmsg->indev_name, entry->indev->name);
198 else
199 pmsg->indev_name[0] = '\0';
200
201 if (entry->outdev)
202 strcpy(pmsg->outdev_name, entry->outdev->name);
203 else
204 pmsg->outdev_name[0] = '\0';
205
206 if (entry->indev && entry->skb->dev &&
207 entry->skb->mac_header != entry->skb->network_header) {
208 pmsg->hw_type = entry->skb->dev->type;
209 pmsg->hw_addrlen = dev_parse_header(entry->skb,
210 pmsg->hw_addr);
211 }
212
213 if (data_len)
214 if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
215 BUG();
216
217 nlh->nlmsg_len = skb->tail - old_tail;
218 return skb;
219
220nlmsg_failure:
221 kfree_skb(skb);
222 *errp = -EINVAL;
223 printk(KERN_ERR "ip_queue: error creating packet message\n");
224 return NULL;
225}
226
227static int
228ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
229{
230 int status = -EINVAL;
231 struct sk_buff *nskb;
232
233 if (copy_mode == IPQ_COPY_NONE)
234 return -EAGAIN;
235
236 nskb = ipq_build_packet_message(entry, &status);
237 if (nskb == NULL)
238 return status;
239
240 spin_lock_bh(&queue_lock);
241
242 if (!peer_pid)
243 goto err_out_free_nskb;
244
245 if (queue_total >= queue_maxlen) {
246 queue_dropped++;
247 status = -ENOSPC;
248 if (net_ratelimit())
249 printk (KERN_WARNING "ip_queue: full at %d entries, "
250 "dropping packets(s). Dropped: %d\n", queue_total,
251 queue_dropped);
252 goto err_out_free_nskb;
253 }
254
255 /* netlink_unicast will either free the nskb or attach it to a socket */
256 status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
257 if (status < 0) {
258 queue_user_dropped++;
259 goto err_out_unlock;
260 }
261
262 __ipq_enqueue_entry(entry);
263
264 spin_unlock_bh(&queue_lock);
265 return status;
266
267err_out_free_nskb:
268 kfree_skb(nskb);
269
270err_out_unlock:
271 spin_unlock_bh(&queue_lock);
272 return status;
273}
274
275static int
276ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
277{
278 int diff;
279 struct iphdr *user_iph = (struct iphdr *)v->payload;
280 struct sk_buff *nskb;
281
282 if (v->data_len < sizeof(*user_iph))
283 return 0;
284 diff = v->data_len - e->skb->len;
285 if (diff < 0) {
286 if (pskb_trim(e->skb, v->data_len))
287 return -ENOMEM;
288 } else if (diff > 0) {
289 if (v->data_len > 0xFFFF)
290 return -EINVAL;
291 if (diff > skb_tailroom(e->skb)) {
292 nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
293 diff, GFP_ATOMIC);
294 if (!nskb) {
295 printk(KERN_WARNING "ip_queue: error "
296 "in mangle, dropping packet\n");
297 return -ENOMEM;
298 }
299 kfree_skb(e->skb);
300 e->skb = nskb;
301 }
302 skb_put(e->skb, diff);
303 }
304 if (!skb_make_writable(e->skb, v->data_len))
305 return -ENOMEM;
306 skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
307 e->skb->ip_summed = CHECKSUM_NONE;
308
309 return 0;
310}
311
312static int
313ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
314{
315 struct nf_queue_entry *entry;
316
317 if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN)
318 return -EINVAL;
319
320 entry = ipq_find_dequeue_entry(vmsg->id);
321 if (entry == NULL)
322 return -ENOENT;
323 else {
324 int verdict = vmsg->value;
325
326 if (vmsg->data_len && vmsg->data_len == len)
327 if (ipq_mangle_ipv4(vmsg, entry) < 0)
328 verdict = NF_DROP;
329
330 nf_reinject(entry, verdict);
331 return 0;
332 }
333}
334
335static int
336ipq_set_mode(unsigned char mode, unsigned int range)
337{
338 int status;
339
340 spin_lock_bh(&queue_lock);
341 status = __ipq_set_mode(mode, range);
342 spin_unlock_bh(&queue_lock);
343 return status;
344}
345
346static int
347ipq_receive_peer(struct ipq_peer_msg *pmsg,
348 unsigned char type, unsigned int len)
349{
350 int status = 0;
351
352 if (len < sizeof(*pmsg))
353 return -EINVAL;
354
355 switch (type) {
356 case IPQM_MODE:
357 status = ipq_set_mode(pmsg->msg.mode.value,
358 pmsg->msg.mode.range);
359 break;
360
361 case IPQM_VERDICT:
362 status = ipq_set_verdict(&pmsg->msg.verdict,
363 len - sizeof(*pmsg));
364 break;
365 default:
366 status = -EINVAL;
367 }
368 return status;
369}
370
371static int
372dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
373{
374 if (entry->indev)
375 if (entry->indev->ifindex == ifindex)
376 return 1;
377 if (entry->outdev)
378 if (entry->outdev->ifindex == ifindex)
379 return 1;
380#ifdef CONFIG_BRIDGE_NETFILTER
381 if (entry->skb->nf_bridge) {
382 if (entry->skb->nf_bridge->physindev &&
383 entry->skb->nf_bridge->physindev->ifindex == ifindex)
384 return 1;
385 if (entry->skb->nf_bridge->physoutdev &&
386 entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
387 return 1;
388 }
389#endif
390 return 0;
391}
392
393static void
394ipq_dev_drop(int ifindex)
395{
396 ipq_flush(dev_cmp, ifindex);
397}
398
399#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
400
401static inline void
402__ipq_rcv_skb(struct sk_buff *skb)
403{
404 int status, type, pid, flags;
405 unsigned int nlmsglen, skblen;
406 struct nlmsghdr *nlh;
407 bool enable_timestamp = false;
408
409 skblen = skb->len;
410 if (skblen < sizeof(*nlh))
411 return;
412
413 nlh = nlmsg_hdr(skb);
414 nlmsglen = nlh->nlmsg_len;
415 if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
416 return;
417
418 pid = nlh->nlmsg_pid;
419 flags = nlh->nlmsg_flags;
420
421 if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
422 RCV_SKB_FAIL(-EINVAL);
423
424 if (flags & MSG_TRUNC)
425 RCV_SKB_FAIL(-ECOMM);
426
427 type = nlh->nlmsg_type;
428 if (type < NLMSG_NOOP || type >= IPQM_MAX)
429 RCV_SKB_FAIL(-EINVAL);
430
431 if (type <= IPQM_BASE)
432 return;
433
434 if (!capable(CAP_NET_ADMIN))
435 RCV_SKB_FAIL(-EPERM);
436
437 spin_lock_bh(&queue_lock);
438
439 if (peer_pid) {
440 if (peer_pid != pid) {
441 spin_unlock_bh(&queue_lock);
442 RCV_SKB_FAIL(-EBUSY);
443 }
444 } else {
445 enable_timestamp = true;
446 peer_pid = pid;
447 }
448
449 spin_unlock_bh(&queue_lock);
450 if (enable_timestamp)
451 net_enable_timestamp();
452 status = ipq_receive_peer(NLMSG_DATA(nlh), type,
453 nlmsglen - NLMSG_LENGTH(0));
454 if (status < 0)
455 RCV_SKB_FAIL(status);
456
457 if (flags & NLM_F_ACK)
458 netlink_ack(skb, nlh, 0);
459}
460
461static void
462ipq_rcv_skb(struct sk_buff *skb)
463{
464 mutex_lock(&ipqnl_mutex);
465 __ipq_rcv_skb(skb);
466 mutex_unlock(&ipqnl_mutex);
467}
468
469static int
470ipq_rcv_dev_event(struct notifier_block *this,
471 unsigned long event, void *ptr)
472{
473 struct net_device *dev = ptr;
474
475 if (!net_eq(dev_net(dev), &init_net))
476 return NOTIFY_DONE;
477
478 /* Drop any packets associated with the downed device */
479 if (event == NETDEV_DOWN)
480 ipq_dev_drop(dev->ifindex);
481 return NOTIFY_DONE;
482}
483
484static struct notifier_block ipq_dev_notifier = {
485 .notifier_call = ipq_rcv_dev_event,
486};
487
488static int
489ipq_rcv_nl_event(struct notifier_block *this,
490 unsigned long event, void *ptr)
491{
492 struct netlink_notify *n = ptr;
493
494 if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
495 spin_lock_bh(&queue_lock);
496 if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
497 __ipq_reset();
498 spin_unlock_bh(&queue_lock);
499 }
500 return NOTIFY_DONE;
501}
502
503static struct notifier_block ipq_nl_notifier = {
504 .notifier_call = ipq_rcv_nl_event,
505};
506
507#ifdef CONFIG_SYSCTL
508static struct ctl_table_header *ipq_sysctl_header;
509
510static ctl_table ipq_table[] = {
511 {
512 .procname = NET_IPQ_QMAX_NAME,
513 .data = &queue_maxlen,
514 .maxlen = sizeof(queue_maxlen),
515 .mode = 0644,
516 .proc_handler = proc_dointvec
517 },
518 { }
519};
520#endif
521
522#ifdef CONFIG_PROC_FS
523static int ip_queue_show(struct seq_file *m, void *v)
524{
525 spin_lock_bh(&queue_lock);
526
527 seq_printf(m,
528 "Peer PID : %d\n"
529 "Copy mode : %hu\n"
530 "Copy range : %u\n"
531 "Queue length : %u\n"
532 "Queue max. length : %u\n"
533 "Queue dropped : %u\n"
534 "Netlink dropped : %u\n",
535 peer_pid,
536 copy_mode,
537 copy_range,
538 queue_total,
539 queue_maxlen,
540 queue_dropped,
541 queue_user_dropped);
542
543 spin_unlock_bh(&queue_lock);
544 return 0;
545}
546
547static int ip_queue_open(struct inode *inode, struct file *file)
548{
549 return single_open(file, ip_queue_show, NULL);
550}
551
552static const struct file_operations ip_queue_proc_fops = {
553 .open = ip_queue_open,
554 .read = seq_read,
555 .llseek = seq_lseek,
556 .release = single_release,
557 .owner = THIS_MODULE,
558};
559#endif
560
561static const struct nf_queue_handler nfqh = {
562 .name = "ip_queue",
563 .outfn = &ipq_enqueue_packet,
564};
565
566static int __init ip_queue_init(void)
567{
568 int status = -ENOMEM;
569 struct proc_dir_entry *proc __maybe_unused;
570
571 netlink_register_notifier(&ipq_nl_notifier);
572 ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0,
573 ipq_rcv_skb, NULL, THIS_MODULE);
574 if (ipqnl == NULL) {
575 printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
576 goto cleanup_netlink_notifier;
577 }
578
579#ifdef CONFIG_PROC_FS
580 proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
581 &ip_queue_proc_fops);
582 if (!proc) {
583 printk(KERN_ERR "ip_queue: failed to create proc entry\n");
584 goto cleanup_ipqnl;
585 }
586#endif
587 register_netdevice_notifier(&ipq_dev_notifier);
588#ifdef CONFIG_SYSCTL
589 ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table);
590#endif
591 status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh);
592 if (status < 0) {
593 printk(KERN_ERR "ip_queue: failed to register queue handler\n");
594 goto cleanup_sysctl;
595 }
596 return status;
597
598cleanup_sysctl:
599#ifdef CONFIG_SYSCTL
600 unregister_sysctl_table(ipq_sysctl_header);
601#endif
602 unregister_netdevice_notifier(&ipq_dev_notifier);
603 proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
604cleanup_ipqnl: __maybe_unused
605 netlink_kernel_release(ipqnl);
606 mutex_lock(&ipqnl_mutex);
607 mutex_unlock(&ipqnl_mutex);
608
609cleanup_netlink_notifier:
610 netlink_unregister_notifier(&ipq_nl_notifier);
611 return status;
612}
613
614static void __exit ip_queue_fini(void)
615{
616 nf_unregister_queue_handlers(&nfqh);
617
618 ipq_flush(NULL, 0);
619
620#ifdef CONFIG_SYSCTL
621 unregister_sysctl_table(ipq_sysctl_header);
622#endif
623 unregister_netdevice_notifier(&ipq_dev_notifier);
624 proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
625
626 netlink_kernel_release(ipqnl);
627 mutex_lock(&ipqnl_mutex);
628 mutex_unlock(&ipqnl_mutex);
629
630 netlink_unregister_notifier(&ipq_nl_notifier);
631}
632
633MODULE_DESCRIPTION("IPv4 packet queue handler");
634MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
635MODULE_LICENSE("GPL");
636MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL);
637
638module_init(ip_queue_init);
639module_exit(ip_queue_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 24e556e83a3b..170b1fdd6b72 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -153,8 +153,7 @@ ip_checkentry(const struct ipt_ip *ip)
153static unsigned int 153static unsigned int
154ipt_error(struct sk_buff *skb, const struct xt_action_param *par) 154ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
155{ 155{
156 if (net_ratelimit()) 156 net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
157 pr_info("error: `%s'\n", (const char *)par->targinfo);
158 157
159 return NF_DROP; 158 return NF_DROP;
160} 159}
@@ -377,7 +376,7 @@ ipt_do_table(struct sk_buff *skb,
377 if (v < 0) { 376 if (v < 0) {
378 /* Pop from stack? */ 377 /* Pop from stack? */
379 if (v != XT_RETURN) { 378 if (v != XT_RETURN) {
380 verdict = (unsigned)(-v) - 1; 379 verdict = (unsigned int)(-v) - 1;
381 break; 380 break;
382 } 381 }
383 if (*stackptr <= origptr) { 382 if (*stackptr <= origptr) {
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index a639967eb727..fe5daea5214d 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -246,8 +246,7 @@ clusterip_hashfn(const struct sk_buff *skb,
246 dport = ports[1]; 246 dport = ports[1];
247 } 247 }
248 } else { 248 } else {
249 if (net_ratelimit()) 249 net_info_ratelimited("unknown protocol %u\n", iph->protocol);
250 pr_info("unknown protocol %u\n", iph->protocol);
251 } 250 }
252 251
253 switch (config->hash_mode) { 252 switch (config->hash_mode) {
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index de9da21113a1..91747d4ebc26 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -74,16 +74,24 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
74 74
75 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 75 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
76 if (iph == NULL) 76 if (iph == NULL)
77 return -NF_DROP; 77 return -NF_ACCEPT;
78 78
79 /* Conntrack defragments packets, we might still see fragments 79 /* Conntrack defragments packets, we might still see fragments
80 * inside ICMP packets though. */ 80 * inside ICMP packets though. */
81 if (iph->frag_off & htons(IP_OFFSET)) 81 if (iph->frag_off & htons(IP_OFFSET))
82 return -NF_DROP; 82 return -NF_ACCEPT;
83 83
84 *dataoff = nhoff + (iph->ihl << 2); 84 *dataoff = nhoff + (iph->ihl << 2);
85 *protonum = iph->protocol; 85 *protonum = iph->protocol;
86 86
87 /* Check bogus IP headers */
88 if (*dataoff > skb->len) {
89 pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
90 "nhoff %u, ihl %u, skblen %u\n",
91 nhoff, iph->ihl << 2, skb->len);
92 return -NF_ACCEPT;
93 }
94
87 return NF_ACCEPT; 95 return NF_ACCEPT;
88} 96}
89 97
@@ -303,8 +311,9 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
303static int ipv4_tuple_to_nlattr(struct sk_buff *skb, 311static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
304 const struct nf_conntrack_tuple *tuple) 312 const struct nf_conntrack_tuple *tuple)
305{ 313{
306 NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip); 314 if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
307 NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip); 315 nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
316 goto nla_put_failure;
308 return 0; 317 return 0;
309 318
310nla_put_failure: 319nla_put_failure:
@@ -356,7 +365,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
356 .nla_policy = ipv4_nla_policy, 365 .nla_policy = ipv4_nla_policy,
357#endif 366#endif
358#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 367#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
359 .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path, 368 .ctl_table_path = "net/ipv4/netfilter",
360 .ctl_table = ip_ct_sysctl_table, 369 .ctl_table = ip_ct_sysctl_table,
361#endif 370#endif
362 .me = THIS_MODULE, 371 .me = THIS_MODULE,
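
Note: the nf_conntrack_l3proto_ipv4.c hunk above changes the verdict for unparsable headers from drop to accept and adds a check that the claimed header length does not run past the end of the skb. A condensed sketch of the resulting logic, with names following the hunk; this is not the full function and omits the debug printout.

#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <net/ip.h>

static int example_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			       unsigned int *dataoff, u_int8_t *protonum)
{
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph || iph->frag_off & htons(IP_OFFSET))
		return -NF_ACCEPT;	/* cannot track it, but do not drop it */

	*dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	if (*dataoff > skb->len)	/* bogus ihl: ignore rather than drop */
		return -NF_ACCEPT;

	return NF_ACCEPT;
}
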
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 7cbe9cb261c2..0847e373d33c 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -228,10 +228,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
228static int icmp_tuple_to_nlattr(struct sk_buff *skb, 228static int icmp_tuple_to_nlattr(struct sk_buff *skb,
229 const struct nf_conntrack_tuple *t) 229 const struct nf_conntrack_tuple *t)
230{ 230{
231 NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id); 231 if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) ||
232 NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type); 232 nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) ||
233 NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code); 233 nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code))
234 234 goto nla_put_failure;
235 return 0; 235 return 0;
236 236
237nla_put_failure: 237nla_put_failure:
@@ -293,8 +293,8 @@ icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
293{ 293{
294 const unsigned int *timeout = data; 294 const unsigned int *timeout = data;
295 295
296 NLA_PUT_BE32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)); 296 if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
297 297 goto nla_put_failure;
298 return 0; 298 return 0;
299 299
300nla_put_failure: 300nla_put_failure:
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 82536701e3a3..cad29c121318 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -42,9 +42,7 @@ static int set_addr(struct sk_buff *skb,
42 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 42 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
43 addroff, sizeof(buf), 43 addroff, sizeof(buf),
44 (char *) &buf, sizeof(buf))) { 44 (char *) &buf, sizeof(buf))) {
45 if (net_ratelimit()) 45 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
46 pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
47 " error\n");
48 return -1; 46 return -1;
49 } 47 }
50 48
@@ -58,9 +56,7 @@ static int set_addr(struct sk_buff *skb,
58 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, 56 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
59 addroff, sizeof(buf), 57 addroff, sizeof(buf),
60 (char *) &buf, sizeof(buf))) { 58 (char *) &buf, sizeof(buf))) {
61 if (net_ratelimit()) 59 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
62 pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
63 " error\n");
64 return -1; 60 return -1;
65 } 61 }
66 /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy 62 /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
@@ -214,8 +210,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
214 210
215 /* Run out of expectations */ 211 /* Run out of expectations */
216 if (i >= H323_RTP_CHANNEL_MAX) { 212 if (i >= H323_RTP_CHANNEL_MAX) {
217 if (net_ratelimit()) 213 net_notice_ratelimited("nf_nat_h323: out of expectations\n");
218 pr_notice("nf_nat_h323: out of expectations\n");
219 return 0; 214 return 0;
220 } 215 }
221 216
@@ -244,8 +239,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
244 } 239 }
245 240
246 if (nated_port == 0) { /* No port available */ 241 if (nated_port == 0) { /* No port available */
247 if (net_ratelimit()) 242 net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
248 pr_notice("nf_nat_h323: out of RTP ports\n");
249 return 0; 243 return 0;
250 } 244 }
251 245
@@ -308,8 +302,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
308 } 302 }
309 303
310 if (nated_port == 0) { /* No port available */ 304 if (nated_port == 0) { /* No port available */
311 if (net_ratelimit()) 305 net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
312 pr_notice("nf_nat_h323: out of TCP ports\n");
313 return 0; 306 return 0;
314 } 307 }
315 308
@@ -365,8 +358,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
365 } 358 }
366 359
367 if (nated_port == 0) { /* No port available */ 360 if (nated_port == 0) { /* No port available */
368 if (net_ratelimit()) 361 net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
369 pr_notice("nf_nat_q931: out of TCP ports\n");
370 return 0; 362 return 0;
371 } 363 }
372 364
@@ -456,8 +448,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
456 } 448 }
457 449
458 if (nated_port == 0) { /* No port available */ 450 if (nated_port == 0) { /* No port available */
459 if (net_ratelimit()) 451 net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
460 pr_notice("nf_nat_ras: out of TCP ports\n");
461 return 0; 452 return 0;
462 } 453 }
463 454
@@ -545,8 +536,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
545 } 536 }
546 537
547 if (nated_port == 0) { /* No port available */ 538 if (nated_port == 0) { /* No port available */
548 if (net_ratelimit()) 539 net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
549 pr_notice("nf_nat_q931: out of TCP ports\n");
550 return 0; 540 return 0;
551 } 541 }
552 542
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 57932c43960e..ea4a23813d26 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -283,7 +283,7 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
283 __be32 newip; 283 __be32 newip;
284 u_int16_t port; 284 u_int16_t port;
285 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; 285 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
286 unsigned buflen; 286 unsigned int buflen;
287 287
288 /* Connection will come from reply */ 288 /* Connection will come from reply */
289 if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip) 289 if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 2133c30a4a5f..746edec8b86e 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1206,8 +1206,7 @@ static int snmp_translate(struct nf_conn *ct,
1206 1206
1207 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), 1207 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
1208 paylen, &map, &udph->check)) { 1208 paylen, &map, &udph->check)) {
1209 if (net_ratelimit()) 1209 net_warn_ratelimited("bsalg: parser failed\n");
1210 printk(KERN_WARNING "bsalg: parser failed\n");
1211 return NF_DROP; 1210 return NF_DROP;
1212 } 1211 }
1213 return NF_ACCEPT; 1212 return NF_ACCEPT;
@@ -1241,9 +1240,8 @@ static int help(struct sk_buff *skb, unsigned int protoff,
1241 * can mess around with the payload. 1240 * can mess around with the payload.
1242 */ 1241 */
1243 if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { 1242 if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
1244 if (net_ratelimit()) 1243 net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
1245 printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", 1244 &iph->saddr, &iph->daddr);
1246 &iph->saddr, &iph->daddr);
1247 return NF_DROP; 1245 return NF_DROP;
1248 } 1246 }
1249 1247
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 50009c787bcd..6e930c7174dd 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -51,15 +51,16 @@ static struct ping_table ping_table;
51 51
52static u16 ping_port_rover; 52static u16 ping_port_rover;
53 53
54static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask) 54static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask)
55{ 55{
56 int res = (num + net_hash_mix(net)) & mask; 56 int res = (num + net_hash_mix(net)) & mask;
57
57 pr_debug("hash(%d) = %d\n", num, res); 58 pr_debug("hash(%d) = %d\n", num, res);
58 return res; 59 return res;
59} 60}
60 61
61static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, 62static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
62 struct net *net, unsigned num) 63 struct net *net, unsigned int num)
63{ 64{
64 return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; 65 return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
65} 66}
@@ -188,7 +189,8 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
188 gid_t *high) 189 gid_t *high)
189{ 190{
190 gid_t *data = net->ipv4.sysctl_ping_group_range; 191 gid_t *data = net->ipv4.sysctl_ping_group_range;
191 unsigned seq; 192 unsigned int seq;
193
192 do { 194 do {
193 seq = read_seqbegin(&sysctl_local_ports.lock); 195 seq = read_seqbegin(&sysctl_local_ports.lock);
194 196
@@ -410,7 +412,7 @@ struct pingfakehdr {
410 __wsum wcheck; 412 __wsum wcheck;
411}; 413};
412 414
413static int ping_getfrag(void *from, char * to, 415static int ping_getfrag(void *from, char *to,
414 int offset, int fraglen, int odd, struct sk_buff *skb) 416 int offset, int fraglen, int odd, struct sk_buff *skb)
415{ 417{
416 struct pingfakehdr *pfh = (struct pingfakehdr *)from; 418 struct pingfakehdr *pfh = (struct pingfakehdr *)from;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bbd604c68e68..4032b818f3e4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -288,7 +288,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
288 read_unlock(&raw_v4_hashinfo.lock); 288 read_unlock(&raw_v4_hashinfo.lock);
289} 289}
290 290
291static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) 291static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
292{ 292{
293 /* Charge it to the socket. */ 293 /* Charge it to the socket. */
294 294
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4dc1c104c942..ffcb3b016843 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,6 +109,7 @@
109#include <net/rtnetlink.h> 109#include <net/rtnetlink.h>
110#ifdef CONFIG_SYSCTL 110#ifdef CONFIG_SYSCTL
111#include <linux/sysctl.h> 111#include <linux/sysctl.h>
112#include <linux/kmemleak.h>
112#endif 113#endif
113#include <net/secure_seq.h> 114#include <net/secure_seq.h>
114 115
@@ -229,7 +230,7 @@ const __u8 ip_tos2prio[16] = {
229 TC_PRIO_INTERACTIVE_BULK, 230 TC_PRIO_INTERACTIVE_BULK,
230 ECN_OR_COST(INTERACTIVE_BULK) 231 ECN_OR_COST(INTERACTIVE_BULK)
231}; 232};
232 233EXPORT_SYMBOL(ip_tos2prio);
233 234
234/* 235/*
235 * Route cache. 236 * Route cache.
@@ -296,7 +297,7 @@ static inline void rt_hash_lock_init(void)
296#endif 297#endif
297 298
298static struct rt_hash_bucket *rt_hash_table __read_mostly; 299static struct rt_hash_bucket *rt_hash_table __read_mostly;
299static unsigned rt_hash_mask __read_mostly; 300static unsigned int rt_hash_mask __read_mostly;
300static unsigned int rt_hash_log __read_mostly; 301static unsigned int rt_hash_log __read_mostly;
301 302
302static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 303static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
@@ -959,8 +960,7 @@ void rt_cache_flush_batch(struct net *net)
959 960
960static void rt_emergency_hash_rebuild(struct net *net) 961static void rt_emergency_hash_rebuild(struct net *net)
961{ 962{
962 if (net_ratelimit()) 963 net_warn_ratelimited("Route hash chain too long!\n");
963 pr_warn("Route hash chain too long!\n");
964 rt_cache_invalidate(net); 964 rt_cache_invalidate(net);
965} 965}
966 966
@@ -1083,8 +1083,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
1083 goto out; 1083 goto out;
1084 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) 1084 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085 goto out; 1085 goto out;
1086 if (net_ratelimit()) 1086 net_warn_ratelimited("dst cache overflow\n");
1087 pr_warn("dst cache overflow\n");
1088 RT_CACHE_STAT_INC(gc_dst_overflow); 1087 RT_CACHE_STAT_INC(gc_dst_overflow);
1089 return 1; 1088 return 1;
1090 1089
@@ -1143,7 +1142,7 @@ static int rt_bind_neighbour(struct rtable *rt)
1143 return 0; 1142 return 0;
1144} 1143}
1145 1144
1146static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, 1145static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1147 struct sk_buff *skb, int ifindex) 1146 struct sk_buff *skb, int ifindex)
1148{ 1147{
1149 struct rtable *rth, *cand; 1148 struct rtable *rth, *cand;
@@ -1181,8 +1180,7 @@ restart:
1181 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { 1180 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1182 int err = rt_bind_neighbour(rt); 1181 int err = rt_bind_neighbour(rt);
1183 if (err) { 1182 if (err) {
1184 if (net_ratelimit()) 1183 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1185 pr_warn("Neighbour table failure & not caching routes\n");
1186 ip_rt_put(rt); 1184 ip_rt_put(rt);
1187 return ERR_PTR(err); 1185 return ERR_PTR(err);
1188 } 1186 }
@@ -1298,8 +1296,7 @@ restart:
1298 goto restart; 1296 goto restart;
1299 } 1297 }
1300 1298
1301 if (net_ratelimit()) 1299 net_warn_ratelimited("Neighbour table overflow\n");
1302 pr_warn("Neighbour table overflow\n");
1303 rt_drop(rt); 1300 rt_drop(rt);
1304 return ERR_PTR(-ENOBUFS); 1301 return ERR_PTR(-ENOBUFS);
1305 } 1302 }
@@ -1377,14 +1374,13 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1377 return; 1374 return;
1378 } 1375 }
1379 } else if (!rt) 1376 } else if (!rt)
1380 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 1377 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1381 __builtin_return_address(0));
1382 1378
1383 ip_select_fb_ident(iph); 1379 ip_select_fb_ident(iph);
1384} 1380}
1385EXPORT_SYMBOL(__ip_select_ident); 1381EXPORT_SYMBOL(__ip_select_ident);
1386 1382
1387static void rt_del(unsigned hash, struct rtable *rt) 1383static void rt_del(unsigned int hash, struct rtable *rt)
1388{ 1384{
1389 struct rtable __rcu **rthp; 1385 struct rtable __rcu **rthp;
1390 struct rtable *aux; 1386 struct rtable *aux;
@@ -1502,11 +1498,11 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1502 1498
1503reject_redirect: 1499reject_redirect:
1504#ifdef CONFIG_IP_ROUTE_VERBOSE 1500#ifdef CONFIG_IP_ROUTE_VERBOSE
1505 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 1501 if (IN_DEV_LOG_MARTIANS(in_dev))
1506 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n" 1502 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1507 " Advised path = %pI4 -> %pI4\n", 1503 " Advised path = %pI4 -> %pI4\n",
1508 &old_gw, dev->name, &new_gw, 1504 &old_gw, dev->name, &new_gw,
1509 &saddr, &daddr); 1505 &saddr, &daddr);
1510#endif 1506#endif
1511 ; 1507 ;
1512} 1508}
@@ -1538,7 +1534,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1538 ip_rt_put(rt); 1534 ip_rt_put(rt);
1539 ret = NULL; 1535 ret = NULL;
1540 } else if (rt->rt_flags & RTCF_REDIRECTED) { 1536 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1541 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 1537 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1542 rt->rt_oif, 1538 rt->rt_oif,
1543 rt_genid(dev_net(dst->dev))); 1539 rt_genid(dev_net(dst->dev)));
1544 rt_del(hash, rt); 1540 rt_del(hash, rt);
@@ -1616,11 +1612,10 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1616 ++peer->rate_tokens; 1612 ++peer->rate_tokens;
1617#ifdef CONFIG_IP_ROUTE_VERBOSE 1613#ifdef CONFIG_IP_ROUTE_VERBOSE
1618 if (log_martians && 1614 if (log_martians &&
1619 peer->rate_tokens == ip_rt_redirect_number && 1615 peer->rate_tokens == ip_rt_redirect_number)
1620 net_ratelimit()) 1616 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1621 pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", 1617 &ip_hdr(skb)->saddr, rt->rt_iif,
1622 &ip_hdr(skb)->saddr, rt->rt_iif, 1618 &rt->rt_dst, &rt->rt_gateway);
1623 &rt->rt_dst, &rt->rt_gateway);
1624#endif 1619#endif
1625 } 1620 }
1626} 1621}
@@ -1843,9 +1838,9 @@ static void ipv4_link_failure(struct sk_buff *skb)
1843 1838
1844static int ip_rt_bug(struct sk_buff *skb) 1839static int ip_rt_bug(struct sk_buff *skb)
1845{ 1840{
1846 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", 1841 pr_debug("%s: %pI4 -> %pI4, %s\n",
1847 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1842 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1848 skb->dev ? skb->dev->name : "?"); 1843 skb->dev ? skb->dev->name : "?");
1849 kfree_skb(skb); 1844 kfree_skb(skb);
1850 WARN_ON(1); 1845 WARN_ON(1);
1851 return 0; 1846 return 0;
@@ -2041,7 +2036,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2041 if (err < 0) 2036 if (err < 0)
2042 goto e_err; 2037 goto e_err;
2043 } 2038 }
2044 rth = rt_dst_alloc(init_net.loopback_dev, 2039 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2045 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 2040 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2046 if (!rth) 2041 if (!rth)
2047 goto e_nobufs; 2042 goto e_nobufs;
@@ -2134,8 +2129,7 @@ static int __mkroute_input(struct sk_buff *skb,
2134 /* get a working reference to the output device */ 2129 /* get a working reference to the output device */
2135 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); 2130 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2136 if (out_dev == NULL) { 2131 if (out_dev == NULL) {
2137 if (net_ratelimit()) 2132 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2138 pr_crit("Bug in ip_route_input_slow(). Please report.\n");
2139 return -EINVAL; 2133 return -EINVAL;
2140 } 2134 }
2141 2135
@@ -2215,9 +2209,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
2215 struct in_device *in_dev, 2209 struct in_device *in_dev,
2216 __be32 daddr, __be32 saddr, u32 tos) 2210 __be32 daddr, __be32 saddr, u32 tos)
2217{ 2211{
2218 struct rtable* rth = NULL; 2212 struct rtable *rth = NULL;
2219 int err; 2213 int err;
2220 unsigned hash; 2214 unsigned int hash;
2221 2215
2222#ifdef CONFIG_IP_ROUTE_MULTIPATH 2216#ifdef CONFIG_IP_ROUTE_MULTIPATH
2223 if (res->fi && res->fi->fib_nhs > 1) 2217 if (res->fi && res->fi->fib_nhs > 1)
@@ -2255,13 +2249,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2255 struct fib_result res; 2249 struct fib_result res;
2256 struct in_device *in_dev = __in_dev_get_rcu(dev); 2250 struct in_device *in_dev = __in_dev_get_rcu(dev);
2257 struct flowi4 fl4; 2251 struct flowi4 fl4;
2258 unsigned flags = 0; 2252 unsigned int flags = 0;
2259 u32 itag = 0; 2253 u32 itag = 0;
2260 struct rtable * rth; 2254 struct rtable *rth;
2261 unsigned hash; 2255 unsigned int hash;
2262 __be32 spec_dst; 2256 __be32 spec_dst;
2263 int err = -EINVAL; 2257 int err = -EINVAL;
2264 struct net * net = dev_net(dev); 2258 struct net *net = dev_net(dev);
2265 2259
2266 /* IP on this device is disabled. */ 2260 /* IP on this device is disabled. */
2267 2261
@@ -2406,9 +2400,9 @@ no_route:
2406martian_destination: 2400martian_destination:
2407 RT_CACHE_STAT_INC(in_martian_dst); 2401 RT_CACHE_STAT_INC(in_martian_dst);
2408#ifdef CONFIG_IP_ROUTE_VERBOSE 2402#ifdef CONFIG_IP_ROUTE_VERBOSE
2409 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 2403 if (IN_DEV_LOG_MARTIANS(in_dev))
2410 pr_warn("martian destination %pI4 from %pI4, dev %s\n", 2404 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2411 &daddr, &saddr, dev->name); 2405 &daddr, &saddr, dev->name);
2412#endif 2406#endif
2413 2407
2414e_hostunreach: 2408e_hostunreach:
@@ -2433,8 +2427,8 @@ martian_source_keep_err:
2433int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2427int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2434 u8 tos, struct net_device *dev, bool noref) 2428 u8 tos, struct net_device *dev, bool noref)
2435{ 2429{
2436 struct rtable * rth; 2430 struct rtable *rth;
2437 unsigned hash; 2431 unsigned int hash;
2438 int iif = dev->ifindex; 2432 int iif = dev->ifindex;
2439 struct net *net; 2433 struct net *net;
2440 int res; 2434 int res;
@@ -2972,7 +2966,8 @@ static int rt_fill_info(struct net *net,
2972 r->rtm_src_len = 0; 2966 r->rtm_src_len = 0;
2973 r->rtm_tos = rt->rt_key_tos; 2967 r->rtm_tos = rt->rt_key_tos;
2974 r->rtm_table = RT_TABLE_MAIN; 2968 r->rtm_table = RT_TABLE_MAIN;
2975 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2969 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2970 goto nla_put_failure;
2976 r->rtm_type = rt->rt_type; 2971 r->rtm_type = rt->rt_type;
2977 r->rtm_scope = RT_SCOPE_UNIVERSE; 2972 r->rtm_scope = RT_SCOPE_UNIVERSE;
2978 r->rtm_protocol = RTPROT_UNSPEC; 2973 r->rtm_protocol = RTPROT_UNSPEC;
@@ -2980,31 +2975,38 @@ static int rt_fill_info(struct net *net,
2980 if (rt->rt_flags & RTCF_NOTIFY) 2975 if (rt->rt_flags & RTCF_NOTIFY)
2981 r->rtm_flags |= RTM_F_NOTIFY; 2976 r->rtm_flags |= RTM_F_NOTIFY;
2982 2977
2983 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2978 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2984 2979 goto nla_put_failure;
2985 if (rt->rt_key_src) { 2980 if (rt->rt_key_src) {
2986 r->rtm_src_len = 32; 2981 r->rtm_src_len = 32;
2987 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); 2982 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2983 goto nla_put_failure;
2988 } 2984 }
2989 if (rt->dst.dev) 2985 if (rt->dst.dev &&
2990 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2986 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2987 goto nla_put_failure;
2991#ifdef CONFIG_IP_ROUTE_CLASSID 2988#ifdef CONFIG_IP_ROUTE_CLASSID
2992 if (rt->dst.tclassid) 2989 if (rt->dst.tclassid &&
2993 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2990 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2991 goto nla_put_failure;
2994#endif 2992#endif
2995 if (rt_is_input_route(rt)) 2993 if (rt_is_input_route(rt)) {
2996 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2994 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2997 else if (rt->rt_src != rt->rt_key_src) 2995 goto nla_put_failure;
2998 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2996 } else if (rt->rt_src != rt->rt_key_src) {
2999 2997 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
3000 if (rt->rt_dst != rt->rt_gateway) 2998 goto nla_put_failure;
3001 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2999 }
3000 if (rt->rt_dst != rt->rt_gateway &&
3001 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3002 goto nla_put_failure;
3002 3003
3003 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 3004 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3004 goto nla_put_failure; 3005 goto nla_put_failure;
3005 3006
3006 if (rt->rt_mark) 3007 if (rt->rt_mark &&
3007 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); 3008 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
3009 goto nla_put_failure;
3008 3010
3009 error = rt->dst.error; 3011 error = rt->dst.error;
3010 if (peer) { 3012 if (peer) {
@@ -3045,7 +3047,8 @@ static int rt_fill_info(struct net *net,
3045 } 3047 }
3046 } else 3048 } else
3047#endif 3049#endif
3048 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); 3050 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3051 goto nla_put_failure;
3049 } 3052 }
3050 3053
3051 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 3054 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
@@ -3059,7 +3062,7 @@ nla_put_failure:
3059 return -EMSGSIZE; 3062 return -EMSGSIZE;
3060} 3063}
3061 3064
3062static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 3065static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3063{ 3066{
3064 struct net *net = sock_net(in_skb->sk); 3067 struct net *net = sock_net(in_skb->sk);
3065 struct rtmsg *rtm; 3068 struct rtmsg *rtm;
@@ -3334,23 +3337,6 @@ static ctl_table ipv4_route_table[] = {
3334 { } 3337 { }
3335}; 3338};
3336 3339
3337static struct ctl_table empty[1];
3338
3339static struct ctl_table ipv4_skeleton[] =
3340{
3341 { .procname = "route",
3342 .mode = 0555, .child = ipv4_route_table},
3343 { .procname = "neigh",
3344 .mode = 0555, .child = empty},
3345 { }
3346};
3347
3348static __net_initdata struct ctl_path ipv4_path[] = {
3349 { .procname = "net", },
3350 { .procname = "ipv4", },
3351 { },
3352};
3353
3354static struct ctl_table ipv4_route_flush_table[] = { 3340static struct ctl_table ipv4_route_flush_table[] = {
3355 { 3341 {
3356 .procname = "flush", 3342 .procname = "flush",
@@ -3361,13 +3347,6 @@ static struct ctl_table ipv4_route_flush_table[] = {
3361 { }, 3347 { },
3362}; 3348};
3363 3349
3364static __net_initdata struct ctl_path ipv4_route_path[] = {
3365 { .procname = "net", },
3366 { .procname = "ipv4", },
3367 { .procname = "route", },
3368 { },
3369};
3370
3371static __net_init int sysctl_route_net_init(struct net *net) 3350static __net_init int sysctl_route_net_init(struct net *net)
3372{ 3351{
3373 struct ctl_table *tbl; 3352 struct ctl_table *tbl;
@@ -3380,8 +3359,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
3380 } 3359 }
3381 tbl[0].extra1 = net; 3360 tbl[0].extra1 = net;
3382 3361
3383 net->ipv4.route_hdr = 3362 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3384 register_net_sysctl_table(net, ipv4_route_path, tbl);
3385 if (net->ipv4.route_hdr == NULL) 3363 if (net->ipv4.route_hdr == NULL)
3386 goto err_reg; 3364 goto err_reg;
3387 return 0; 3365 return 0;
@@ -3430,9 +3408,15 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3430static __initdata unsigned long rhash_entries; 3408static __initdata unsigned long rhash_entries;
3431static int __init set_rhash_entries(char *str) 3409static int __init set_rhash_entries(char *str)
3432{ 3410{
3411 ssize_t ret;
3412
3433 if (!str) 3413 if (!str)
3434 return 0; 3414 return 0;
3435 rhash_entries = simple_strtoul(str, &str, 0); 3415
3416 ret = kstrtoul(str, 0, &rhash_entries);
3417 if (ret)
3418 return 0;
3419
3436 return 1; 3420 return 1;
3437} 3421}
3438__setup("rhash_entries=", set_rhash_entries); 3422__setup("rhash_entries=", set_rhash_entries);
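
set_rhash_entries() (and set_thash_entries() further down) now uses kstrtoul(), which rejects the whole string on overflow or trailing garbage, instead of simple_strtoul(), which silently stops at the first non-digit. A hedged userspace analogue, where parse_ulong() is an illustrative helper built on strtoul():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Strict parse in the spirit of kstrtoul(): the whole string must be a number. */
static int parse_ulong(const char *s, int base, unsigned long *res)
{
	char *end;
	unsigned long val;

	if (!s || !*s)
		return -EINVAL;
	errno = 0;
	val = strtoul(s, &end, base);
	if (errno == ERANGE)
		return -ERANGE;
	if (*end != '\0' && *end != '\n')
		return -EINVAL;	/* trailing junk: reject, unlike simple_strtoul() */
	*res = val;
	return 0;
}

int main(void)
{
	unsigned long v = 0;
	int ok = parse_ulong("4096", 0, &v);
	int bad = parse_ulong("4096xy", 0, &v);

	printf("\"4096\"   -> %d, v = %lu\n", ok, v);
	printf("\"4096xy\" -> %d\n", bad);
	return 0;
}
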
@@ -3505,6 +3489,6 @@ int __init ip_rt_init(void)
3505 */ 3489 */
3506void __init ip_static_sysctl_init(void) 3490void __init ip_static_sysctl_init(void)
3507{ 3491{
3508 register_sysctl_paths(ipv4_path, ipv4_skeleton); 3492 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3509} 3493}
3510#endif 3494#endif
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7a7724da9bff..ef32956ed655 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@
27#include <net/tcp_memcontrol.h> 27#include <net/tcp_memcontrol.h>
28 28
29static int zero; 29static int zero;
30static int two = 2;
30static int tcp_retr1_max = 255; 31static int tcp_retr1_max = 255;
31static int ip_local_port_range_min[] = { 1, 1 }; 32static int ip_local_port_range_min[] = { 1, 1 };
32static int ip_local_port_range_max[] = { 65535, 65535 }; 33static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -78,7 +79,7 @@ static int ipv4_local_port_range(ctl_table *table, int write,
78static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) 79static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
79{ 80{
80 gid_t *data = table->data; 81 gid_t *data = table->data;
81 unsigned seq; 82 unsigned int seq;
82 do { 83 do {
83 seq = read_seqbegin(&sysctl_local_ports.lock); 84 seq = read_seqbegin(&sysctl_local_ports.lock);
84 85
@@ -677,6 +678,15 @@ static struct ctl_table ipv4_table[] = {
677 .proc_handler = proc_dointvec 678 .proc_handler = proc_dointvec
678 }, 679 },
679 { 680 {
681 .procname = "tcp_early_retrans",
682 .data = &sysctl_tcp_early_retrans,
683 .maxlen = sizeof(int),
684 .mode = 0644,
685 .proc_handler = proc_dointvec_minmax,
686 .extra1 = &zero,
687 .extra2 = &two,
688 },
689 {
680 .procname = "udp_mem", 690 .procname = "udp_mem",
681 .data = &sysctl_udp_mem, 691 .data = &sysctl_udp_mem,
682 .maxlen = sizeof(sysctl_udp_mem), 692 .maxlen = sizeof(sysctl_udp_mem),
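
The new net.ipv4.tcp_early_retrans knob added above is clamped to the range 0..2 by proc_dointvec_minmax via the &zero/&two bounds. A small sketch of reading and updating it through procfs; the /proc/sys path follows from the ipv4_table registration, and writing it requires root:

#include <stdio.h>

#define ER_PATH "/proc/sys/net/ipv4/tcp_early_retrans"

int main(void)
{
	FILE *f = fopen(ER_PATH, "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		perror(ER_PATH);
		return 1;
	}
	fclose(f);
	printf("tcp_early_retrans = %d\n", val);

	/* Enable early retransmit; valid values per this table are 0..2. */
	f = fopen(ER_PATH, "w");
	if (f) {
		fprintf(f, "2\n");
		fclose(f);
	}
	return 0;
}
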
@@ -768,13 +778,6 @@ static struct ctl_table ipv4_net_table[] = {
768 { } 778 { }
769}; 779};
770 780
771struct ctl_path net_ipv4_ctl_path[] = {
772 { .procname = "net", },
773 { .procname = "ipv4", },
774 { },
775};
776EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
777
778static __net_init int ipv4_sysctl_init_net(struct net *net) 781static __net_init int ipv4_sysctl_init_net(struct net *net)
779{ 782{
780 struct ctl_table *table; 783 struct ctl_table *table;
@@ -815,8 +818,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
815 818
816 tcp_init_mem(net); 819 tcp_init_mem(net);
817 820
818 net->ipv4.ipv4_hdr = register_net_sysctl_table(net, 821 net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
819 net_ipv4_ctl_path, table);
820 if (net->ipv4.ipv4_hdr == NULL) 822 if (net->ipv4.ipv4_hdr == NULL)
821 goto err_reg; 823 goto err_reg;
822 824
@@ -857,12 +859,12 @@ static __init int sysctl_ipv4_init(void)
857 if (!i->procname) 859 if (!i->procname)
858 return -EINVAL; 860 return -EINVAL;
859 861
860 hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); 862 hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
861 if (hdr == NULL) 863 if (hdr == NULL)
862 return -ENOMEM; 864 return -ENOMEM;
863 865
864 if (register_pernet_subsys(&ipv4_sysctl_ops)) { 866 if (register_pernet_subsys(&ipv4_sysctl_ops)) {
865 unregister_sysctl_table(hdr); 867 unregister_net_sysctl_table(hdr);
866 return -ENOMEM; 868 return -ENOMEM;
867 } 869 }
868 870
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cfd7edda0a8e..bb485fcb077e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -363,6 +363,71 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
363 return period; 363 return period;
364} 364}
365 365
366/* Address-family independent initialization for a tcp_sock.
367 *
368 * NOTE: A lot of things set to zero explicitly by call to
369 * sk_alloc() so need not be done here.
370 */
371void tcp_init_sock(struct sock *sk)
372{
373 struct inet_connection_sock *icsk = inet_csk(sk);
374 struct tcp_sock *tp = tcp_sk(sk);
375
376 skb_queue_head_init(&tp->out_of_order_queue);
377 tcp_init_xmit_timers(sk);
378 tcp_prequeue_init(tp);
379
380 icsk->icsk_rto = TCP_TIMEOUT_INIT;
381 tp->mdev = TCP_TIMEOUT_INIT;
382
383 /* So many TCP implementations out there (incorrectly) count the
384 * initial SYN frame in their delayed-ACK and congestion control
385 * algorithms that we must have the following bandaid to talk
386 * efficiently to them. -DaveM
387 */
388 tp->snd_cwnd = TCP_INIT_CWND;
389
390 /* See draft-stevens-tcpca-spec-01 for discussion of the
391 * initialization of these values.
392 */
393 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
394 tp->snd_cwnd_clamp = ~0;
395 tp->mss_cache = TCP_MSS_DEFAULT;
396
397 tp->reordering = sysctl_tcp_reordering;
398 tcp_enable_early_retrans(tp);
399 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
400
401 sk->sk_state = TCP_CLOSE;
402
403 sk->sk_write_space = sk_stream_write_space;
404 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
405
406 icsk->icsk_sync_mss = tcp_sync_mss;
407
408 /* TCP Cookie Transactions */
409 if (sysctl_tcp_cookie_size > 0) {
410 /* Default, cookies without s_data_payload. */
411 tp->cookie_values =
412 kzalloc(sizeof(*tp->cookie_values),
413 sk->sk_allocation);
414 if (tp->cookie_values != NULL)
415 kref_init(&tp->cookie_values->kref);
416 }
417 /* Presumed zeroed, in order of appearance:
418 * cookie_in_always, cookie_out_never,
419 * s_data_constant, s_data_in, s_data_out
420 */
421 sk->sk_sndbuf = sysctl_tcp_wmem[1];
422 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
423
424 local_bh_disable();
425 sock_update_memcg(sk);
426 sk_sockets_allocated_inc(sk);
427 local_bh_enable();
428}
429EXPORT_SYMBOL(tcp_init_sock);
430
366/* 431/*
367 * Wait for a TCP event. 432 * Wait for a TCP event.
368 * 433 *
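
tcp_init_sock() starts every socket with snd_cwnd = TCP_INIT_CWND and snd_ssthresh = TCP_INFINITE_SSTHRESH, so a new connection begins in slow start and the congestion window grows by roughly one segment per ACKed segment. A deliberately simplified model of that growth (assumes one ACK per segment, no loss, and ignores ABC and delayed ACKs; the 10-segment initial window and 1460-byte MSS are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int cwnd = 10;		/* initial window, illustrative */
	unsigned int mss = 1460;	/* bytes, illustrative */
	unsigned int rtt;

	/* In slow start cwnd grows by one segment per ACKed segment,
	 * i.e. it roughly doubles every round trip until loss or the
	 * receiver window caps it. */
	for (rtt = 1; rtt <= 5; rtt++) {
		cwnd += cwnd;		/* all cwnd segments ACKed this RTT */
		printf("after RTT %u: cwnd = %u segments (~%u KB in flight)\n",
		       rtt, cwnd, cwnd * mss / 1024);
	}
	return 0;
}
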
@@ -528,7 +593,7 @@ static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
528 tp->pushed_seq = tp->write_seq; 593 tp->pushed_seq = tp->write_seq;
529} 594}
530 595
531static inline int forced_push(const struct tcp_sock *tp) 596static inline bool forced_push(const struct tcp_sock *tp)
532{ 597{
533 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 598 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
534} 599}
@@ -701,11 +766,12 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
701 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); 766 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
702 if (skb) { 767 if (skb) {
703 if (sk_wmem_schedule(sk, skb->truesize)) { 768 if (sk_wmem_schedule(sk, skb->truesize)) {
769 skb_reserve(skb, sk->sk_prot->max_header);
704 /* 770 /*
705 * Make sure that we have exactly size bytes 771 * Make sure that we have exactly size bytes
706 * available to the caller, no more, no less. 772 * available to the caller, no more, no less.
707 */ 773 */
708 skb_reserve(skb, skb_tailroom(skb) - size); 774 skb->avail_size = size;
709 return skb; 775 return skb;
710 } 776 }
711 __kfree_skb(skb); 777 __kfree_skb(skb);
@@ -783,9 +849,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
783 while (psize > 0) { 849 while (psize > 0) {
784 struct sk_buff *skb = tcp_write_queue_tail(sk); 850 struct sk_buff *skb = tcp_write_queue_tail(sk);
785 struct page *page = pages[poffset / PAGE_SIZE]; 851 struct page *page = pages[poffset / PAGE_SIZE];
786 int copy, i, can_coalesce; 852 int copy, i;
787 int offset = poffset % PAGE_SIZE; 853 int offset = poffset % PAGE_SIZE;
788 int size = min_t(size_t, psize, PAGE_SIZE - offset); 854 int size = min_t(size_t, psize, PAGE_SIZE - offset);
855 bool can_coalesce;
789 856
790 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { 857 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
791new_segment: 858new_segment:
@@ -850,8 +917,7 @@ new_segment:
850wait_for_sndbuf: 917wait_for_sndbuf:
851 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 918 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
852wait_for_memory: 919wait_for_memory:
853 if (copied) 920 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
854 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
855 921
856 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 922 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
857 goto do_error; 923 goto do_error;
@@ -860,7 +926,7 @@ wait_for_memory:
860 } 926 }
861 927
862out: 928out:
863 if (copied) 929 if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
864 tcp_push(sk, flags, mss_now, tp->nonagle); 930 tcp_push(sk, flags, mss_now, tp->nonagle);
865 return copied; 931 return copied;
866 932
@@ -918,7 +984,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
918 struct tcp_sock *tp = tcp_sk(sk); 984 struct tcp_sock *tp = tcp_sk(sk);
919 struct sk_buff *skb; 985 struct sk_buff *skb;
920 int iovlen, flags, err, copied; 986 int iovlen, flags, err, copied;
921 int mss_now, size_goal; 987 int mss_now = 0, size_goal;
922 bool sg; 988 bool sg;
923 long timeo; 989 long timeo;
924 990
@@ -932,6 +998,19 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
932 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 998 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
933 goto out_err; 999 goto out_err;
934 1000
1001 if (unlikely(tp->repair)) {
1002 if (tp->repair_queue == TCP_RECV_QUEUE) {
1003 copied = tcp_send_rcvq(sk, msg, size);
1004 goto out;
1005 }
1006
1007 err = -EINVAL;
1008 if (tp->repair_queue == TCP_NO_QUEUE)
1009 goto out_err;
1010
1011 /* 'common' sending to sendq */
1012 }
1013
935 /* This should be in poll */ 1014 /* This should be in poll */
936 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 1015 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
937 1016
@@ -995,15 +1074,14 @@ new_segment:
995 copy = seglen; 1074 copy = seglen;
996 1075
997 /* Where to copy to? */ 1076 /* Where to copy to? */
998 if (skb_tailroom(skb) > 0) { 1077 if (skb_availroom(skb) > 0) {
999 /* We have some space in skb head. Superb! */ 1078 /* We have some space in skb head. Superb! */
1000 if (copy > skb_tailroom(skb)) 1079 copy = min_t(int, copy, skb_availroom(skb));
1001 copy = skb_tailroom(skb);
1002 err = skb_add_data_nocache(sk, skb, from, copy); 1080 err = skb_add_data_nocache(sk, skb, from, copy);
1003 if (err) 1081 if (err)
1004 goto do_fault; 1082 goto do_fault;
1005 } else { 1083 } else {
1006 int merge = 0; 1084 bool merge = false;
1007 int i = skb_shinfo(skb)->nr_frags; 1085 int i = skb_shinfo(skb)->nr_frags;
1008 struct page *page = sk->sk_sndmsg_page; 1086 struct page *page = sk->sk_sndmsg_page;
1009 int off; 1087 int off;
@@ -1017,7 +1095,7 @@ new_segment:
1017 off != PAGE_SIZE) { 1095 off != PAGE_SIZE) {
1018 /* We can extend the last page 1096 /* We can extend the last page
1019 * fragment. */ 1097 * fragment. */
1020 merge = 1; 1098 merge = true;
1021 } else if (i == MAX_SKB_FRAGS || !sg) { 1099 } else if (i == MAX_SKB_FRAGS || !sg) {
1022 /* Need to add new fragment and cannot 1100 /* Need to add new fragment and cannot
1023 * do this because interface is non-SG, 1101 * do this because interface is non-SG,
@@ -1089,7 +1167,7 @@ new_segment:
1089 if ((seglen -= copy) == 0 && iovlen == 0) 1167 if ((seglen -= copy) == 0 && iovlen == 0)
1090 goto out; 1168 goto out;
1091 1169
1092 if (skb->len < max || (flags & MSG_OOB)) 1170 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1093 continue; 1171 continue;
1094 1172
1095 if (forced_push(tp)) { 1173 if (forced_push(tp)) {
@@ -1102,7 +1180,7 @@ new_segment:
1102wait_for_sndbuf: 1180wait_for_sndbuf:
1103 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1181 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1104wait_for_memory: 1182wait_for_memory:
1105 if (copied) 1183 if (copied && likely(!tp->repair))
1106 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 1184 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1107 1185
1108 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 1186 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
@@ -1113,7 +1191,7 @@ wait_for_memory:
1113 } 1191 }
1114 1192
1115out: 1193out:
1116 if (copied) 1194 if (copied && likely(!tp->repair))
1117 tcp_push(sk, flags, mss_now, tp->nonagle); 1195 tcp_push(sk, flags, mss_now, tp->nonagle);
1118 release_sock(sk); 1196 release_sock(sk);
1119 return copied; 1197 return copied;
@@ -1187,6 +1265,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1187 return -EAGAIN; 1265 return -EAGAIN;
1188} 1266}
1189 1267
1268static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1269{
1270 struct sk_buff *skb;
1271 int copied = 0, err = 0;
1272
1273 /* XXX -- need to support SO_PEEK_OFF */
1274
1275 skb_queue_walk(&sk->sk_write_queue, skb) {
1276 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1277 if (err)
1278 break;
1279
1280 copied += skb->len;
1281 }
1282
1283 return err ?: copied;
1284}
1285
1190/* Clean up the receive buffer for full frames taken by the user, 1286/* Clean up the receive buffer for full frames taken by the user,
1191 * then send an ACK if necessary. COPIED is the number of bytes 1287 * then send an ACK if necessary. COPIED is the number of bytes
1192 * tcp_recvmsg has given to the user so far, it speeds up the 1288 * tcp_recvmsg has given to the user so far, it speeds up the
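
tcp_peek_sndq() lets a checkpointing tool read back the data still sitting in the write queue: with tp->repair set and TCP_REPAIR_QUEUE pointed at TCP_SEND_QUEUE, a recvmsg() with MSG_PEEK is routed here instead of to the receive path. A hedged checkpoint-side sketch; the fd is a placeholder for a real established socket already switched into repair mode, and the fallback option values are assumptions taken from this series' linux/tcp.h:

#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Fallback values, assumed from the linux/tcp.h additions in this series. */
#ifndef TCP_REPAIR_QUEUE
#define TCP_REPAIR_QUEUE	20
#endif
#ifndef TCP_SEND_QUEUE
#define TCP_SEND_QUEUE		2
#endif

/* Peek whatever is still queued for transmission on a socket that has
 * already been put into repair mode; only MSG_PEEK is allowed there. */
static long dump_send_queue(int fd, char *buf, size_t len)
{
	int q = TCP_SEND_QUEUE;

	if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q)) < 0)
		return -1;
	return recv(fd, buf, len, MSG_PEEK | MSG_DONTWAIT);
}

int main(void)
{
	char buf[4096];
	/* Placeholder descriptor; a real tool passes its checkpointed socket. */
	long n = dump_send_queue(0, buf, sizeof(buf));

	printf("peeked %ld bytes\n", n);
	return 0;
}
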
@@ -1196,7 +1292,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1196void tcp_cleanup_rbuf(struct sock *sk, int copied) 1292void tcp_cleanup_rbuf(struct sock *sk, int copied)
1197{ 1293{
1198 struct tcp_sock *tp = tcp_sk(sk); 1294 struct tcp_sock *tp = tcp_sk(sk);
1199 int time_to_ack = 0; 1295 bool time_to_ack = false;
1200 1296
1201 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1297 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1202 1298
@@ -1222,7 +1318,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1222 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && 1318 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1223 !icsk->icsk_ack.pingpong)) && 1319 !icsk->icsk_ack.pingpong)) &&
1224 !atomic_read(&sk->sk_rmem_alloc))) 1320 !atomic_read(&sk->sk_rmem_alloc)))
1225 time_to_ack = 1; 1321 time_to_ack = true;
1226 } 1322 }
1227 1323
1228 /* We send an ACK if we can now advertise a non-zero window 1324 /* We send an ACK if we can now advertise a non-zero window
@@ -1244,7 +1340,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1244 * "Lots" means "at least twice" here. 1340 * "Lots" means "at least twice" here.
1245 */ 1341 */
1246 if (new_window && new_window >= 2 * rcv_window_now) 1342 if (new_window && new_window >= 2 * rcv_window_now)
1247 time_to_ack = 1; 1343 time_to_ack = true;
1248 } 1344 }
1249 } 1345 }
1250 if (time_to_ack) 1346 if (time_to_ack)
@@ -1376,11 +1472,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1376 break; 1472 break;
1377 } 1473 }
1378 if (tcp_hdr(skb)->fin) { 1474 if (tcp_hdr(skb)->fin) {
1379 sk_eat_skb(sk, skb, 0); 1475 sk_eat_skb(sk, skb, false);
1380 ++seq; 1476 ++seq;
1381 break; 1477 break;
1382 } 1478 }
1383 sk_eat_skb(sk, skb, 0); 1479 sk_eat_skb(sk, skb, false);
1384 if (!desc->count) 1480 if (!desc->count)
1385 break; 1481 break;
1386 tp->copied_seq = seq; 1482 tp->copied_seq = seq;
@@ -1416,7 +1512,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1416 int target; /* Read at least this many bytes */ 1512 int target; /* Read at least this many bytes */
1417 long timeo; 1513 long timeo;
1418 struct task_struct *user_recv = NULL; 1514 struct task_struct *user_recv = NULL;
1419 int copied_early = 0; 1515 bool copied_early = false;
1420 struct sk_buff *skb; 1516 struct sk_buff *skb;
1421 u32 urg_hole = 0; 1517 u32 urg_hole = 0;
1422 1518
@@ -1432,6 +1528,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1432 if (flags & MSG_OOB) 1528 if (flags & MSG_OOB)
1433 goto recv_urg; 1529 goto recv_urg;
1434 1530
1531 if (unlikely(tp->repair)) {
1532 err = -EPERM;
1533 if (!(flags & MSG_PEEK))
1534 goto out;
1535
1536 if (tp->repair_queue == TCP_SEND_QUEUE)
1537 goto recv_sndq;
1538
1539 err = -EINVAL;
1540 if (tp->repair_queue == TCP_NO_QUEUE)
1541 goto out;
1542
1543 /* 'common' recv queue MSG_PEEK-ing */
1544 }
1545
1435 seq = &tp->copied_seq; 1546 seq = &tp->copied_seq;
1436 if (flags & MSG_PEEK) { 1547 if (flags & MSG_PEEK) {
1437 peek_seq = tp->copied_seq; 1548 peek_seq = tp->copied_seq;
@@ -1452,7 +1563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1452 if ((available < target) && 1563 if ((available < target) &&
1453 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 1564 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1454 !sysctl_tcp_low_latency && 1565 !sysctl_tcp_low_latency &&
1455 dma_find_channel(DMA_MEMCPY)) { 1566 net_dma_find_channel()) {
1456 preempt_enable_no_resched(); 1567 preempt_enable_no_resched();
1457 tp->ucopy.pinned_list = 1568 tp->ucopy.pinned_list =
1458 dma_pin_iovec_pages(msg->msg_iov, len); 1569 dma_pin_iovec_pages(msg->msg_iov, len);
@@ -1633,9 +1744,9 @@ do_prequeue:
1633 } 1744 }
1634 if ((flags & MSG_PEEK) && 1745 if ((flags & MSG_PEEK) &&
1635 (peek_seq - copied - urg_hole != tp->copied_seq)) { 1746 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1636 if (net_ratelimit()) 1747 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1637 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", 1748 current->comm,
1638 current->comm, task_pid_nr(current)); 1749 task_pid_nr(current));
1639 peek_seq = tp->copied_seq; 1750 peek_seq = tp->copied_seq;
1640 } 1751 }
1641 continue; 1752 continue;
@@ -1667,7 +1778,7 @@ do_prequeue:
1667 if (!(flags & MSG_TRUNC)) { 1778 if (!(flags & MSG_TRUNC)) {
1668#ifdef CONFIG_NET_DMA 1779#ifdef CONFIG_NET_DMA
1669 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 1780 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1670 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); 1781 tp->ucopy.dma_chan = net_dma_find_channel();
1671 1782
1672 if (tp->ucopy.dma_chan) { 1783 if (tp->ucopy.dma_chan) {
1673 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( 1784 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
@@ -1689,7 +1800,7 @@ do_prequeue:
1689 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); 1800 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1690 1801
1691 if ((offset + used) == skb->len) 1802 if ((offset + used) == skb->len)
1692 copied_early = 1; 1803 copied_early = true;
1693 1804
1694 } else 1805 } else
1695#endif 1806#endif
@@ -1723,7 +1834,7 @@ skip_copy:
1723 goto found_fin_ok; 1834 goto found_fin_ok;
1724 if (!(flags & MSG_PEEK)) { 1835 if (!(flags & MSG_PEEK)) {
1725 sk_eat_skb(sk, skb, copied_early); 1836 sk_eat_skb(sk, skb, copied_early);
1726 copied_early = 0; 1837 copied_early = false;
1727 } 1838 }
1728 continue; 1839 continue;
1729 1840
@@ -1732,7 +1843,7 @@ skip_copy:
1732 ++*seq; 1843 ++*seq;
1733 if (!(flags & MSG_PEEK)) { 1844 if (!(flags & MSG_PEEK)) {
1734 sk_eat_skb(sk, skb, copied_early); 1845 sk_eat_skb(sk, skb, copied_early);
1735 copied_early = 0; 1846 copied_early = false;
1736 } 1847 }
1737 break; 1848 break;
1738 } while (len > 0); 1849 } while (len > 0);
@@ -1783,6 +1894,10 @@ out:
1783recv_urg: 1894recv_urg:
1784 err = tcp_recv_urg(sk, msg, len, flags); 1895 err = tcp_recv_urg(sk, msg, len, flags);
1785 goto out; 1896 goto out;
1897
1898recv_sndq:
1899 err = tcp_peek_sndq(sk, msg, len);
1900 goto out;
1786} 1901}
1787EXPORT_SYMBOL(tcp_recvmsg); 1902EXPORT_SYMBOL(tcp_recvmsg);
1788 1903
@@ -1886,10 +2001,10 @@ bool tcp_check_oom(struct sock *sk, int shift)
1886 too_many_orphans = tcp_too_many_orphans(sk, shift); 2001 too_many_orphans = tcp_too_many_orphans(sk, shift);
1887 out_of_socket_memory = tcp_out_of_memory(sk); 2002 out_of_socket_memory = tcp_out_of_memory(sk);
1888 2003
1889 if (too_many_orphans && net_ratelimit()) 2004 if (too_many_orphans)
1890 pr_info("too many orphaned sockets\n"); 2005 net_info_ratelimited("too many orphaned sockets\n");
1891 if (out_of_socket_memory && net_ratelimit()) 2006 if (out_of_socket_memory)
1892 pr_info("out of memory -- consider tuning tcp_mem\n"); 2007 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
1893 return too_many_orphans || out_of_socket_memory; 2008 return too_many_orphans || out_of_socket_memory;
1894} 2009}
1895 2010
@@ -1935,7 +2050,9 @@ void tcp_close(struct sock *sk, long timeout)
1935 * advertise a zero window, then kill -9 the FTP client, wheee... 2050 * advertise a zero window, then kill -9 the FTP client, wheee...
1936 * Note: timeout is always zero in such a case. 2051 * Note: timeout is always zero in such a case.
1937 */ 2052 */
1938 if (data_was_unread) { 2053 if (unlikely(tcp_sk(sk)->repair)) {
2054 sk->sk_prot->disconnect(sk, 0);
2055 } else if (data_was_unread) {
1939 /* Unread data was tossed, zap the connection. */ 2056 /* Unread data was tossed, zap the connection. */
1940 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); 2057 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
1941 tcp_set_state(sk, TCP_CLOSE); 2058 tcp_set_state(sk, TCP_CLOSE);
@@ -2053,7 +2170,7 @@ EXPORT_SYMBOL(tcp_close);
2053 2170
2054/* These states need RST on ABORT according to RFC793 */ 2171/* These states need RST on ABORT according to RFC793 */
2055 2172
2056static inline int tcp_need_reset(int state) 2173static inline bool tcp_need_reset(int state)
2057{ 2174{
2058 return (1 << state) & 2175 return (1 << state) &
2059 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | 2176 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
@@ -2074,6 +2191,8 @@ int tcp_disconnect(struct sock *sk, int flags)
2074 /* ABORT function of RFC793 */ 2191 /* ABORT function of RFC793 */
2075 if (old_state == TCP_LISTEN) { 2192 if (old_state == TCP_LISTEN) {
2076 inet_csk_listen_stop(sk); 2193 inet_csk_listen_stop(sk);
2194 } else if (unlikely(tp->repair)) {
2195 sk->sk_err = ECONNABORTED;
2077 } else if (tcp_need_reset(old_state) || 2196 } else if (tcp_need_reset(old_state) ||
2078 (tp->snd_nxt != tp->write_seq && 2197 (tp->snd_nxt != tp->write_seq &&
2079 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 2198 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -2125,6 +2244,54 @@ int tcp_disconnect(struct sock *sk, int flags)
2125} 2244}
2126EXPORT_SYMBOL(tcp_disconnect); 2245EXPORT_SYMBOL(tcp_disconnect);
2127 2246
2247static inline bool tcp_can_repair_sock(const struct sock *sk)
2248{
2249 return capable(CAP_NET_ADMIN) &&
2250 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2251}
2252
2253static int tcp_repair_options_est(struct tcp_sock *tp,
2254 struct tcp_repair_opt __user *optbuf, unsigned int len)
2255{
2256 struct tcp_repair_opt opt;
2257
2258 while (len >= sizeof(opt)) {
2259 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2260 return -EFAULT;
2261
2262 optbuf++;
2263 len -= sizeof(opt);
2264
2265 switch (opt.opt_code) {
2266 case TCPOPT_MSS:
2267 tp->rx_opt.mss_clamp = opt.opt_val;
2268 break;
2269 case TCPOPT_WINDOW:
2270 if (opt.opt_val > 14)
2271 return -EFBIG;
2272
2273 tp->rx_opt.snd_wscale = opt.opt_val;
2274 break;
2275 case TCPOPT_SACK_PERM:
2276 if (opt.opt_val != 0)
2277 return -EINVAL;
2278
2279 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2280 if (sysctl_tcp_fack)
2281 tcp_enable_fack(tp);
2282 break;
2283 case TCPOPT_TIMESTAMP:
2284 if (opt.opt_val != 0)
2285 return -EINVAL;
2286
2287 tp->rx_opt.tstamp_ok = 1;
2288 break;
2289 }
2290 }
2291
2292 return 0;
2293}
2294
2128/* 2295/*
2129 * Socket option code for TCP. 2296 * Socket option code for TCP.
2130 */ 2297 */
@@ -2295,6 +2462,55 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2295 err = -EINVAL; 2462 err = -EINVAL;
2296 else 2463 else
2297 tp->thin_dupack = val; 2464 tp->thin_dupack = val;
2465 if (tp->thin_dupack)
2466 tcp_disable_early_retrans(tp);
2467 break;
2468
2469 case TCP_REPAIR:
2470 if (!tcp_can_repair_sock(sk))
2471 err = -EPERM;
2472 else if (val == 1) {
2473 tp->repair = 1;
2474 sk->sk_reuse = SK_FORCE_REUSE;
2475 tp->repair_queue = TCP_NO_QUEUE;
2476 } else if (val == 0) {
2477 tp->repair = 0;
2478 sk->sk_reuse = SK_NO_REUSE;
2479 tcp_send_window_probe(sk);
2480 } else
2481 err = -EINVAL;
2482
2483 break;
2484
2485 case TCP_REPAIR_QUEUE:
2486 if (!tp->repair)
2487 err = -EPERM;
2488 else if (val < TCP_QUEUES_NR)
2489 tp->repair_queue = val;
2490 else
2491 err = -EINVAL;
2492 break;
2493
2494 case TCP_QUEUE_SEQ:
2495 if (sk->sk_state != TCP_CLOSE)
2496 err = -EPERM;
2497 else if (tp->repair_queue == TCP_SEND_QUEUE)
2498 tp->write_seq = val;
2499 else if (tp->repair_queue == TCP_RECV_QUEUE)
2500 tp->rcv_nxt = val;
2501 else
2502 err = -EINVAL;
2503 break;
2504
2505 case TCP_REPAIR_OPTIONS:
2506 if (!tp->repair)
2507 err = -EINVAL;
2508 else if (sk->sk_state == TCP_ESTABLISHED)
2509 err = tcp_repair_options_est(tp,
2510 (struct tcp_repair_opt __user *)optval,
2511 optlen);
2512 else
2513 err = -EPERM;
2298 break; 2514 break;
2299 2515
2300 case TCP_CORK: 2516 case TCP_CORK:
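
Together, TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ and TCP_REPAIR_OPTIONS let a CAP_NET_ADMIN restore tool rebuild an established connection without traffic on the wire: enter repair mode on a closed socket, force the queue sequence numbers, connect(), refill the queues, replay cached options, then leave repair mode. A hedged restore-side sketch; the numeric option values are fallbacks matching the uapi added by this series, and error handling is trimmed:

#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Fallback values, assumed from the linux/tcp.h additions in this series. */
#ifndef TCP_REPAIR
#define TCP_REPAIR		19
#define TCP_REPAIR_QUEUE	20
#define TCP_QUEUE_SEQ		21
#define TCP_REPAIR_OPTIONS	22
#endif
#ifndef TCP_RECV_QUEUE
#define TCP_RECV_QUEUE		1
#define TCP_SEND_QUEUE		2
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int on = 1, q;
	unsigned int snd_seq = 123456789;	/* saved at checkpoint time */

	/* Needs CAP_NET_ADMIN; the socket must be CLOSE or ESTABLISHED. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)) < 0) {
		perror("TCP_REPAIR");
		return 1;
	}

	/* Restore the write-side sequence number while still closed. */
	q = TCP_SEND_QUEUE;
	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
	setsockopt(fd, IPPROTO_TCP, TCP_QUEUE_SEQ, &snd_seq, sizeof(snd_seq));

	/* A real tool would now bind()/connect() (connect() in repair mode
	 * skips the normal handshake), refill both queues, replay cached
	 * options via TCP_REPAIR_OPTIONS, and only then drop repair mode. */
	on = 0;
	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on));
	close(fd);
	return 0;
}
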
@@ -2530,6 +2746,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2530 val = tp->mss_cache; 2746 val = tp->mss_cache;
2531 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2747 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2532 val = tp->rx_opt.user_mss; 2748 val = tp->rx_opt.user_mss;
2749 if (tp->repair)
2750 val = tp->rx_opt.mss_clamp;
2533 break; 2751 break;
2534 case TCP_NODELAY: 2752 case TCP_NODELAY:
2535 val = !!(tp->nonagle&TCP_NAGLE_OFF); 2753 val = !!(tp->nonagle&TCP_NAGLE_OFF);
@@ -2632,6 +2850,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2632 val = tp->thin_dupack; 2850 val = tp->thin_dupack;
2633 break; 2851 break;
2634 2852
2853 case TCP_REPAIR:
2854 val = tp->repair;
2855 break;
2856
2857 case TCP_REPAIR_QUEUE:
2858 if (tp->repair)
2859 val = tp->repair_queue;
2860 else
2861 return -EINVAL;
2862 break;
2863
2864 case TCP_QUEUE_SEQ:
2865 if (tp->repair_queue == TCP_SEND_QUEUE)
2866 val = tp->write_seq;
2867 else if (tp->repair_queue == TCP_RECV_QUEUE)
2868 val = tp->rcv_nxt;
2869 else
2870 return -EINVAL;
2871 break;
2872
2635 case TCP_USER_TIMEOUT: 2873 case TCP_USER_TIMEOUT:
2636 val = jiffies_to_msecs(icsk->icsk_user_timeout); 2874 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2637 break; 2875 break;
@@ -2675,7 +2913,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
2675{ 2913{
2676 struct sk_buff *segs = ERR_PTR(-EINVAL); 2914 struct sk_buff *segs = ERR_PTR(-EINVAL);
2677 struct tcphdr *th; 2915 struct tcphdr *th;
2678 unsigned thlen; 2916 unsigned int thlen;
2679 unsigned int seq; 2917 unsigned int seq;
2680 __be32 delta; 2918 __be32 delta;
2681 unsigned int oldlen; 2919 unsigned int oldlen;
@@ -2933,13 +3171,13 @@ out_free:
2933struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) 3171struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
2934{ 3172{
2935 struct tcp_md5sig_pool __percpu *pool; 3173 struct tcp_md5sig_pool __percpu *pool;
2936 int alloc = 0; 3174 bool alloc = false;
2937 3175
2938retry: 3176retry:
2939 spin_lock_bh(&tcp_md5sig_pool_lock); 3177 spin_lock_bh(&tcp_md5sig_pool_lock);
2940 pool = tcp_md5sig_pool; 3178 pool = tcp_md5sig_pool;
2941 if (tcp_md5sig_users++ == 0) { 3179 if (tcp_md5sig_users++ == 0) {
2942 alloc = 1; 3180 alloc = true;
2943 spin_unlock_bh(&tcp_md5sig_pool_lock); 3181 spin_unlock_bh(&tcp_md5sig_pool_lock);
2944 } else if (!pool) { 3182 } else if (!pool) {
2945 tcp_md5sig_users--; 3183 tcp_md5sig_users--;
@@ -3033,9 +3271,9 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3033 struct scatterlist sg; 3271 struct scatterlist sg;
3034 const struct tcphdr *tp = tcp_hdr(skb); 3272 const struct tcphdr *tp = tcp_hdr(skb);
3035 struct hash_desc *desc = &hp->md5_desc; 3273 struct hash_desc *desc = &hp->md5_desc;
3036 unsigned i; 3274 unsigned int i;
3037 const unsigned head_data_len = skb_headlen(skb) > header_len ? 3275 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3038 skb_headlen(skb) - header_len : 0; 3276 skb_headlen(skb) - header_len : 0;
3039 const struct skb_shared_info *shi = skb_shinfo(skb); 3277 const struct skb_shared_info *shi = skb_shinfo(skb);
3040 struct sk_buff *frag_iter; 3278 struct sk_buff *frag_iter;
3041 3279
@@ -3223,9 +3461,15 @@ extern struct tcp_congestion_ops tcp_reno;
3223static __initdata unsigned long thash_entries; 3461static __initdata unsigned long thash_entries;
3224static int __init set_thash_entries(char *str) 3462static int __init set_thash_entries(char *str)
3225{ 3463{
3464 ssize_t ret;
3465
3226 if (!str) 3466 if (!str)
3227 return 0; 3467 return 0;
3228 thash_entries = simple_strtoul(str, &str, 0); 3468
3469 ret = kstrtoul(str, 0, &thash_entries);
3470 if (ret)
3471 return 0;
3472
3229 return 1; 3473 return 1;
3230} 3474}
3231__setup("thash_entries=", set_thash_entries); 3475__setup("thash_entries=", set_thash_entries);
@@ -3243,7 +3487,7 @@ void __init tcp_init(void)
3243{ 3487{
3244 struct sk_buff *skb = NULL; 3488 struct sk_buff *skb = NULL;
3245 unsigned long limit; 3489 unsigned long limit;
3246 int max_share, cnt; 3490 int max_rshare, max_wshare, cnt;
3247 unsigned int i; 3491 unsigned int i;
3248 unsigned long jiffy = jiffies; 3492 unsigned long jiffy = jiffies;
3249 3493
@@ -3302,17 +3546,17 @@ void __init tcp_init(void)
3302 3546
3303 tcp_init_mem(&init_net); 3547 tcp_init_mem(&init_net);
3304 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 3548 /* Set per-socket limits to no more than 1/128 the pressure threshold */
3305 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10); 3549 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3306 limit = max(limit, 128UL); 3550 max_wshare = min(4UL*1024*1024, limit);
3307 max_share = min(4UL*1024*1024, limit); 3551 max_rshare = min(6UL*1024*1024, limit);
3308 3552
3309 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 3553 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3310 sysctl_tcp_wmem[1] = 16*1024; 3554 sysctl_tcp_wmem[1] = 16*1024;
3311 sysctl_tcp_wmem[2] = max(64*1024, max_share); 3555 sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3312 3556
3313 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; 3557 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3314 sysctl_tcp_rmem[1] = 87380; 3558 sysctl_tcp_rmem[1] = 87380;
3315 sysctl_tcp_rmem[2] = max(87380, max_share); 3559 sysctl_tcp_rmem[2] = max(87380, max_rshare);
3316 3560
3317 pr_info("Hash tables configured (established %u bind %u)\n", 3561 pr_info("Hash tables configured (established %u bind %u)\n",
3318 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3562 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
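
The per-socket defaults are now derived from the memory-pressure threshold: limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7) is the free buffer memory in bytes divided by 128, and the write/read ceilings are capped at 4 MB and 6 MB respectively. A quick arithmetic sketch for a hypothetical machine with 4 KB pages and roughly 2 GB of free buffer pages:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;				/* 4 KB pages, assumed */
	unsigned long free_pages = 2UL << (30 - page_shift);	/* ~2 GB worth */
	unsigned long limit, max_wshare, max_rshare, wmem2, rmem2;

	/* limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7) == free bytes / 128 */
	limit = free_pages << (page_shift - 7);
	max_wshare = limit < 4UL * 1024 * 1024 ? limit : 4UL * 1024 * 1024;
	max_rshare = limit < 6UL * 1024 * 1024 ? limit : 6UL * 1024 * 1024;

	wmem2 = max_wshare > 64UL * 1024 ? max_wshare : 64UL * 1024;
	rmem2 = max_rshare > 87380UL ? max_rshare : 87380UL;

	printf("limit = %lu bytes, tcp_wmem[2] = %lu, tcp_rmem[2] = %lu\n",
	       limit, wmem2, rmem2);
	return 0;
}
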
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 272a84593c85..04dbd7ae7c62 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -280,19 +280,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
280/* RFC2861 Check whether we are limited by application or congestion window 280/* RFC2861 Check whether we are limited by application or congestion window
281 * This is the inverse of cwnd check in tcp_tso_should_defer 281 * This is the inverse of cwnd check in tcp_tso_should_defer
282 */ 282 */
283int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) 283bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
284{ 284{
285 const struct tcp_sock *tp = tcp_sk(sk); 285 const struct tcp_sock *tp = tcp_sk(sk);
286 u32 left; 286 u32 left;
287 287
288 if (in_flight >= tp->snd_cwnd) 288 if (in_flight >= tp->snd_cwnd)
289 return 1; 289 return true;
290 290
291 left = tp->snd_cwnd - in_flight; 291 left = tp->snd_cwnd - in_flight;
292 if (sk_can_gso(sk) && 292 if (sk_can_gso(sk) &&
293 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && 293 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
294 left * tp->mss_cache < sk->sk_gso_max_size) 294 left * tp->mss_cache < sk->sk_gso_max_size)
295 return 1; 295 return true;
296 return left <= tcp_max_tso_deferred_mss(tp); 296 return left <= tcp_max_tso_deferred_mss(tp);
297} 297}
298EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); 298EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index fe3ecf484b44..57bdd17dff4d 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -15,7 +15,7 @@
15 15
16/* Tcp Hybla structure. */ 16/* Tcp Hybla structure. */
17struct hybla { 17struct hybla {
18 u8 hybla_en; 18 bool hybla_en;
19 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ 19 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
20 u32 rho; /* Rho parameter, integer part */ 20 u32 rho; /* Rho parameter, integer part */
21 u32 rho2; /* Rho * Rho, integer part */ 21 u32 rho2; /* Rho * Rho, integer part */
@@ -24,8 +24,7 @@ struct hybla {
24 u32 minrtt; /* Minimum smoothed round trip time value seen */ 24 u32 minrtt; /* Minimum smoothed round trip time value seen */
25}; 25};
26 26
27/* Hybla reference round trip time (default= 1/40 sec = 25 ms), 27/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
28 expressed in jiffies */
29static int rtt0 = 25; 28static int rtt0 = 25;
30module_param(rtt0, int, 0644); 29module_param(rtt0, int, 0644);
31MODULE_PARM_DESC(rtt0, "reference round trip time (ms)"); 30MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
@@ -39,7 +38,7 @@ static inline void hybla_recalc_param (struct sock *sk)
39 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); 38 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
40 ca->rho = ca->rho_3ls >> 3; 39 ca->rho = ca->rho_3ls >> 3;
41 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; 40 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
42 ca->rho2 = ca->rho2_7ls >>7; 41 ca->rho2 = ca->rho2_7ls >> 7;
43} 42}
44 43
45static void hybla_init(struct sock *sk) 44static void hybla_init(struct sock *sk)
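
hybla_recalc_param() keeps rho in 3-bit fixed point (rho_3ls = rho << 3, floored at 8, i.e. rho >= 1) and divides the scaled srtt by rtt0 converted to jiffies; the comment fix above notes that rtt0 is expressed in milliseconds. A hedged arithmetic sketch assuming HZ = 1000 so one jiffy equals one millisecond:

#include <stdio.h>

#define RTT0_MS 25	/* Hybla reference RTT, as in the module parameter */

int main(void)
{
	/* Assume HZ = 1000, so 1 jiffy == 1 ms; srtt is stored << 3. */
	unsigned int rtt_ms = 200;
	unsigned int srtt = rtt_ms << 3;
	unsigned int rho_3ls, rho, rho2_7ls, rho2;

	rho_3ls = srtt / RTT0_MS;		/* (RTT / RTT0) << 3 */
	if (rho_3ls < 8)			/* floor rho at 1 */
		rho_3ls = 8;
	rho = rho_3ls >> 3;
	rho2_7ls = (rho_3ls * rho_3ls) << 1;	/* rho^2 << 7 */
	rho2 = rho2_7ls >> 7;

	printf("RTT %u ms -> rho = %u, rho^2 = %u\n", rtt_ms, rho, rho2);
	return 0;
}
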
@@ -52,7 +51,7 @@ static void hybla_init(struct sock *sk)
52 ca->rho_3ls = 0; 51 ca->rho_3ls = 0;
53 ca->rho2_7ls = 0; 52 ca->rho2_7ls = 0;
54 ca->snd_cwnd_cents = 0; 53 ca->snd_cwnd_cents = 0;
55 ca->hybla_en = 1; 54 ca->hybla_en = true;
56 tp->snd_cwnd = 2; 55 tp->snd_cwnd = 2;
57 tp->snd_cwnd_clamp = 65535; 56 tp->snd_cwnd_clamp = 65535;
58 57
@@ -67,6 +66,7 @@ static void hybla_init(struct sock *sk)
67static void hybla_state(struct sock *sk, u8 ca_state) 66static void hybla_state(struct sock *sk, u8 ca_state)
68{ 67{
69 struct hybla *ca = inet_csk_ca(sk); 68 struct hybla *ca = inet_csk_ca(sk);
69
70 ca->hybla_en = (ca_state == TCP_CA_Open); 70 ca->hybla_en = (ca_state == TCP_CA_Open);
71} 71}
72 72
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e886e2f7fa8d..cfa2aa128342 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,7 +85,7 @@ int sysctl_tcp_ecn __read_mostly = 2;
85EXPORT_SYMBOL(sysctl_tcp_ecn); 85EXPORT_SYMBOL(sysctl_tcp_ecn);
86int sysctl_tcp_dsack __read_mostly = 1; 86int sysctl_tcp_dsack __read_mostly = 1;
87int sysctl_tcp_app_win __read_mostly = 31; 87int sysctl_tcp_app_win __read_mostly = 31;
88int sysctl_tcp_adv_win_scale __read_mostly = 2; 88int sysctl_tcp_adv_win_scale __read_mostly = 1;
89EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); 89EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
90 90
91int sysctl_tcp_stdurg __read_mostly; 91int sysctl_tcp_stdurg __read_mostly;
@@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
99 99
100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
101int sysctl_tcp_abc __read_mostly; 101int sysctl_tcp_abc __read_mostly;
102int sysctl_tcp_early_retrans __read_mostly = 2;
102 103
103#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 104#define FLAG_DATA 0x01 /* Incoming frame contained data. */
104#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 105#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -175,7 +176,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
175static void tcp_incr_quickack(struct sock *sk) 176static void tcp_incr_quickack(struct sock *sk)
176{ 177{
177 struct inet_connection_sock *icsk = inet_csk(sk); 178 struct inet_connection_sock *icsk = inet_csk(sk);
178 unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); 179 unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
179 180
180 if (quickacks == 0) 181 if (quickacks == 0)
181 quickacks = 2; 182 quickacks = 2;
@@ -195,9 +196,10 @@ static void tcp_enter_quickack_mode(struct sock *sk)
195 * and the session is not interactive. 196 * and the session is not interactive.
196 */ 197 */
197 198
198static inline int tcp_in_quickack_mode(const struct sock *sk) 199static inline bool tcp_in_quickack_mode(const struct sock *sk)
199{ 200{
200 const struct inet_connection_sock *icsk = inet_csk(sk); 201 const struct inet_connection_sock *icsk = inet_csk(sk);
202
201 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; 203 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
202} 204}
203 205
@@ -252,11 +254,11 @@ static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
252 tp->ecn_flags &= ~TCP_ECN_OK; 254 tp->ecn_flags &= ~TCP_ECN_OK;
253} 255}
254 256
255static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) 257static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
256{ 258{
257 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) 259 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
258 return 1; 260 return true;
259 return 0; 261 return false;
260} 262}
261 263
262/* Buffer size and advertised window tuning. 264/* Buffer size and advertised window tuning.
@@ -335,6 +337,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
335 incr = __tcp_grow_window(sk, skb); 337 incr = __tcp_grow_window(sk, skb);
336 338
337 if (incr) { 339 if (incr) {
340 incr = max_t(int, incr, 2 * skb->len);
338 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, 341 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
339 tp->window_clamp); 342 tp->window_clamp);
340 inet_csk(sk)->icsk_ack.quick |= 1; 343 inet_csk(sk)->icsk_ack.quick |= 1;
@@ -474,8 +477,11 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
474 if (!win_dep) { 477 if (!win_dep) {
475 m -= (new_sample >> 3); 478 m -= (new_sample >> 3);
476 new_sample += m; 479 new_sample += m;
477 } else if (m < new_sample) 480 } else {
478 new_sample = m << 3; 481 m <<= 3;
482 if (m < new_sample)
483 new_sample = m;
484 }
479 } else { 485 } else {
480 /* No previous measure. */ 486 /* No previous measure. */
481 new_sample = m << 3; 487 new_sample = m << 3;
@@ -491,7 +497,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
491 goto new_measure; 497 goto new_measure;
492 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) 498 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
493 return; 499 return;
494 tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); 500 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
495 501
496new_measure: 502new_measure:
497 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; 503 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
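
The win_dep branch of tcp_rcv_rtt_update() keeps its estimate scaled by 8, but the old code compared the raw sample m against that scaled value, so the test nearly always passed and every window-limited sample overwrote the estimate; shifting m left by 3 first restores the intended take-the-lower-value behaviour. A simplified standalone model of the estimator (rcv_rtt_update() here is an illustrative stand-in, not the kernel function):

#include <stdio.h>

/* Simplified model of tcp_rcv_rtt_update(): est is kept scaled << 3. */
static void rcv_rtt_update(unsigned int *est, unsigned int m, int win_dep)
{
	if (m == 0)
		m = 1;

	if (*est != 0) {
		if (!win_dep) {
			/* EWMA with gain 1/8: est += m - est/8 (scaled form). */
			long delta = (long)m - (long)(*est >> 3);
			*est += delta;
		} else {
			/* Window-dependent sample: only lower the estimate.
			 * Scale m by 8 first, as the fix above does. */
			m <<= 3;
			if (m < *est)
				*est = m;
		}
	} else {
		*est = m << 3;		/* first measurement */
	}
}

int main(void)
{
	unsigned int est = 0;

	rcv_rtt_update(&est, 40, 1);	/* first sample: 40 ticks */
	rcv_rtt_update(&est, 30, 1);	/* lower win_dep sample is taken */
	rcv_rtt_update(&est, 50, 0);	/* regular EWMA step */
	printf("rcv_rtt estimate ~ %u ticks\n", est >> 3);
	return 0;
}
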
@@ -902,6 +908,7 @@ static void tcp_init_metrics(struct sock *sk)
902 if (dst_metric(dst, RTAX_REORDERING) && 908 if (dst_metric(dst, RTAX_REORDERING) &&
903 tp->reordering != dst_metric(dst, RTAX_REORDERING)) { 909 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
904 tcp_disable_fack(tp); 910 tcp_disable_fack(tp);
911 tcp_disable_early_retrans(tp);
905 tp->reordering = dst_metric(dst, RTAX_REORDERING); 912 tp->reordering = dst_metric(dst, RTAX_REORDERING);
906 } 913 }
907 914
@@ -933,7 +940,7 @@ static void tcp_init_metrics(struct sock *sk)
933 tcp_set_rto(sk); 940 tcp_set_rto(sk);
934reset: 941reset:
935 if (tp->srtt == 0) { 942 if (tp->srtt == 0) {
936 /* RFC2988bis: We've failed to get a valid RTT sample from 943 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
937 * 3WHS. This is most likely due to retransmission, 944 * 3WHS. This is most likely due to retransmission,
938 * including spurious one. Reset the RTO back to 3secs 945 * including spurious one. Reset the RTO back to 3secs
939 * from the more aggressive 1sec to avoid more spurious 946 * from the more aggressive 1sec to avoid more spurious
@@ -943,7 +950,7 @@ reset:
943 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; 950 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
944 } 951 }
945 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 952 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
946 * retransmitted. In light of RFC2988bis' more aggressive 1sec 953 * retransmitted. In light of RFC6298 more aggressive 1sec
947 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK 954 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
948 * retransmission has occurred. 955 * retransmission has occurred.
949 */ 956 */
@@ -975,15 +982,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
975 982
976 NET_INC_STATS_BH(sock_net(sk), mib_idx); 983 NET_INC_STATS_BH(sock_net(sk), mib_idx);
977#if FASTRETRANS_DEBUG > 1 984#if FASTRETRANS_DEBUG > 1
978 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", 985 pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
979 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, 986 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
980 tp->reordering, 987 tp->reordering,
981 tp->fackets_out, 988 tp->fackets_out,
982 tp->sacked_out, 989 tp->sacked_out,
983 tp->undo_marker ? tp->undo_retrans : 0); 990 tp->undo_marker ? tp->undo_retrans : 0);
984#endif 991#endif
985 tcp_disable_fack(tp); 992 tcp_disable_fack(tp);
986 } 993 }
994
995 if (metric > 0)
996 tcp_disable_early_retrans(tp);
987} 997}
988 998
989/* This must be called before lost_out is incremented */ 999/* This must be called before lost_out is incremented */
@@ -1114,36 +1124,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
1114 * the exact amount is rather hard to quantify. However, tp->max_window can 1124 * the exact amount is rather hard to quantify. However, tp->max_window can
1115 * be used as an exaggerated estimate. 1125 * be used as an exaggerated estimate.
1116 */ 1126 */
1117static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, 1127static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
1118 u32 start_seq, u32 end_seq) 1128 u32 start_seq, u32 end_seq)
1119{ 1129{
1120 /* Too far in future, or reversed (interpretation is ambiguous) */ 1130 /* Too far in future, or reversed (interpretation is ambiguous) */
1121 if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) 1131 if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
1122 return 0; 1132 return false;
1123 1133
1124 /* Nasty start_seq wrap-around check (see comments above) */ 1134 /* Nasty start_seq wrap-around check (see comments above) */
1125 if (!before(start_seq, tp->snd_nxt)) 1135 if (!before(start_seq, tp->snd_nxt))
1126 return 0; 1136 return false;
1127 1137
1128 /* In outstanding window? ...This is valid exit for D-SACKs too. 1138 /* In outstanding window? ...This is valid exit for D-SACKs too.
1129 * start_seq == snd_una is non-sensical (see comments above) 1139 * start_seq == snd_una is non-sensical (see comments above)
1130 */ 1140 */
1131 if (after(start_seq, tp->snd_una)) 1141 if (after(start_seq, tp->snd_una))
1132 return 1; 1142 return true;
1133 1143
1134 if (!is_dsack || !tp->undo_marker) 1144 if (!is_dsack || !tp->undo_marker)
1135 return 0; 1145 return false;
1136 1146
1137 /* ...Then it's D-SACK, and must reside below snd_una completely */ 1147 /* ...Then it's D-SACK, and must reside below snd_una completely */
1138 if (after(end_seq, tp->snd_una)) 1148 if (after(end_seq, tp->snd_una))
1139 return 0; 1149 return false;
1140 1150
1141 if (!before(start_seq, tp->undo_marker)) 1151 if (!before(start_seq, tp->undo_marker))
1142 return 1; 1152 return true;
1143 1153
1144 /* Too old */ 1154 /* Too old */
1145 if (!after(end_seq, tp->undo_marker)) 1155 if (!after(end_seq, tp->undo_marker))
1146 return 0; 1156 return false;
1147 1157
1148 /* Undo_marker boundary crossing (overestimates a lot). Known already: 1158 /* Undo_marker boundary crossing (overestimates a lot). Known already:
1149 * start_seq < undo_marker and end_seq >= undo_marker. 1159 * start_seq < undo_marker and end_seq >= undo_marker.
@@ -1215,17 +1225,17 @@ static void tcp_mark_lost_retrans(struct sock *sk)
1215 tp->lost_retrans_low = new_low_seq; 1225 tp->lost_retrans_low = new_low_seq;
1216} 1226}
1217 1227
1218static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, 1228static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1219 struct tcp_sack_block_wire *sp, int num_sacks, 1229 struct tcp_sack_block_wire *sp, int num_sacks,
1220 u32 prior_snd_una) 1230 u32 prior_snd_una)
1221{ 1231{
1222 struct tcp_sock *tp = tcp_sk(sk); 1232 struct tcp_sock *tp = tcp_sk(sk);
1223 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); 1233 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
1224 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); 1234 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1225 int dup_sack = 0; 1235 bool dup_sack = false;
1226 1236
1227 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { 1237 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1228 dup_sack = 1; 1238 dup_sack = true;
1229 tcp_dsack_seen(tp); 1239 tcp_dsack_seen(tp);
1230 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); 1240 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1231 } else if (num_sacks > 1) { 1241 } else if (num_sacks > 1) {
@@ -1234,7 +1244,7 @@ static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1234 1244
1235 if (!after(end_seq_0, end_seq_1) && 1245 if (!after(end_seq_0, end_seq_1) &&
1236 !before(start_seq_0, start_seq_1)) { 1246 !before(start_seq_0, start_seq_1)) {
1237 dup_sack = 1; 1247 dup_sack = true;
1238 tcp_dsack_seen(tp); 1248 tcp_dsack_seen(tp);
1239 NET_INC_STATS_BH(sock_net(sk), 1249 NET_INC_STATS_BH(sock_net(sk),
1240 LINUX_MIB_TCPDSACKOFORECV); 1250 LINUX_MIB_TCPDSACKOFORECV);
@@ -1265,9 +1275,10 @@ struct tcp_sacktag_state {
1265 * FIXME: this could be merged to shift decision code 1275 * FIXME: this could be merged to shift decision code
1266 */ 1276 */
1267static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, 1277static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1268 u32 start_seq, u32 end_seq) 1278 u32 start_seq, u32 end_seq)
1269{ 1279{
1270 int in_sack, err; 1280 int err;
1281 bool in_sack;
1271 unsigned int pkt_len; 1282 unsigned int pkt_len;
1272 unsigned int mss; 1283 unsigned int mss;
1273 1284
@@ -1313,7 +1324,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1313static u8 tcp_sacktag_one(struct sock *sk, 1324static u8 tcp_sacktag_one(struct sock *sk,
1314 struct tcp_sacktag_state *state, u8 sacked, 1325 struct tcp_sacktag_state *state, u8 sacked,
1315 u32 start_seq, u32 end_seq, 1326 u32 start_seq, u32 end_seq,
1316 int dup_sack, int pcount) 1327 bool dup_sack, int pcount)
1317{ 1328{
1318 struct tcp_sock *tp = tcp_sk(sk); 1329 struct tcp_sock *tp = tcp_sk(sk);
1319 int fack_count = state->fack_count; 1330 int fack_count = state->fack_count;
@@ -1393,10 +1404,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
1393/* Shift newly-SACKed bytes from this skb to the immediately previous 1404/* Shift newly-SACKed bytes from this skb to the immediately previous
1394 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. 1405 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
1395 */ 1406 */
1396static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, 1407static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1397 struct tcp_sacktag_state *state, 1408 struct tcp_sacktag_state *state,
1398 unsigned int pcount, int shifted, int mss, 1409 unsigned int pcount, int shifted, int mss,
1399 int dup_sack) 1410 bool dup_sack)
1400{ 1411{
1401 struct tcp_sock *tp = tcp_sk(sk); 1412 struct tcp_sock *tp = tcp_sk(sk);
1402 struct sk_buff *prev = tcp_write_queue_prev(sk, skb); 1413 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
@@ -1446,7 +1457,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1446 if (skb->len > 0) { 1457 if (skb->len > 0) {
1447 BUG_ON(!tcp_skb_pcount(skb)); 1458 BUG_ON(!tcp_skb_pcount(skb));
1448 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); 1459 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1449 return 0; 1460 return false;
1450 } 1461 }
1451 1462
1452 /* Whole SKB was eaten :-) */ 1463 /* Whole SKB was eaten :-) */
@@ -1469,7 +1480,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1469 1480
1470 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); 1481 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
1471 1482
1472 return 1; 1483 return true;
1473} 1484}
1474 1485
1475/* I wish gso_size would have a bit more sane initialization than 1486/* I wish gso_size would have a bit more sane initialization than
@@ -1492,7 +1503,7 @@ static int skb_can_shift(const struct sk_buff *skb)
1492static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, 1503static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1493 struct tcp_sacktag_state *state, 1504 struct tcp_sacktag_state *state,
1494 u32 start_seq, u32 end_seq, 1505 u32 start_seq, u32 end_seq,
1495 int dup_sack) 1506 bool dup_sack)
1496{ 1507{
1497 struct tcp_sock *tp = tcp_sk(sk); 1508 struct tcp_sock *tp = tcp_sk(sk);
1498 struct sk_buff *prev; 1509 struct sk_buff *prev;
@@ -1631,14 +1642,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1631 struct tcp_sack_block *next_dup, 1642 struct tcp_sack_block *next_dup,
1632 struct tcp_sacktag_state *state, 1643 struct tcp_sacktag_state *state,
1633 u32 start_seq, u32 end_seq, 1644 u32 start_seq, u32 end_seq,
1634 int dup_sack_in) 1645 bool dup_sack_in)
1635{ 1646{
1636 struct tcp_sock *tp = tcp_sk(sk); 1647 struct tcp_sock *tp = tcp_sk(sk);
1637 struct sk_buff *tmp; 1648 struct sk_buff *tmp;
1638 1649
1639 tcp_for_write_queue_from(skb, sk) { 1650 tcp_for_write_queue_from(skb, sk) {
1640 int in_sack = 0; 1651 int in_sack = 0;
1641 int dup_sack = dup_sack_in; 1652 bool dup_sack = dup_sack_in;
1642 1653
1643 if (skb == tcp_send_head(sk)) 1654 if (skb == tcp_send_head(sk))
1644 break; 1655 break;
@@ -1653,7 +1664,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1653 next_dup->start_seq, 1664 next_dup->start_seq,
1654 next_dup->end_seq); 1665 next_dup->end_seq);
1655 if (in_sack > 0) 1666 if (in_sack > 0)
1656 dup_sack = 1; 1667 dup_sack = true;
1657 } 1668 }
1658 1669
1659 /* skb reference here is a bit tricky to get right, since 1670 /* skb reference here is a bit tricky to get right, since
@@ -1758,7 +1769,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1758 struct sk_buff *skb; 1769 struct sk_buff *skb;
1759 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); 1770 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1760 int used_sacks; 1771 int used_sacks;
1761 int found_dup_sack = 0; 1772 bool found_dup_sack = false;
1762 int i, j; 1773 int i, j;
1763 int first_sack_index; 1774 int first_sack_index;
1764 1775
@@ -1789,7 +1800,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1789 used_sacks = 0; 1800 used_sacks = 0;
1790 first_sack_index = 0; 1801 first_sack_index = 0;
1791 for (i = 0; i < num_sacks; i++) { 1802 for (i = 0; i < num_sacks; i++) {
1792 int dup_sack = !i && found_dup_sack; 1803 bool dup_sack = !i && found_dup_sack;
1793 1804
1794 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); 1805 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
1795 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); 1806 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
@@ -1856,7 +1867,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1856 while (i < used_sacks) { 1867 while (i < used_sacks) {
1857 u32 start_seq = sp[i].start_seq; 1868 u32 start_seq = sp[i].start_seq;
1858 u32 end_seq = sp[i].end_seq; 1869 u32 end_seq = sp[i].end_seq;
1859 int dup_sack = (found_dup_sack && (i == first_sack_index)); 1870 bool dup_sack = (found_dup_sack && (i == first_sack_index));
1860 struct tcp_sack_block *next_dup = NULL; 1871 struct tcp_sack_block *next_dup = NULL;
1861 1872
1862 if (found_dup_sack && ((i + 1) == first_sack_index)) 1873 if (found_dup_sack && ((i + 1) == first_sack_index))
@@ -1958,9 +1969,9 @@ out:
1958} 1969}
1959 1970
1960/* Limits sacked_out so that sum with lost_out isn't ever larger than 1971/* Limits sacked_out so that sum with lost_out isn't ever larger than
1961 * packets_out. Returns zero if sacked_out adjustment wasn't necessary. 1972 * packets_out. Returns false if sacked_out adjustment wasn't necessary.
1962 */ 1973 */
1963static int tcp_limit_reno_sacked(struct tcp_sock *tp) 1974static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
1964{ 1975{
1965 u32 holes; 1976 u32 holes;
1966 1977
@@ -1969,9 +1980,9 @@ static int tcp_limit_reno_sacked(struct tcp_sock *tp)
1969 1980
1970 if ((tp->sacked_out + holes) > tp->packets_out) { 1981 if ((tp->sacked_out + holes) > tp->packets_out) {
1971 tp->sacked_out = tp->packets_out - holes; 1982 tp->sacked_out = tp->packets_out - holes;
1972 return 1; 1983 return true;
1973 } 1984 }
1974 return 0; 1985 return false;
1975} 1986}
1976 1987
1977/* If we receive more dupacks than we expected counting segments 1988/* If we receive more dupacks than we expected counting segments
@@ -2025,40 +2036,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp)
2025/* F-RTO can only be used if TCP has never retransmitted anything other than 2036/* F-RTO can only be used if TCP has never retransmitted anything other than
2026 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) 2037 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
2027 */ 2038 */
2028int tcp_use_frto(struct sock *sk) 2039bool tcp_use_frto(struct sock *sk)
2029{ 2040{
2030 const struct tcp_sock *tp = tcp_sk(sk); 2041 const struct tcp_sock *tp = tcp_sk(sk);
2031 const struct inet_connection_sock *icsk = inet_csk(sk); 2042 const struct inet_connection_sock *icsk = inet_csk(sk);
2032 struct sk_buff *skb; 2043 struct sk_buff *skb;
2033 2044
2034 if (!sysctl_tcp_frto) 2045 if (!sysctl_tcp_frto)
2035 return 0; 2046 return false;
2036 2047
2037 /* MTU probe and F-RTO won't really play nicely along currently */ 2048 /* MTU probe and F-RTO won't really play nicely along currently */
2038 if (icsk->icsk_mtup.probe_size) 2049 if (icsk->icsk_mtup.probe_size)
2039 return 0; 2050 return false;
2040 2051
2041 if (tcp_is_sackfrto(tp)) 2052 if (tcp_is_sackfrto(tp))
2042 return 1; 2053 return true;
2043 2054
2044 /* Avoid expensive walking of rexmit queue if possible */ 2055 /* Avoid expensive walking of rexmit queue if possible */
2045 if (tp->retrans_out > 1) 2056 if (tp->retrans_out > 1)
2046 return 0; 2057 return false;
2047 2058
2048 skb = tcp_write_queue_head(sk); 2059 skb = tcp_write_queue_head(sk);
2049 if (tcp_skb_is_last(sk, skb)) 2060 if (tcp_skb_is_last(sk, skb))
2050 return 1; 2061 return true;
2051 skb = tcp_write_queue_next(sk, skb); /* Skips head */ 2062 skb = tcp_write_queue_next(sk, skb); /* Skips head */
2052 tcp_for_write_queue_from(skb, sk) { 2063 tcp_for_write_queue_from(skb, sk) {
2053 if (skb == tcp_send_head(sk)) 2064 if (skb == tcp_send_head(sk))
2054 break; 2065 break;
2055 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) 2066 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
2056 return 0; 2067 return false;
2057 /* Short-circuit when first non-SACKed skb has been checked */ 2068 /* Short-circuit when first non-SACKed skb has been checked */
2058 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 2069 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2059 break; 2070 break;
2060 } 2071 }
2061 return 1; 2072 return true;
2062} 2073}
2063 2074
2064/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO 2075/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
@@ -2294,7 +2305,7 @@ void tcp_enter_loss(struct sock *sk, int how)
2294 * 2305 *
2295 * Do processing similar to RTO timeout. 2306 * Do processing similar to RTO timeout.
2296 */ 2307 */
2297static int tcp_check_sack_reneging(struct sock *sk, int flag) 2308static bool tcp_check_sack_reneging(struct sock *sk, int flag)
2298{ 2309{
2299 if (flag & FLAG_SACK_RENEGING) { 2310 if (flag & FLAG_SACK_RENEGING) {
2300 struct inet_connection_sock *icsk = inet_csk(sk); 2311 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2305,9 +2316,9 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag)
2305 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); 2316 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
2306 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2317 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2307 icsk->icsk_rto, TCP_RTO_MAX); 2318 icsk->icsk_rto, TCP_RTO_MAX);
2308 return 1; 2319 return true;
2309 } 2320 }
2310 return 0; 2321 return false;
2311} 2322}
2312 2323
2313static inline int tcp_fackets_out(const struct tcp_sock *tp) 2324static inline int tcp_fackets_out(const struct tcp_sock *tp)
@@ -2335,6 +2346,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2335 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2346 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2336} 2347}
2337 2348
2349static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2350{
2351 struct tcp_sock *tp = tcp_sk(sk);
2352 unsigned long delay;
2353
2354 /* Delay early retransmit and entering fast recovery for
2350 2355 * max(RTT/4, 2msec) unless the ACK has the ECE mark, no RTT sample
2350 2356 * is available, or the RTO is scheduled to fire first.
2357 */
2358 if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
2359 return false;
2360
2361 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
2362 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2363 return false;
2364
2365 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
2366 tp->early_retrans_delayed = 1;
2367 return true;
2368}
2369
2338static inline int tcp_skb_timedout(const struct sock *sk, 2370static inline int tcp_skb_timedout(const struct sock *sk,
2339 const struct sk_buff *skb) 2371 const struct sk_buff *skb)
2340{ 2372{
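
The delay chosen by tcp_pause_early_retransmit() follows from how the kernel stores the smoothed RTT: srtt is kept scaled by 8, so srtt >> 5 is RTT/4, and the result is floored at 2 ms and only used if it would fire before the pending RTO. A hedged standalone sketch of that arithmetic (illustrative names, milliseconds instead of jiffies):

#include <stdio.h>

static long er_delay_ms(long srtt8_ms)
{
	long delay = srtt8_ms >> 5;	/* srtt is scaled by 8, so >>5 is RTT/4 */

	return delay > 2 ? delay : 2;	/* never below 2 ms */
}

/* Pause early retransmit only if the delay would expire before the RTO. */
static int should_pause(long now_ms, long rto_deadline_ms, long srtt8_ms)
{
	return now_ms + er_delay_ms(srtt8_ms) < rto_deadline_ms;
}

int main(void)
{
	/* RTT = 40 ms -> srtt8 = 320 -> delay = 10 ms */
	printf("delay=%ldms pause=%d\n", er_delay_ms(320),
	       should_pause(0, 200, 320));
	return 0;
}
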
@@ -2442,28 +2474,28 @@ static inline int tcp_head_timedout(const struct sock *sk)
2442 * Main question: may we further continue forward transmission 2474 * Main question: may we further continue forward transmission
2443 * with the same cwnd? 2475 * with the same cwnd?
2444 */ 2476 */
2445static int tcp_time_to_recover(struct sock *sk) 2477static bool tcp_time_to_recover(struct sock *sk, int flag)
2446{ 2478{
2447 struct tcp_sock *tp = tcp_sk(sk); 2479 struct tcp_sock *tp = tcp_sk(sk);
2448 __u32 packets_out; 2480 __u32 packets_out;
2449 2481
2450 /* Do not perform any recovery during F-RTO algorithm */ 2482 /* Do not perform any recovery during F-RTO algorithm */
2451 if (tp->frto_counter) 2483 if (tp->frto_counter)
2452 return 0; 2484 return false;
2453 2485
2454 /* Trick#1: The loss is proven. */ 2486 /* Trick#1: The loss is proven. */
2455 if (tp->lost_out) 2487 if (tp->lost_out)
2456 return 1; 2488 return true;
2457 2489
2458 /* Not-A-Trick#2 : Classic rule... */ 2490 /* Not-A-Trick#2 : Classic rule... */
2459 if (tcp_dupack_heuristics(tp) > tp->reordering) 2491 if (tcp_dupack_heuristics(tp) > tp->reordering)
2460 return 1; 2492 return true;
2461 2493
2462 /* Trick#3 : when we use RFC2988 timer restart, fast 2494 /* Trick#3 : when we use RFC2988 timer restart, fast
2463 * retransmit can be triggered by timeout of queue head. 2495 * retransmit can be triggered by timeout of queue head.
2464 */ 2496 */
2465 if (tcp_is_fack(tp) && tcp_head_timedout(sk)) 2497 if (tcp_is_fack(tp) && tcp_head_timedout(sk))
2466 return 1; 2498 return true;
2467 2499
2468 /* Trick#4: It is still not OK... But will it be useful to delay 2500 /* Trick#4: It is still not OK... But will it be useful to delay
2469 * recovery more? 2501 * recovery more?
@@ -2475,7 +2507,7 @@ static int tcp_time_to_recover(struct sock *sk)
2475 /* We have nothing to send. This connection is limited 2507 /* We have nothing to send. This connection is limited
2476 * either by receiver window or by application. 2508 * either by receiver window or by application.
2477 */ 2509 */
2478 return 1; 2510 return true;
2479 } 2511 }
2480 2512
2481 /* If a thin stream is detected, retransmit after first 2513 /* If a thin stream is detected, retransmit after first
@@ -2486,9 +2518,19 @@ static int tcp_time_to_recover(struct sock *sk)
2486 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && 2518 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2487 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && 2519 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2488 tcp_is_sack(tp) && !tcp_send_head(sk)) 2520 tcp_is_sack(tp) && !tcp_send_head(sk))
2489 return 1; 2521 return true;
2490 2522
2491 return 0; 2523 /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
2524 * retransmissions due to small network reorderings, we implement
2525 * Mitigation A.3 in the RFC and delay the retransmission for a short
2526 * interval if appropriate.
2527 */
2528 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2529 (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
2530 !tcp_may_send_now(sk))
2531 return !tcp_pause_early_retransmit(sk, flag);
2532
2533 return false;
2492} 2534}
2493 2535
2494/* New heuristics: it is possible only after we switched to restart timer 2536/* New heuristics: it is possible only after we switched to restart timer
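
Trick#6 fires only in the narrow situation RFC 5827 targets: fewer than four segments in flight, all but one of them SACKed, nothing retransmitted yet and no new data that could generate further dupacks, so the classic three-dupack threshold can never be reached. A small sketch of that predicate (illustrative, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

static bool early_retransmit_candidate(unsigned packets_out,
				       unsigned sacked_out,
				       unsigned retrans_out,
				       bool can_send_new_data)
{
	return retrans_out == 0 &&		/* nothing retransmitted yet */
	       sacked_out > 0 &&		/* at least one SACKed segment */
	       packets_out == sacked_out + 1 &&	/* exactly one hole */
	       packets_out < 4 &&		/* too few segments for 3 dupacks */
	       !can_send_new_data;		/* no new data to trigger more dupacks */
}

int main(void)
{
	/* three segments outstanding, two SACKed, nothing new to send */
	printf("%d\n", early_retransmit_candidate(3, 2, 0, false));
	return 0;
}
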
@@ -2676,22 +2718,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2676 struct inet_sock *inet = inet_sk(sk); 2718 struct inet_sock *inet = inet_sk(sk);
2677 2719
2678 if (sk->sk_family == AF_INET) { 2720 if (sk->sk_family == AF_INET) {
2679 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", 2721 pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2680 msg, 2722 msg,
2681 &inet->inet_daddr, ntohs(inet->inet_dport), 2723 &inet->inet_daddr, ntohs(inet->inet_dport),
2682 tp->snd_cwnd, tcp_left_out(tp), 2724 tp->snd_cwnd, tcp_left_out(tp),
2683 tp->snd_ssthresh, tp->prior_ssthresh, 2725 tp->snd_ssthresh, tp->prior_ssthresh,
2684 tp->packets_out); 2726 tp->packets_out);
2685 } 2727 }
2686#if IS_ENABLED(CONFIG_IPV6) 2728#if IS_ENABLED(CONFIG_IPV6)
2687 else if (sk->sk_family == AF_INET6) { 2729 else if (sk->sk_family == AF_INET6) {
2688 struct ipv6_pinfo *np = inet6_sk(sk); 2730 struct ipv6_pinfo *np = inet6_sk(sk);
2689 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", 2731 pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2690 msg, 2732 msg,
2691 &np->daddr, ntohs(inet->inet_dport), 2733 &np->daddr, ntohs(inet->inet_dport),
2692 tp->snd_cwnd, tcp_left_out(tp), 2734 tp->snd_cwnd, tcp_left_out(tp),
2693 tp->snd_ssthresh, tp->prior_ssthresh, 2735 tp->snd_ssthresh, tp->prior_ssthresh,
2694 tp->packets_out); 2736 tp->packets_out);
2695 } 2737 }
2696#endif 2738#endif
2697} 2739}
@@ -2727,7 +2769,7 @@ static inline int tcp_may_undo(const struct tcp_sock *tp)
2727} 2769}
2728 2770
2729/* People celebrate: "We love our President!" */ 2771/* People celebrate: "We love our President!" */
2730static int tcp_try_undo_recovery(struct sock *sk) 2772static bool tcp_try_undo_recovery(struct sock *sk)
2731{ 2773{
2732 struct tcp_sock *tp = tcp_sk(sk); 2774 struct tcp_sock *tp = tcp_sk(sk);
2733 2775
@@ -2752,10 +2794,10 @@ static int tcp_try_undo_recovery(struct sock *sk)
2752 * is ACKed. For Reno it is MUST to prevent false 2794 * is ACKed. For Reno it is MUST to prevent false
2753 * fast retransmits (RFC2582). SACK TCP is safe. */ 2795 * fast retransmits (RFC2582). SACK TCP is safe. */
2754 tcp_moderate_cwnd(tp); 2796 tcp_moderate_cwnd(tp);
2755 return 1; 2797 return true;
2756 } 2798 }
2757 tcp_set_ca_state(sk, TCP_CA_Open); 2799 tcp_set_ca_state(sk, TCP_CA_Open);
2758 return 0; 2800 return false;
2759} 2801}
2760 2802
2761/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ 2803/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
@@ -2785,19 +2827,19 @@ static void tcp_try_undo_dsack(struct sock *sk)
2785 * that successive retransmissions of a segment must not advance 2827 * that successive retransmissions of a segment must not advance
2786 * retrans_stamp under any conditions. 2828 * retrans_stamp under any conditions.
2787 */ 2829 */
2788static int tcp_any_retrans_done(const struct sock *sk) 2830static bool tcp_any_retrans_done(const struct sock *sk)
2789{ 2831{
2790 const struct tcp_sock *tp = tcp_sk(sk); 2832 const struct tcp_sock *tp = tcp_sk(sk);
2791 struct sk_buff *skb; 2833 struct sk_buff *skb;
2792 2834
2793 if (tp->retrans_out) 2835 if (tp->retrans_out)
2794 return 1; 2836 return true;
2795 2837
2796 skb = tcp_write_queue_head(sk); 2838 skb = tcp_write_queue_head(sk);
2797 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) 2839 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2798 return 1; 2840 return true;
2799 2841
2800 return 0; 2842 return false;
2801} 2843}
2802 2844
2803/* Undo during fast recovery after partial ACK. */ 2845/* Undo during fast recovery after partial ACK. */
@@ -2831,7 +2873,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2831} 2873}
2832 2874
2833/* Undo during loss recovery after partial ACK. */ 2875/* Undo during loss recovery after partial ACK. */
2834static int tcp_try_undo_loss(struct sock *sk) 2876static bool tcp_try_undo_loss(struct sock *sk)
2835{ 2877{
2836 struct tcp_sock *tp = tcp_sk(sk); 2878 struct tcp_sock *tp = tcp_sk(sk);
2837 2879
@@ -2853,9 +2895,9 @@ static int tcp_try_undo_loss(struct sock *sk)
2853 tp->undo_marker = 0; 2895 tp->undo_marker = 0;
2854 if (tcp_is_sack(tp)) 2896 if (tcp_is_sack(tp))
2855 tcp_set_ca_state(sk, TCP_CA_Open); 2897 tcp_set_ca_state(sk, TCP_CA_Open);
2856 return 1; 2898 return true;
2857 } 2899 }
2858 return 0; 2900 return false;
2859} 2901}
2860 2902
2861static inline void tcp_complete_cwr(struct sock *sk) 2903static inline void tcp_complete_cwr(struct sock *sk)
@@ -2864,11 +2906,14 @@ static inline void tcp_complete_cwr(struct sock *sk)
2864 2906
2865 /* Do not moderate cwnd if it's already undone in cwr or recovery. */ 2907 /* Do not moderate cwnd if it's already undone in cwr or recovery. */
2866 if (tp->undo_marker) { 2908 if (tp->undo_marker) {
2867 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) 2909 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
2868 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 2910 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
2869 else /* PRR */ 2911 tp->snd_cwnd_stamp = tcp_time_stamp;
2912 } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
2913 /* PRR algorithm. */
2870 tp->snd_cwnd = tp->snd_ssthresh; 2914 tp->snd_cwnd = tp->snd_ssthresh;
2871 tp->snd_cwnd_stamp = tcp_time_stamp; 2915 tp->snd_cwnd_stamp = tcp_time_stamp;
2916 }
2872 } 2917 }
2873 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2918 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2874} 2919}
@@ -3018,6 +3063,38 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
3018 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; 3063 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
3019} 3064}
3020 3065
3066static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
3067{
3068 struct tcp_sock *tp = tcp_sk(sk);
3069 int mib_idx;
3070
3071 if (tcp_is_reno(tp))
3072 mib_idx = LINUX_MIB_TCPRENORECOVERY;
3073 else
3074 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
3075
3076 NET_INC_STATS_BH(sock_net(sk), mib_idx);
3077
3078 tp->high_seq = tp->snd_nxt;
3079 tp->prior_ssthresh = 0;
3080 tp->undo_marker = tp->snd_una;
3081 tp->undo_retrans = tp->retrans_out;
3082
3083 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
3084 if (!ece_ack)
3085 tp->prior_ssthresh = tcp_current_ssthresh(sk);
3086 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
3087 TCP_ECN_queue_cwr(tp);
3088 }
3089
3090 tp->bytes_acked = 0;
3091 tp->snd_cwnd_cnt = 0;
3092 tp->prior_cwnd = tp->snd_cwnd;
3093 tp->prr_delivered = 0;
3094 tp->prr_out = 0;
3095 tcp_set_ca_state(sk, TCP_CA_Recovery);
3096}
3097
3021/* Process an event, which can update packets-in-flight not trivially. 3098/* Process an event, which can update packets-in-flight not trivially.
3022 * Main goal of this function is to calculate new estimate for left_out, 3099 * Main goal of this function is to calculate new estimate for left_out,
3023 * taking into account both packets sitting in receiver's buffer and 3100 * taking into account both packets sitting in receiver's buffer and
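
tcp_enter_recovery() gathers the state that used to be set inline in tcp_fastretrans_alert(): it remembers the pre-recovery cwnd and undo markers, zeroes the PRR counters, and lowers ssthresh through the congestion-control hook unless the connection is already in CWR, skipping prior_ssthresh on an ECE-marked ACK so the reduction cannot be undone. A simplified standalone sketch of that bookkeeping, using a Reno-style ssthresh as a stand-in for ca_ops->ssthresh() (names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct cc_state {
	unsigned cwnd, ssthresh, prior_cwnd, prior_ssthresh;
	unsigned prr_delivered, prr_out;
	bool in_cwr_or_recovery;
};

/* Stand-in for the congestion control module's ssthresh() hook. */
static unsigned reno_ssthresh(const struct cc_state *s)
{
	return s->cwnd / 2 > 2 ? s->cwnd / 2 : 2;
}

static void enter_recovery(struct cc_state *s, bool ece_ack)
{
	s->prior_ssthresh = 0;
	if (!s->in_cwr_or_recovery) {
		if (!ece_ack)
			s->prior_ssthresh = s->ssthresh;  /* allows a later undo */
		s->ssthresh = reno_ssthresh(s);
	}
	s->prior_cwnd = s->cwnd;	/* PRR works against this baseline */
	s->prr_delivered = 0;
	s->prr_out = 0;
	s->in_cwr_or_recovery = true;
}

int main(void)
{
	struct cc_state s = { .cwnd = 20, .ssthresh = ~0u };

	enter_recovery(&s, false);
	printf("ssthresh=%u prior_cwnd=%u\n", s.ssthresh, s.prior_cwnd);
	return 0;
}
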
@@ -3037,7 +3114,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3037 struct tcp_sock *tp = tcp_sk(sk); 3114 struct tcp_sock *tp = tcp_sk(sk);
3038 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && 3115 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
3039 (tcp_fackets_out(tp) > tp->reordering)); 3116 (tcp_fackets_out(tp) > tp->reordering));
3040 int fast_rexmit = 0, mib_idx; 3117 int fast_rexmit = 0;
3041 3118
3042 if (WARN_ON(!tp->packets_out && tp->sacked_out)) 3119 if (WARN_ON(!tp->packets_out && tp->sacked_out))
3043 tp->sacked_out = 0; 3120 tp->sacked_out = 0;
@@ -3121,7 +3198,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3121 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 3198 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
3122 tcp_try_undo_dsack(sk); 3199 tcp_try_undo_dsack(sk);
3123 3200
3124 if (!tcp_time_to_recover(sk)) { 3201 if (!tcp_time_to_recover(sk, flag)) {
3125 tcp_try_to_open(sk, flag); 3202 tcp_try_to_open(sk, flag);
3126 return; 3203 return;
3127 } 3204 }
@@ -3138,32 +3215,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3138 } 3215 }
3139 3216
3140 /* Otherwise enter Recovery state */ 3217 /* Otherwise enter Recovery state */
3141 3218 tcp_enter_recovery(sk, (flag & FLAG_ECE));
3142 if (tcp_is_reno(tp))
3143 mib_idx = LINUX_MIB_TCPRENORECOVERY;
3144 else
3145 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
3146
3147 NET_INC_STATS_BH(sock_net(sk), mib_idx);
3148
3149 tp->high_seq = tp->snd_nxt;
3150 tp->prior_ssthresh = 0;
3151 tp->undo_marker = tp->snd_una;
3152 tp->undo_retrans = tp->retrans_out;
3153
3154 if (icsk->icsk_ca_state < TCP_CA_CWR) {
3155 if (!(flag & FLAG_ECE))
3156 tp->prior_ssthresh = tcp_current_ssthresh(sk);
3157 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
3158 TCP_ECN_queue_cwr(tp);
3159 }
3160
3161 tp->bytes_acked = 0;
3162 tp->snd_cwnd_cnt = 0;
3163 tp->prior_cwnd = tp->snd_cwnd;
3164 tp->prr_delivered = 0;
3165 tp->prr_out = 0;
3166 tcp_set_ca_state(sk, TCP_CA_Recovery);
3167 fast_rexmit = 1; 3219 fast_rexmit = 1;
3168 } 3220 }
3169 3221
@@ -3245,16 +3297,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
3245/* Restart timer after forward progress on connection. 3297/* Restart timer after forward progress on connection.
3246 * RFC2988 recommends to restart timer to now+rto. 3298 * RFC2988 recommends to restart timer to now+rto.
3247 */ 3299 */
3248static void tcp_rearm_rto(struct sock *sk) 3300void tcp_rearm_rto(struct sock *sk)
3249{ 3301{
3250 const struct tcp_sock *tp = tcp_sk(sk); 3302 struct tcp_sock *tp = tcp_sk(sk);
3251 3303
3252 if (!tp->packets_out) { 3304 if (!tp->packets_out) {
3253 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 3305 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3254 } else { 3306 } else {
3255 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 3307 u32 rto = inet_csk(sk)->icsk_rto;
3256 inet_csk(sk)->icsk_rto, TCP_RTO_MAX); 3308 /* Offset the time elapsed after installing regular RTO */
3309 if (tp->early_retrans_delayed) {
3310 struct sk_buff *skb = tcp_write_queue_head(sk);
3311 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
3312 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3313 /* delta may not be positive if the socket is locked
3314 * when the delayed ER timer fires and is rescheduled.
3315 */
3316 if (delta > 0)
3317 rto = delta;
3318 }
3319 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3320 TCP_RTO_MAX);
3257 } 3321 }
3322 tp->early_retrans_delayed = 0;
3323}
3324
3325/* This function is called when the delayed ER timer fires. TCP enters
3326 * fast recovery and performs fast-retransmit.
3327 */
3328void tcp_resume_early_retransmit(struct sock *sk)
3329{
3330 struct tcp_sock *tp = tcp_sk(sk);
3331
3332 tcp_rearm_rto(sk);
3333
3334 /* Stop if ER is disabled after the delayed ER timer is scheduled */
3335 if (!tp->do_early_retrans)
3336 return;
3337
3338 tcp_enter_recovery(sk, false);
3339 tcp_update_scoreboard(sk, 1);
3340 tcp_xmit_retransmit_queue(sk);
3258} 3341}
3259 3342
3260/* If we get here, the whole TSO packet has not been acked. */ 3343/* If we get here, the whole TSO packet has not been acked. */
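
When an early-retransmit delay was armed, tcp_rearm_rto() cannot simply re-arm a full RTO; it offsets the timeout by the time that has already passed since the head of the write queue was sent, using a signed 32-bit difference so wrapped timestamps still compare correctly. A hedged sketch of that arithmetic (illustrative, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* head_sent and now are 32-bit tick stamps that may wrap; the remaining
 * time is a signed 32-bit difference, like (s32)(rto_time_stamp - tcp_time_stamp).
 */
static uint32_t remaining_rto(uint32_t head_sent, uint32_t rto, uint32_t now)
{
	int32_t delta = (int32_t)(head_sent + rto - now);

	/* If the full RTO already elapsed, fall back to a fresh full RTO. */
	return delta > 0 ? (uint32_t)delta : rto;
}

int main(void)
{
	/* head sent 100 ticks ago with a 300-tick RTO: 200 ticks remain,
	 * even though the counter wraps in between
	 */
	printf("%u\n", remaining_rto(0xfffffff0u, 300, 0xfffffff0u + 100));
	return 0;
}
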
@@ -3289,7 +3372,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3289 const struct inet_connection_sock *icsk = inet_csk(sk); 3372 const struct inet_connection_sock *icsk = inet_csk(sk);
3290 struct sk_buff *skb; 3373 struct sk_buff *skb;
3291 u32 now = tcp_time_stamp; 3374 u32 now = tcp_time_stamp;
3292 int fully_acked = 1; 3375 int fully_acked = true;
3293 int flag = 0; 3376 int flag = 0;
3294 u32 pkts_acked = 0; 3377 u32 pkts_acked = 0;
3295 u32 reord = tp->packets_out; 3378 u32 reord = tp->packets_out;
@@ -3313,7 +3396,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3313 if (!acked_pcount) 3396 if (!acked_pcount)
3314 break; 3397 break;
3315 3398
3316 fully_acked = 0; 3399 fully_acked = false;
3317 } else { 3400 } else {
3318 acked_pcount = tcp_skb_pcount(skb); 3401 acked_pcount = tcp_skb_pcount(skb);
3319 } 3402 }
@@ -3430,18 +3513,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3430 if (!tp->packets_out && tcp_is_sack(tp)) { 3513 if (!tp->packets_out && tcp_is_sack(tp)) {
3431 icsk = inet_csk(sk); 3514 icsk = inet_csk(sk);
3432 if (tp->lost_out) { 3515 if (tp->lost_out) {
3433 printk(KERN_DEBUG "Leak l=%u %d\n", 3516 pr_debug("Leak l=%u %d\n",
3434 tp->lost_out, icsk->icsk_ca_state); 3517 tp->lost_out, icsk->icsk_ca_state);
3435 tp->lost_out = 0; 3518 tp->lost_out = 0;
3436 } 3519 }
3437 if (tp->sacked_out) { 3520 if (tp->sacked_out) {
3438 printk(KERN_DEBUG "Leak s=%u %d\n", 3521 pr_debug("Leak s=%u %d\n",
3439 tp->sacked_out, icsk->icsk_ca_state); 3522 tp->sacked_out, icsk->icsk_ca_state);
3440 tp->sacked_out = 0; 3523 tp->sacked_out = 0;
3441 } 3524 }
3442 if (tp->retrans_out) { 3525 if (tp->retrans_out) {
3443 printk(KERN_DEBUG "Leak r=%u %d\n", 3526 pr_debug("Leak r=%u %d\n",
3444 tp->retrans_out, icsk->icsk_ca_state); 3527 tp->retrans_out, icsk->icsk_ca_state);
3445 tp->retrans_out = 0; 3528 tp->retrans_out = 0;
3446 } 3529 }
3447 } 3530 }
@@ -3592,7 +3675,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3592 * to prove that the RTO is indeed spurious. It transfers the control 3675 * to prove that the RTO is indeed spurious. It transfers the control
3593 * from F-RTO to the conventional RTO recovery 3676 * from F-RTO to the conventional RTO recovery
3594 */ 3677 */
3595static int tcp_process_frto(struct sock *sk, int flag) 3678static bool tcp_process_frto(struct sock *sk, int flag)
3596{ 3679{
3597 struct tcp_sock *tp = tcp_sk(sk); 3680 struct tcp_sock *tp = tcp_sk(sk);
3598 3681
@@ -3608,7 +3691,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
3608 3691
3609 if (!before(tp->snd_una, tp->frto_highmark)) { 3692 if (!before(tp->snd_una, tp->frto_highmark)) {
3610 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); 3693 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
3611 return 1; 3694 return true;
3612 } 3695 }
3613 3696
3614 if (!tcp_is_sackfrto(tp)) { 3697 if (!tcp_is_sackfrto(tp)) {
@@ -3617,19 +3700,19 @@ static int tcp_process_frto(struct sock *sk, int flag)
3617 * data, winupdate 3700 * data, winupdate
3618 */ 3701 */
3619 if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) 3702 if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
3620 return 1; 3703 return true;
3621 3704
3622 if (!(flag & FLAG_DATA_ACKED)) { 3705 if (!(flag & FLAG_DATA_ACKED)) {
3623 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), 3706 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
3624 flag); 3707 flag);
3625 return 1; 3708 return true;
3626 } 3709 }
3627 } else { 3710 } else {
3628 if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { 3711 if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
3629 /* Prevent sending of new data. */ 3712 /* Prevent sending of new data. */
3630 tp->snd_cwnd = min(tp->snd_cwnd, 3713 tp->snd_cwnd = min(tp->snd_cwnd,
3631 tcp_packets_in_flight(tp)); 3714 tcp_packets_in_flight(tp));
3632 return 1; 3715 return true;
3633 } 3716 }
3634 3717
3635 if ((tp->frto_counter >= 2) && 3718 if ((tp->frto_counter >= 2) &&
@@ -3639,10 +3722,10 @@ static int tcp_process_frto(struct sock *sk, int flag)
3639 /* RFC4138 shortcoming (see comment above) */ 3722 /* RFC4138 shortcoming (see comment above) */
3640 if (!(flag & FLAG_FORWARD_PROGRESS) && 3723 if (!(flag & FLAG_FORWARD_PROGRESS) &&
3641 (flag & FLAG_NOT_DUP)) 3724 (flag & FLAG_NOT_DUP))
3642 return 1; 3725 return true;
3643 3726
3644 tcp_enter_frto_loss(sk, 3, flag); 3727 tcp_enter_frto_loss(sk, 3, flag);
3645 return 1; 3728 return true;
3646 } 3729 }
3647 } 3730 }
3648 3731
@@ -3654,7 +3737,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
3654 if (!tcp_may_send_now(sk)) 3737 if (!tcp_may_send_now(sk))
3655 tcp_enter_frto_loss(sk, 2, flag); 3738 tcp_enter_frto_loss(sk, 2, flag);
3656 3739
3657 return 1; 3740 return true;
3658 } else { 3741 } else {
3659 switch (sysctl_tcp_frto_response) { 3742 switch (sysctl_tcp_frto_response) {
3660 case 2: 3743 case 2:
@@ -3671,7 +3754,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
3671 tp->undo_marker = 0; 3754 tp->undo_marker = 0;
3672 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); 3755 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
3673 } 3756 }
3674 return 0; 3757 return false;
3675} 3758}
3676 3759
3677/* This routine deals with incoming acks, but not outgoing ones. */ 3760/* This routine deals with incoming acks, but not outgoing ones. */
@@ -3689,7 +3772,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3689 int prior_sacked = tp->sacked_out; 3772 int prior_sacked = tp->sacked_out;
3690 int pkts_acked = 0; 3773 int pkts_acked = 0;
3691 int newly_acked_sacked = 0; 3774 int newly_acked_sacked = 0;
3692 int frto_cwnd = 0; 3775 bool frto_cwnd = false;
3693 3776
3694 /* If the ack is older than previous acks 3777 /* If the ack is older than previous acks
3695 * then we can probably ignore it. 3778 * then we can probably ignore it.
@@ -3703,6 +3786,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3703 if (after(ack, tp->snd_nxt)) 3786 if (after(ack, tp->snd_nxt))
3704 goto invalid_ack; 3787 goto invalid_ack;
3705 3788
3789 if (tp->early_retrans_delayed)
3790 tcp_rearm_rto(sk);
3791
3706 if (after(ack, prior_snd_una)) 3792 if (after(ack, prior_snd_una))
3707 flag |= FLAG_SND_UNA_ADVANCED; 3793 flag |= FLAG_SND_UNA_ADVANCED;
3708 3794
@@ -3868,10 +3954,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3868 __u8 snd_wscale = *(__u8 *)ptr; 3954 __u8 snd_wscale = *(__u8 *)ptr;
3869 opt_rx->wscale_ok = 1; 3955 opt_rx->wscale_ok = 1;
3870 if (snd_wscale > 14) { 3956 if (snd_wscale > 14) {
3871 if (net_ratelimit()) 3957 net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
3872 pr_info("%s: Illegal window scaling value %d >14 received\n", 3958 __func__,
3873 __func__, 3959 snd_wscale);
3874 snd_wscale);
3875 snd_wscale = 14; 3960 snd_wscale = 14;
3876 } 3961 }
3877 opt_rx->snd_wscale = snd_wscale; 3962 opt_rx->snd_wscale = snd_wscale;
@@ -3942,7 +4027,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3942} 4027}
3943EXPORT_SYMBOL(tcp_parse_options); 4028EXPORT_SYMBOL(tcp_parse_options);
3944 4029
3945static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) 4030static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
3946{ 4031{
3947 const __be32 *ptr = (const __be32 *)(th + 1); 4032 const __be32 *ptr = (const __be32 *)(th + 1);
3948 4033
@@ -3953,31 +4038,31 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
3953 tp->rx_opt.rcv_tsval = ntohl(*ptr); 4038 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3954 ++ptr; 4039 ++ptr;
3955 tp->rx_opt.rcv_tsecr = ntohl(*ptr); 4040 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3956 return 1; 4041 return true;
3957 } 4042 }
3958 return 0; 4043 return false;
3959} 4044}
3960 4045
3961/* Fast parse options. This hopes to only see timestamps. 4046/* Fast parse options. This hopes to only see timestamps.
3962 * If it is wrong it falls back on tcp_parse_options(). 4047 * If it is wrong it falls back on tcp_parse_options().
3963 */ 4048 */
3964static int tcp_fast_parse_options(const struct sk_buff *skb, 4049static bool tcp_fast_parse_options(const struct sk_buff *skb,
3965 const struct tcphdr *th, 4050 const struct tcphdr *th,
3966 struct tcp_sock *tp, const u8 **hvpp) 4051 struct tcp_sock *tp, const u8 **hvpp)
3967{ 4052{
3968 /* In the spirit of fast parsing, compare doff directly to constant 4053 /* In the spirit of fast parsing, compare doff directly to constant
3969 * values. Because equality is used, short doff can be ignored here. 4054 * values. Because equality is used, short doff can be ignored here.
3970 */ 4055 */
3971 if (th->doff == (sizeof(*th) / 4)) { 4056 if (th->doff == (sizeof(*th) / 4)) {
3972 tp->rx_opt.saw_tstamp = 0; 4057 tp->rx_opt.saw_tstamp = 0;
3973 return 0; 4058 return false;
3974 } else if (tp->rx_opt.tstamp_ok && 4059 } else if (tp->rx_opt.tstamp_ok &&
3975 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { 4060 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3976 if (tcp_parse_aligned_timestamp(tp, th)) 4061 if (tcp_parse_aligned_timestamp(tp, th))
3977 return 1; 4062 return true;
3978 } 4063 }
3979 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); 4064 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
3980 return 1; 4065 return true;
3981} 4066}
3982 4067
3983#ifdef CONFIG_TCP_MD5SIG 4068#ifdef CONFIG_TCP_MD5SIG
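
The fast path above keys entirely off th->doff: a bare header is 20 bytes (doff == 5), and a header carrying only the NOP,NOP,timestamp block is 20 + 12 bytes (doff == 8), so anything else falls back to the full option parser. A small sketch of that classification (illustrative only):

#include <stdio.h>

enum { TCP_HDR_WORDS = 5, TSTAMP_ALIGNED_WORDS = 3 };

static const char *classify(unsigned doff, int tstamp_ok)
{
	if (doff == TCP_HDR_WORDS)
		return "no options";
	if (tstamp_ok && doff == TCP_HDR_WORDS + TSTAMP_ALIGNED_WORDS)
		return "try the aligned timestamp shortcut";
	return "slow path: full option parse";
}

int main(void)
{
	printf("%s\n", classify(8, 1));
	return 0;
}
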
@@ -4218,7 +4303,7 @@ static void tcp_fin(struct sock *sk)
4218 } 4303 }
4219} 4304}
4220 4305
4221static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, 4306static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4222 u32 end_seq) 4307 u32 end_seq)
4223{ 4308{
4224 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { 4309 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
@@ -4226,9 +4311,9 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4226 sp->start_seq = seq; 4311 sp->start_seq = seq;
4227 if (after(end_seq, sp->end_seq)) 4312 if (after(end_seq, sp->end_seq))
4228 sp->end_seq = end_seq; 4313 sp->end_seq = end_seq;
4229 return 1; 4314 return true;
4230 } 4315 }
4231 return 0; 4316 return false;
4232} 4317}
4233 4318
4234static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) 4319static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
@@ -4424,10 +4509,10 @@ static void tcp_ofo_queue(struct sock *sk)
4424 } 4509 }
4425} 4510}
4426 4511
4427static int tcp_prune_ofo_queue(struct sock *sk); 4512static bool tcp_prune_ofo_queue(struct sock *sk);
4428static int tcp_prune_queue(struct sock *sk); 4513static int tcp_prune_queue(struct sock *sk);
4429 4514
4430static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) 4515static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
4431{ 4516{
4432 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || 4517 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4433 !sk_rmem_schedule(sk, size)) { 4518 !sk_rmem_schedule(sk, size)) {
@@ -4446,6 +4531,41 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
4446 return 0; 4531 return 0;
4447} 4532}
4448 4533
4534/**
4535 * tcp_try_coalesce - try to merge skb to prior one
4536 * @sk: socket
4537 * @to: prior buffer
4538 * @from: buffer to add in queue
4539 * @fragstolen: pointer to boolean
4540 *
4541 * Before queueing skb @from after @to, try to merge them
4542 * to reduce overall memory use and queue lengths, if cost is small.
4543 * Packets in ofo or receive queues can stay a long time.
4544 * Better try to coalesce them right now to avoid future collapses.
4545 * Returns true if caller should free @from instead of queueing it
4546 */
4547static bool tcp_try_coalesce(struct sock *sk,
4548 struct sk_buff *to,
4549 struct sk_buff *from,
4550 bool *fragstolen)
4551{
4552 int delta;
4553
4554 *fragstolen = false;
4555
4556 if (tcp_hdr(from)->fin)
4557 return false;
4558 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4559 return false;
4560
4561 atomic_add(delta, &sk->sk_rmem_alloc);
4562 sk_mem_charge(sk, delta);
4563 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4564 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4565 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4566 return true;
4567}
4568
4449static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4569static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4450{ 4570{
4451 struct tcp_sock *tp = tcp_sk(sk); 4571 struct tcp_sock *tp = tcp_sk(sk);
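
The generic helper replaces the open-coded tailroom copy that used to live in tcp_data_queue_ofo(); conceptually it appends a chunk that starts exactly where the previous buffer ends, so the receive and out-of-order queues stay short and later collapsing gets cheaper. A userspace sketch of that idea, assuming a plain copy into spare tailroom (the real skb_try_coalesce() can also steal page fragments; names here are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct rxbuf {
	uint32_t seq, end_seq;	/* sequence range covered by this buffer */
	size_t len, cap;	/* bytes used / total room */
	unsigned char data[2048];
};

/* Append a contiguous chunk into the previous buffer's tailroom instead of
 * queueing a new buffer; returns true if the caller may free its copy.
 */
static bool try_coalesce(struct rxbuf *to, uint32_t seq,
			 const void *payload, size_t len)
{
	if (seq != to->end_seq)		/* not contiguous with the tail */
		return false;
	if (to->len + len > to->cap)	/* no tailroom left */
		return false;

	memcpy(to->data + to->len, payload, len);
	to->len += len;
	to->end_seq += (uint32_t)len;
	return true;
}

int main(void)
{
	struct rxbuf tail = { .seq = 1000, .end_seq = 1005, .len = 5,
			      .cap = sizeof(tail.data), .data = "hello" };

	if (try_coalesce(&tail, 1005, " world", 6))
		printf("coalesced: %.*s (%u-%u)\n", (int)tail.len,
		       (char *)tail.data, tail.seq, tail.end_seq);
	return 0;
}
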
@@ -4484,23 +4604,13 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4484 end_seq = TCP_SKB_CB(skb)->end_seq; 4604 end_seq = TCP_SKB_CB(skb)->end_seq;
4485 4605
4486 if (seq == TCP_SKB_CB(skb1)->end_seq) { 4606 if (seq == TCP_SKB_CB(skb1)->end_seq) {
4487 /* Packets in ofo can stay in queue a long time. 4607 bool fragstolen;
4488 * Better try to coalesce them right now 4608
4489 * to avoid future tcp_collapse_ofo_queue(), 4609 if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
4490 * probably the most expensive function in tcp stack.
4491 */
4492 if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
4493 NET_INC_STATS_BH(sock_net(sk),
4494 LINUX_MIB_TCPRCVCOALESCE);
4495 BUG_ON(skb_copy_bits(skb, 0,
4496 skb_put(skb1, skb->len),
4497 skb->len));
4498 TCP_SKB_CB(skb1)->end_seq = end_seq;
4499 TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
4500 __kfree_skb(skb);
4501 skb = NULL;
4502 } else {
4503 __skb_queue_after(&tp->out_of_order_queue, skb1, skb); 4610 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4611 } else {
4612 kfree_skb_partial(skb, fragstolen);
4613 skb = NULL;
4504 } 4614 }
4505 4615
4506 if (!tp->rx_opt.num_sacks || 4616 if (!tp->rx_opt.num_sacks ||
@@ -4576,12 +4686,65 @@ end:
4576 skb_set_owner_r(skb, sk); 4686 skb_set_owner_r(skb, sk);
4577} 4687}
4578 4688
4689static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4690 bool *fragstolen)
4691{
4692 int eaten;
4693 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4694
4695 __skb_pull(skb, hdrlen);
4696 eaten = (tail &&
4697 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
4698 tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4699 if (!eaten) {
4700 __skb_queue_tail(&sk->sk_receive_queue, skb);
4701 skb_set_owner_r(skb, sk);
4702 }
4703 return eaten;
4704}
4705
4706int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4707{
4708 struct sk_buff *skb;
4709 struct tcphdr *th;
4710 bool fragstolen;
4711
4712 if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
4713 goto err;
4714
4715 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
4716 if (!skb)
4717 goto err;
4718
4719 th = (struct tcphdr *)skb_put(skb, sizeof(*th));
4720 skb_reset_transport_header(skb);
4721 memset(th, 0, sizeof(*th));
4722
4723 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
4724 goto err_free;
4725
4726 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4727 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4728 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4729
4730 if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
4731 WARN_ON_ONCE(fragstolen); /* should not happen */
4732 __kfree_skb(skb);
4733 }
4734 return size;
4735
4736err_free:
4737 kfree_skb(skb);
4738err:
4739 return -ENOMEM;
4740}
4579 4741
4580static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 4742static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4581{ 4743{
4582 const struct tcphdr *th = tcp_hdr(skb); 4744 const struct tcphdr *th = tcp_hdr(skb);
4583 struct tcp_sock *tp = tcp_sk(sk); 4745 struct tcp_sock *tp = tcp_sk(sk);
4584 int eaten = -1; 4746 int eaten = -1;
4747 bool fragstolen = false;
4585 4748
4586 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) 4749 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
4587 goto drop; 4750 goto drop;
@@ -4626,8 +4789,7 @@ queue_and_out:
4626 tcp_try_rmem_schedule(sk, skb->truesize)) 4789 tcp_try_rmem_schedule(sk, skb->truesize))
4627 goto drop; 4790 goto drop;
4628 4791
4629 skb_set_owner_r(skb, sk); 4792 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4630 __skb_queue_tail(&sk->sk_receive_queue, skb);
4631 } 4793 }
4632 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4794 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4633 if (skb->len) 4795 if (skb->len)
@@ -4651,7 +4813,7 @@ queue_and_out:
4651 tcp_fast_path_check(sk); 4813 tcp_fast_path_check(sk);
4652 4814
4653 if (eaten > 0) 4815 if (eaten > 0)
4654 __kfree_skb(skb); 4816 kfree_skb_partial(skb, fragstolen);
4655 else if (!sock_flag(sk, SOCK_DEAD)) 4817 else if (!sock_flag(sk, SOCK_DEAD))
4656 sk->sk_data_ready(sk, 0); 4818 sk->sk_data_ready(sk, 0);
4657 return; 4819 return;
@@ -4871,10 +5033,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
4871 * Purge the out-of-order queue. 5033 * Purge the out-of-order queue.
4872 * Return true if queue was pruned. 5034 * Return true if queue was pruned.
4873 */ 5035 */
4874static int tcp_prune_ofo_queue(struct sock *sk) 5036static bool tcp_prune_ofo_queue(struct sock *sk)
4875{ 5037{
4876 struct tcp_sock *tp = tcp_sk(sk); 5038 struct tcp_sock *tp = tcp_sk(sk);
4877 int res = 0; 5039 bool res = false;
4878 5040
4879 if (!skb_queue_empty(&tp->out_of_order_queue)) { 5041 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4880 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); 5042 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
@@ -4888,7 +5050,7 @@ static int tcp_prune_ofo_queue(struct sock *sk)
4888 if (tp->rx_opt.sack_ok) 5050 if (tp->rx_opt.sack_ok)
4889 tcp_sack_reset(&tp->rx_opt); 5051 tcp_sack_reset(&tp->rx_opt);
4890 sk_mem_reclaim(sk); 5052 sk_mem_reclaim(sk);
4891 res = 1; 5053 res = true;
4892 } 5054 }
4893 return res; 5055 return res;
4894} 5056}
@@ -4965,7 +5127,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
4965 tp->snd_cwnd_stamp = tcp_time_stamp; 5127 tp->snd_cwnd_stamp = tcp_time_stamp;
4966} 5128}
4967 5129
4968static int tcp_should_expand_sndbuf(const struct sock *sk) 5130static bool tcp_should_expand_sndbuf(const struct sock *sk)
4969{ 5131{
4970 const struct tcp_sock *tp = tcp_sk(sk); 5132 const struct tcp_sock *tp = tcp_sk(sk);
4971 5133
@@ -4973,21 +5135,21 @@ static int tcp_should_expand_sndbuf(const struct sock *sk)
4973 * not modify it. 5135 * not modify it.
4974 */ 5136 */
4975 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) 5137 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
4976 return 0; 5138 return false;
4977 5139
4978 /* If we are under global TCP memory pressure, do not expand. */ 5140 /* If we are under global TCP memory pressure, do not expand. */
4979 if (sk_under_memory_pressure(sk)) 5141 if (sk_under_memory_pressure(sk))
4980 return 0; 5142 return false;
4981 5143
4982 /* If we are under soft global TCP memory pressure, do not expand. */ 5144 /* If we are under soft global TCP memory pressure, do not expand. */
4983 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) 5145 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
4984 return 0; 5146 return false;
4985 5147
4986 /* If we filled the congestion window, do not expand. */ 5148 /* If we filled the congestion window, do not expand. */
4987 if (tp->packets_out >= tp->snd_cwnd) 5149 if (tp->packets_out >= tp->snd_cwnd)
4988 return 0; 5150 return false;
4989 5151
4990 return 1; 5152 return true;
4991} 5153}
4992 5154
4993/* When incoming ACK allowed to free some skb from write_queue, 5155/* When incoming ACK allowed to free some skb from write_queue,
@@ -5213,19 +5375,19 @@ static inline int tcp_checksum_complete_user(struct sock *sk,
5213} 5375}
5214 5376
5215#ifdef CONFIG_NET_DMA 5377#ifdef CONFIG_NET_DMA
5216static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, 5378static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
5217 int hlen) 5379 int hlen)
5218{ 5380{
5219 struct tcp_sock *tp = tcp_sk(sk); 5381 struct tcp_sock *tp = tcp_sk(sk);
5220 int chunk = skb->len - hlen; 5382 int chunk = skb->len - hlen;
5221 int dma_cookie; 5383 int dma_cookie;
5222 int copied_early = 0; 5384 bool copied_early = false;
5223 5385
5224 if (tp->ucopy.wakeup) 5386 if (tp->ucopy.wakeup)
5225 return 0; 5387 return false;
5226 5388
5227 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 5389 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
5228 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); 5390 tp->ucopy.dma_chan = net_dma_find_channel();
5229 5391
5230 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { 5392 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
5231 5393
@@ -5238,7 +5400,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
5238 goto out; 5400 goto out;
5239 5401
5240 tp->ucopy.dma_cookie = dma_cookie; 5402 tp->ucopy.dma_cookie = dma_cookie;
5241 copied_early = 1; 5403 copied_early = true;
5242 5404
5243 tp->ucopy.len -= chunk; 5405 tp->ucopy.len -= chunk;
5244 tp->copied_seq += chunk; 5406 tp->copied_seq += chunk;
@@ -5430,6 +5592,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5430 } else { 5592 } else {
5431 int eaten = 0; 5593 int eaten = 0;
5432 int copied_early = 0; 5594 int copied_early = 0;
5595 bool fragstolen = false;
5433 5596
5434 if (tp->copied_seq == tp->rcv_nxt && 5597 if (tp->copied_seq == tp->rcv_nxt &&
5435 len - tcp_header_len <= tp->ucopy.len) { 5598 len - tcp_header_len <= tp->ucopy.len) {
@@ -5487,10 +5650,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5487 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); 5650 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
5488 5651
5489 /* Bulk data transfer: receiver */ 5652 /* Bulk data transfer: receiver */
5490 __skb_pull(skb, tcp_header_len); 5653 eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5491 __skb_queue_tail(&sk->sk_receive_queue, skb); 5654 &fragstolen);
5492 skb_set_owner_r(skb, sk);
5493 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
5494 } 5655 }
5495 5656
5496 tcp_event_data_recv(sk, skb); 5657 tcp_event_data_recv(sk, skb);
@@ -5512,7 +5673,7 @@ no_ack:
5512 else 5673 else
5513#endif 5674#endif
5514 if (eaten) 5675 if (eaten)
5515 __kfree_skb(skb); 5676 kfree_skb_partial(skb, fragstolen);
5516 else 5677 else
5517 sk->sk_data_ready(sk, 0); 5678 sk->sk_data_ready(sk, 0);
5518 return 0; 5679 return 0;
@@ -5556,6 +5717,44 @@ discard:
5556} 5717}
5557EXPORT_SYMBOL(tcp_rcv_established); 5718EXPORT_SYMBOL(tcp_rcv_established);
5558 5719
5720void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5721{
5722 struct tcp_sock *tp = tcp_sk(sk);
5723 struct inet_connection_sock *icsk = inet_csk(sk);
5724
5725 tcp_set_state(sk, TCP_ESTABLISHED);
5726
5727 if (skb != NULL)
5728 security_inet_conn_established(sk, skb);
5729
5730 /* Make sure socket is routed, for correct metrics. */
5731 icsk->icsk_af_ops->rebuild_header(sk);
5732
5733 tcp_init_metrics(sk);
5734
5735 tcp_init_congestion_control(sk);
5736
5737 /* Prevent spurious tcp_cwnd_restart() on first data
5738 * packet.
5739 */
5740 tp->lsndtime = tcp_time_stamp;
5741
5742 tcp_init_buffer_space(sk);
5743
5744 if (sock_flag(sk, SOCK_KEEPOPEN))
5745 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5746
5747 if (!tp->rx_opt.snd_wscale)
5748 __tcp_fast_path_on(tp, tp->snd_wnd);
5749 else
5750 tp->pred_flags = 0;
5751
5752 if (!sock_flag(sk, SOCK_DEAD)) {
5753 sk->sk_state_change(sk);
5754 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5755 }
5756}
5757
5559static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5758static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5560 const struct tcphdr *th, unsigned int len) 5759 const struct tcphdr *th, unsigned int len)
5561{ 5760{
@@ -5688,36 +5887,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5688 } 5887 }
5689 5888
5690 smp_mb(); 5889 smp_mb();
5691 tcp_set_state(sk, TCP_ESTABLISHED);
5692
5693 security_inet_conn_established(sk, skb);
5694
5695 /* Make sure socket is routed, for correct metrics. */
5696 icsk->icsk_af_ops->rebuild_header(sk);
5697 5890
5698 tcp_init_metrics(sk); 5891 tcp_finish_connect(sk, skb);
5699
5700 tcp_init_congestion_control(sk);
5701
5702 /* Prevent spurious tcp_cwnd_restart() on first data
5703 * packet.
5704 */
5705 tp->lsndtime = tcp_time_stamp;
5706
5707 tcp_init_buffer_space(sk);
5708
5709 if (sock_flag(sk, SOCK_KEEPOPEN))
5710 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5711
5712 if (!tp->rx_opt.snd_wscale)
5713 __tcp_fast_path_on(tp, tp->snd_wnd);
5714 else
5715 tp->pred_flags = 0;
5716
5717 if (!sock_flag(sk, SOCK_DEAD)) {
5718 sk->sk_state_change(sk);
5719 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5720 }
5721 5892
5722 if (sk->sk_write_pending || 5893 if (sk->sk_write_pending ||
5723 icsk->icsk_accept_queue.rskq_defer_accept || 5894 icsk->icsk_accept_queue.rskq_defer_accept ||
@@ -5731,8 +5902,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5731 */ 5902 */
5732 inet_csk_schedule_ack(sk); 5903 inet_csk_schedule_ack(sk);
5733 icsk->icsk_ack.lrcvtime = tcp_time_stamp; 5904 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5734 icsk->icsk_ack.ato = TCP_ATO_MIN;
5735 tcp_incr_quickack(sk);
5736 tcp_enter_quickack_mode(sk); 5905 tcp_enter_quickack_mode(sk);
5737 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5906 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5738 TCP_DELACK_MAX, TCP_RTO_MAX); 5907 TCP_DELACK_MAX, TCP_RTO_MAX);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a25cf743f8b..a43b87dfe800 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -138,6 +138,14 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
138} 138}
139EXPORT_SYMBOL_GPL(tcp_twsk_unique); 139EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140 140
141static int tcp_repair_connect(struct sock *sk)
142{
143 tcp_connect_init(sk);
144 tcp_finish_connect(sk, NULL);
145
146 return 0;
147}
148
141/* This will initiate an outgoing connection. */ 149/* This will initiate an outgoing connection. */
142int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 150int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143{ 151{
@@ -196,7 +204,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
196 /* Reset inherited state */ 204 /* Reset inherited state */
197 tp->rx_opt.ts_recent = 0; 205 tp->rx_opt.ts_recent = 0;
198 tp->rx_opt.ts_recent_stamp = 0; 206 tp->rx_opt.ts_recent_stamp = 0;
199 tp->write_seq = 0; 207 if (likely(!tp->repair))
208 tp->write_seq = 0;
200 } 209 }
201 210
202 if (tcp_death_row.sysctl_tw_recycle && 211 if (tcp_death_row.sysctl_tw_recycle &&
@@ -247,7 +256,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
247 sk->sk_gso_type = SKB_GSO_TCPV4; 256 sk->sk_gso_type = SKB_GSO_TCPV4;
248 sk_setup_caps(sk, &rt->dst); 257 sk_setup_caps(sk, &rt->dst);
249 258
250 if (!tp->write_seq) 259 if (!tp->write_seq && likely(!tp->repair))
251 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 260 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
252 inet->inet_daddr, 261 inet->inet_daddr,
253 inet->inet_sport, 262 inet->inet_sport,
@@ -255,7 +264,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
255 264
256 inet->inet_id = tp->write_seq ^ jiffies; 265 inet->inet_id = tp->write_seq ^ jiffies;
257 266
258 err = tcp_connect(sk); 267 if (likely(!tp->repair))
268 err = tcp_connect(sk);
269 else
270 err = tcp_repair_connect(sk);
271
259 rt = NULL; 272 rt = NULL;
260 if (err) 273 if (err)
261 goto failure; 274 goto failure;
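
These tp->repair branches are part of the TCP connection-repair interface added for checkpoint/restore: with the socket switched into repair mode, write_seq is preserved (or set via TCP_QUEUE_SEQ) and connect() takes the tcp_repair_connect() path, moving straight to ESTABLISHED without a handshake. A hedged userspace sketch of that flow, assuming the TCP_REPAIR/TCP_REPAIR_QUEUE/TCP_QUEUE_SEQ socket options from this kernel's uapi headers; it needs CAP_NET_ADMIN and omits restoring the source address, queues and options that a real tool such as CRIU would also handle:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/tcp.h>		/* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ */

int main(void)
{
	int one = 1, q, s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	struct sockaddr_in peer = { .sin_family = AF_INET,
				    .sin_port = htons(80) };
	uint32_t snd_seq = 1000, rcv_seq = 2000;

	/* Switch to repair mode before connect(): no SYN will be sent. */
	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR, &one, sizeof(one)) < 0) {
		perror("TCP_REPAIR (needs CAP_NET_ADMIN and Linux 3.5+)");
		return 1;
	}

	/* Restore the sequence numbers for both directions. */
	q = TCP_SEND_QUEUE;
	setsockopt(s, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
	setsockopt(s, IPPROTO_TCP, TCP_QUEUE_SEQ, &snd_seq, sizeof(snd_seq));
	q = TCP_RECV_QUEUE;
	setsockopt(s, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
	setsockopt(s, IPPROTO_TCP, TCP_QUEUE_SEQ, &rcv_seq, sizeof(rcv_seq));

	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);
	/* In repair mode this takes the tcp_repair_connect() path: the socket
	 * goes straight to ESTABLISHED without a handshake.
	 */
	if (connect(s, (struct sockaddr *)&peer, sizeof(peer)) < 0)
		perror("connect");

	close(s);
	return 0;
}
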
@@ -853,14 +866,14 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
853} 866}
854 867
855/* 868/*
856 * Return 1 if a syncookie should be sent 869 * Return true if a syncookie should be sent
857 */ 870 */
858int tcp_syn_flood_action(struct sock *sk, 871bool tcp_syn_flood_action(struct sock *sk,
859 const struct sk_buff *skb, 872 const struct sk_buff *skb,
860 const char *proto) 873 const char *proto)
861{ 874{
862 const char *msg = "Dropping request"; 875 const char *msg = "Dropping request";
863 int want_cookie = 0; 876 bool want_cookie = false;
864 struct listen_sock *lopt; 877 struct listen_sock *lopt;
865 878
866 879
@@ -868,7 +881,7 @@ int tcp_syn_flood_action(struct sock *sk,
868#ifdef CONFIG_SYN_COOKIES 881#ifdef CONFIG_SYN_COOKIES
869 if (sysctl_tcp_syncookies) { 882 if (sysctl_tcp_syncookies) {
870 msg = "Sending cookies"; 883 msg = "Sending cookies";
871 want_cookie = 1; 884 want_cookie = true;
872 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); 885 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
873 } else 886 } else
874#endif 887#endif
@@ -1183,7 +1196,7 @@ clear_hash_noput:
1183} 1196}
1184EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1197EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1185 1198
1186static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) 1199static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1187{ 1200{
1188 /* 1201 /*
1189 * This gets called for each TCP segment that arrives 1202 * This gets called for each TCP segment that arrives
@@ -1206,16 +1219,16 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1206 1219
1207 /* We've parsed the options - do we have a hash? */ 1220 /* We've parsed the options - do we have a hash? */
1208 if (!hash_expected && !hash_location) 1221 if (!hash_expected && !hash_location)
1209 return 0; 1222 return false;
1210 1223
1211 if (hash_expected && !hash_location) { 1224 if (hash_expected && !hash_location) {
1212 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1225 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1213 return 1; 1226 return true;
1214 } 1227 }
1215 1228
1216 if (!hash_expected && hash_location) { 1229 if (!hash_expected && hash_location) {
1217 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1230 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1218 return 1; 1231 return true;
1219 } 1232 }
1220 1233
1221 /* Okay, so this is hash_expected and hash_location - 1234 /* Okay, so this is hash_expected and hash_location -
@@ -1226,15 +1239,14 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1226 NULL, NULL, skb); 1239 NULL, NULL, skb);
1227 1240
1228 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1241 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1229 if (net_ratelimit()) { 1242 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1230 pr_info("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", 1243 &iph->saddr, ntohs(th->source),
1231 &iph->saddr, ntohs(th->source), 1244 &iph->daddr, ntohs(th->dest),
1232 &iph->daddr, ntohs(th->dest), 1245 genhash ? " tcp_v4_calc_md5_hash failed"
1233 genhash ? " tcp_v4_calc_md5_hash failed" : ""); 1246 : "");
1234 } 1247 return true;
1235 return 1;
1236 } 1248 }
1237 return 0; 1249 return false;
1238} 1250}
1239 1251
1240#endif 1252#endif
@@ -1268,7 +1280,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1268 __be32 saddr = ip_hdr(skb)->saddr; 1280 __be32 saddr = ip_hdr(skb)->saddr;
1269 __be32 daddr = ip_hdr(skb)->daddr; 1281 __be32 daddr = ip_hdr(skb)->daddr;
1270 __u32 isn = TCP_SKB_CB(skb)->when; 1282 __u32 isn = TCP_SKB_CB(skb)->when;
1271 int want_cookie = 0; 1283 bool want_cookie = false;
1272 1284
1273 /* Never answer to SYNs sent to broadcast or multicast */ 1285 /* Never answer to SYNs sent to broadcast or multicast */
1274 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1286 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1327,7 +1339,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1327 while (l-- > 0) 1339 while (l-- > 0)
1328 *c++ ^= *hash_location++; 1340 *c++ ^= *hash_location++;
1329 1341
1330 want_cookie = 0; /* not our kind of cookie */ 1342 want_cookie = false; /* not our kind of cookie */
1331 tmp_ext.cookie_out_never = 0; /* false */ 1343 tmp_ext.cookie_out_never = 0; /* false */
1332 tmp_ext.cookie_plus = tmp_opt.cookie_plus; 1344 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1333 } else if (!tp->rx_opt.cookie_in_always) { 1345 } else if (!tp->rx_opt.cookie_in_always) {
@@ -1355,7 +1367,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1355 goto drop_and_free; 1367 goto drop_and_free;
1356 1368
1357 if (!want_cookie || tmp_opt.tstamp_ok) 1369 if (!want_cookie || tmp_opt.tstamp_ok)
1358 TCP_ECN_create_request(req, tcp_hdr(skb)); 1370 TCP_ECN_create_request(req, skb);
1359 1371
1360 if (want_cookie) { 1372 if (want_cookie) {
1361 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1373 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
@@ -1730,7 +1742,7 @@ process:
1730#ifdef CONFIG_NET_DMA 1742#ifdef CONFIG_NET_DMA
1731 struct tcp_sock *tp = tcp_sk(sk); 1743 struct tcp_sock *tp = tcp_sk(sk);
1732 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 1744 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1733 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); 1745 tp->ucopy.dma_chan = net_dma_find_channel();
1734 if (tp->ucopy.dma_chan) 1746 if (tp->ucopy.dma_chan)
1735 ret = tcp_v4_do_rcv(sk, skb); 1747 ret = tcp_v4_do_rcv(sk, skb);
1736 else 1748 else
@@ -1739,7 +1751,8 @@ process:
1739 if (!tcp_prequeue(sk, skb)) 1751 if (!tcp_prequeue(sk, skb))
1740 ret = tcp_v4_do_rcv(sk, skb); 1752 ret = tcp_v4_do_rcv(sk, skb);
1741 } 1753 }
1742 } else if (unlikely(sk_add_backlog(sk, skb))) { 1754 } else if (unlikely(sk_add_backlog(sk, skb,
1755 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1743 bh_unlock_sock(sk); 1756 bh_unlock_sock(sk);
1744 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); 1757 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1745 goto discard_and_relse; 1758 goto discard_and_relse;
@@ -1875,64 +1888,15 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1875static int tcp_v4_init_sock(struct sock *sk) 1888static int tcp_v4_init_sock(struct sock *sk)
1876{ 1889{
1877 struct inet_connection_sock *icsk = inet_csk(sk); 1890 struct inet_connection_sock *icsk = inet_csk(sk);
1878 struct tcp_sock *tp = tcp_sk(sk);
1879 1891
1880 skb_queue_head_init(&tp->out_of_order_queue); 1892 tcp_init_sock(sk);
1881 tcp_init_xmit_timers(sk);
1882 tcp_prequeue_init(tp);
1883
1884 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1885 tp->mdev = TCP_TIMEOUT_INIT;
1886
1887 /* So many TCP implementations out there (incorrectly) count the
1888 * initial SYN frame in their delayed-ACK and congestion control
1889 * algorithms that we must have the following bandaid to talk
1890 * efficiently to them. -DaveM
1891 */
1892 tp->snd_cwnd = TCP_INIT_CWND;
1893
1894 /* See draft-stevens-tcpca-spec-01 for discussion of the
1895 * initialization of these values.
1896 */
1897 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1898 tp->snd_cwnd_clamp = ~0;
1899 tp->mss_cache = TCP_MSS_DEFAULT;
1900
1901 tp->reordering = sysctl_tcp_reordering;
1902 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1903
1904 sk->sk_state = TCP_CLOSE;
1905
1906 sk->sk_write_space = sk_stream_write_space;
1907 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1908 1893
1909 icsk->icsk_af_ops = &ipv4_specific; 1894 icsk->icsk_af_ops = &ipv4_specific;
1910 icsk->icsk_sync_mss = tcp_sync_mss; 1895
1911#ifdef CONFIG_TCP_MD5SIG 1896#ifdef CONFIG_TCP_MD5SIG
1912 tp->af_specific = &tcp_sock_ipv4_specific; 1897 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1913#endif 1898#endif
1914 1899
1915 /* TCP Cookie Transactions */
1916 if (sysctl_tcp_cookie_size > 0) {
1917 /* Default, cookies without s_data_payload. */
1918 tp->cookie_values =
1919 kzalloc(sizeof(*tp->cookie_values),
1920 sk->sk_allocation);
1921 if (tp->cookie_values != NULL)
1922 kref_init(&tp->cookie_values->kref);
1923 }
1924 /* Presumed zeroed, in order of appearance:
1925 * cookie_in_always, cookie_out_never,
1926 * s_data_constant, s_data_in, s_data_out
1927 */
1928 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1929 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1930
1931 local_bh_disable();
1932 sock_update_memcg(sk);
1933 sk_sockets_allocated_inc(sk);
1934 local_bh_enable();
1935
1936 return 0; 1900 return 0;
1937} 1901}
1938 1902
@@ -2109,7 +2073,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2109 return rc; 2073 return rc;
2110} 2074}
2111 2075
2112static inline int empty_bucket(struct tcp_iter_state *st) 2076static inline bool empty_bucket(struct tcp_iter_state *st)
2113{ 2077{
2114 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && 2078 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2115 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); 2079 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3cabafb5cdd1..b85d9fe7d663 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(tcp_death_row);
55 * state. 55 * state.
56 */ 56 */
57 57
58static int tcp_remember_stamp(struct sock *sk) 58static bool tcp_remember_stamp(struct sock *sk)
59{ 59{
60 const struct inet_connection_sock *icsk = inet_csk(sk); 60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk); 61 struct tcp_sock *tp = tcp_sk(sk);
@@ -72,13 +72,13 @@ static int tcp_remember_stamp(struct sock *sk)
72 } 72 }
73 if (release_it) 73 if (release_it)
74 inet_putpeer(peer); 74 inet_putpeer(peer);
75 return 1; 75 return true;
76 } 76 }
77 77
78 return 0; 78 return false;
79} 79}
80 80
81static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw) 81static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{ 82{
83 struct sock *sk = (struct sock *) tw; 83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer; 84 struct inet_peer *peer;
@@ -94,17 +94,17 @@ static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
94 peer->tcp_ts = tcptw->tw_ts_recent; 94 peer->tcp_ts = tcptw->tw_ts_recent;
95 } 95 }
96 inet_putpeer(peer); 96 inet_putpeer(peer);
97 return 1; 97 return true;
98 } 98 }
99 return 0; 99 return false;
100} 100}
101 101
102static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 102static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
103{ 103{
104 if (seq == s_win) 104 if (seq == s_win)
105 return 1; 105 return true;
106 if (after(end_seq, s_win) && before(seq, e_win)) 106 if (after(end_seq, s_win) && before(seq, e_win))
107 return 1; 107 return true;
108 return seq == e_win && seq == end_seq; 108 return seq == e_win && seq == end_seq;
109} 109}
110 110
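The hunks above are part of a mechanical int-to-bool conversion of predicate helpers; returning true/false instead of 1/0 documents intent without changing behaviour, since the final expressions (comparisons and logical operators) already evaluate to 0 or 1. After the change, tcp_in_window() reads:

static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
        if (seq == s_win)
                return true;
        if (after(end_seq, s_win) && before(seq, e_win))
                return true;
        /* a comparison already yields a bool-compatible 0/1 */
        return seq == e_win && seq == end_seq;
}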
@@ -143,7 +143,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
143 struct tcp_options_received tmp_opt; 143 struct tcp_options_received tmp_opt;
144 const u8 *hash_location; 144 const u8 *hash_location;
145 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 145 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
146 int paws_reject = 0; 146 bool paws_reject = false;
147 147
148 tmp_opt.saw_tstamp = 0; 148 tmp_opt.saw_tstamp = 0;
149 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 149 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
@@ -316,7 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
316 struct inet_timewait_sock *tw = NULL; 316 struct inet_timewait_sock *tw = NULL;
317 const struct inet_connection_sock *icsk = inet_csk(sk); 317 const struct inet_connection_sock *icsk = inet_csk(sk);
318 const struct tcp_sock *tp = tcp_sk(sk); 318 const struct tcp_sock *tp = tcp_sk(sk);
319 int recycle_ok = 0; 319 bool recycle_ok = false;
320 320
321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
322 recycle_ok = tcp_remember_stamp(sk); 322 recycle_ok = tcp_remember_stamp(sk);
@@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
482 newtp->sacked_out = 0; 482 newtp->sacked_out = 0;
483 newtp->fackets_out = 0; 483 newtp->fackets_out = 0;
484 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 484 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
485 tcp_enable_early_retrans(newtp);
485 486
486 /* So many TCP implementations out there (incorrectly) count the 487 /* So many TCP implementations out there (incorrectly) count the
487 * initial SYN frame in their delayed-ACK and congestion control 488 * initial SYN frame in their delayed-ACK and congestion control
@@ -574,7 +575,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
574 struct sock *child; 575 struct sock *child;
575 const struct tcphdr *th = tcp_hdr(skb); 576 const struct tcphdr *th = tcp_hdr(skb);
576 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 577 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
577 int paws_reject = 0; 578 bool paws_reject = false;
578 579
579 tmp_opt.saw_tstamp = 0; 580 tmp_opt.saw_tstamp = 0;
580 if (th->doff > (sizeof(struct tcphdr)>>2)) { 581 if (th->doff > (sizeof(struct tcphdr)>>2)) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 364784a91939..803cbfe82fbc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -34,6 +34,8 @@
34 * 34 *
35 */ 35 */
36 36
37#define pr_fmt(fmt) "TCP: " fmt
38
37#include <net/tcp.h> 39#include <net/tcp.h>
38 40
39#include <linux/compiler.h> 41#include <linux/compiler.h>
@@ -78,9 +80,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
78 tp->frto_counter = 3; 80 tp->frto_counter = 3;
79 81
80 tp->packets_out += tcp_skb_pcount(skb); 82 tp->packets_out += tcp_skb_pcount(skb);
81 if (!prior_packets) 83 if (!prior_packets || tp->early_retrans_delayed)
82 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 84 tcp_rearm_rto(sk);
83 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
84} 85}
85 86
86/* SND.NXT, if window was not shrunk. 87/* SND.NXT, if window was not shrunk.
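tcp_event_new_data_sent() used to arm the retransmit timer only when the first packet was put in flight; with delayed early retransmit, the pending timer may be a shortened ER delay rather than a full RTO, so sending new data while early_retrans_delayed is set also goes through tcp_rearm_rto() to recompute the correct expiry. The new condition, annotated:

        tp->packets_out += tcp_skb_pcount(skb);
        /* Re-arm if this is the first segment in flight, or if the pending
         * timer is a (shorter) delayed-early-retransmit timer that no
         * longer matches the real RTO for the newly sent data.
         */
        if (!prior_packets || tp->early_retrans_delayed)
                tcp_rearm_rto(sk);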
@@ -369,7 +370,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
369 TCP_SKB_CB(skb)->end_seq = seq; 370 TCP_SKB_CB(skb)->end_seq = seq;
370} 371}
371 372
372static inline int tcp_urg_mode(const struct tcp_sock *tp) 373static inline bool tcp_urg_mode(const struct tcp_sock *tp)
373{ 374{
374 return tp->snd_una != tp->snd_up; 375 return tp->snd_una != tp->snd_up;
375} 376}
@@ -563,13 +564,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
563/* Compute TCP options for SYN packets. This is not the final 564/* Compute TCP options for SYN packets. This is not the final
564 * network wire format yet. 565 * network wire format yet.
565 */ 566 */
566static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, 567static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
567 struct tcp_out_options *opts, 568 struct tcp_out_options *opts,
568 struct tcp_md5sig_key **md5) 569 struct tcp_md5sig_key **md5)
569{ 570{
570 struct tcp_sock *tp = tcp_sk(sk); 571 struct tcp_sock *tp = tcp_sk(sk);
571 struct tcp_cookie_values *cvp = tp->cookie_values; 572 struct tcp_cookie_values *cvp = tp->cookie_values;
572 unsigned remaining = MAX_TCP_OPTION_SPACE; 573 unsigned int remaining = MAX_TCP_OPTION_SPACE;
573 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? 574 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
574 tcp_cookie_size_check(cvp->cookie_desired) : 575 tcp_cookie_size_check(cvp->cookie_desired) :
575 0; 576 0;
@@ -663,15 +664,15 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
663} 664}
664 665
665/* Set up TCP options for SYN-ACKs. */ 666/* Set up TCP options for SYN-ACKs. */
666static unsigned tcp_synack_options(struct sock *sk, 667static unsigned int tcp_synack_options(struct sock *sk,
667 struct request_sock *req, 668 struct request_sock *req,
668 unsigned mss, struct sk_buff *skb, 669 unsigned int mss, struct sk_buff *skb,
669 struct tcp_out_options *opts, 670 struct tcp_out_options *opts,
670 struct tcp_md5sig_key **md5, 671 struct tcp_md5sig_key **md5,
671 struct tcp_extend_values *xvp) 672 struct tcp_extend_values *xvp)
672{ 673{
673 struct inet_request_sock *ireq = inet_rsk(req); 674 struct inet_request_sock *ireq = inet_rsk(req);
674 unsigned remaining = MAX_TCP_OPTION_SPACE; 675 unsigned int remaining = MAX_TCP_OPTION_SPACE;
675 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? 676 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
676 xvp->cookie_plus : 677 xvp->cookie_plus :
677 0; 678 0;
@@ -742,13 +743,13 @@ static unsigned tcp_synack_options(struct sock *sk,
742/* Compute TCP options for ESTABLISHED sockets. This is not the 743/* Compute TCP options for ESTABLISHED sockets. This is not the
743 * final wire format yet. 744 * final wire format yet.
744 */ 745 */
745static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, 746static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
746 struct tcp_out_options *opts, 747 struct tcp_out_options *opts,
747 struct tcp_md5sig_key **md5) 748 struct tcp_md5sig_key **md5)
748{ 749{
749 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; 750 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
750 struct tcp_sock *tp = tcp_sk(sk); 751 struct tcp_sock *tp = tcp_sk(sk);
751 unsigned size = 0; 752 unsigned int size = 0;
752 unsigned int eff_sacks; 753 unsigned int eff_sacks;
753 754
754#ifdef CONFIG_TCP_MD5SIG 755#ifdef CONFIG_TCP_MD5SIG
@@ -770,9 +771,9 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
770 771
771 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; 772 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
772 if (unlikely(eff_sacks)) { 773 if (unlikely(eff_sacks)) {
773 const unsigned remaining = MAX_TCP_OPTION_SPACE - size; 774 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
774 opts->num_sack_blocks = 775 opts->num_sack_blocks =
775 min_t(unsigned, eff_sacks, 776 min_t(unsigned int, eff_sacks,
776 (remaining - TCPOLEN_SACK_BASE_ALIGNED) / 777 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
777 TCPOLEN_SACK_PERBLOCK); 778 TCPOLEN_SACK_PERBLOCK);
778 size += TCPOLEN_SACK_BASE_ALIGNED + 779 size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -801,7 +802,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
801 struct tcp_sock *tp; 802 struct tcp_sock *tp;
802 struct tcp_skb_cb *tcb; 803 struct tcp_skb_cb *tcb;
803 struct tcp_out_options opts; 804 struct tcp_out_options opts;
804 unsigned tcp_options_size, tcp_header_size; 805 unsigned int tcp_options_size, tcp_header_size;
805 struct tcp_md5sig_key *md5; 806 struct tcp_md5sig_key *md5;
806 struct tcphdr *th; 807 struct tcphdr *th;
807 int err; 808 int err;
@@ -1096,6 +1097,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
1096 eat = min_t(int, len, skb_headlen(skb)); 1097 eat = min_t(int, len, skb_headlen(skb));
1097 if (eat) { 1098 if (eat) {
1098 __skb_pull(skb, eat); 1099 __skb_pull(skb, eat);
1100 skb->avail_size -= eat;
1099 len -= eat; 1101 len -= eat;
1100 if (!len) 1102 if (!len)
1101 return; 1103 return;
@@ -1149,7 +1151,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1149} 1151}
1150 1152
1151/* Calculate MSS. Not accounting for SACKs here. */ 1153/* Calculate MSS. Not accounting for SACKs here. */
1152int tcp_mtu_to_mss(const struct sock *sk, int pmtu) 1154int tcp_mtu_to_mss(struct sock *sk, int pmtu)
1153{ 1155{
1154 const struct tcp_sock *tp = tcp_sk(sk); 1156 const struct tcp_sock *tp = tcp_sk(sk);
1155 const struct inet_connection_sock *icsk = inet_csk(sk); 1157 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1160,6 +1162,14 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
1160 */ 1162 */
1161 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); 1163 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1162 1164
1165 /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
1166 if (icsk->icsk_af_ops->net_frag_header_len) {
1167 const struct dst_entry *dst = __sk_dst_get(sk);
1168
1169 if (dst && dst_allfrag(dst))
1170 mss_now -= icsk->icsk_af_ops->net_frag_header_len;
1171 }
1172
1163 /* Clamp it (mss_clamp does not include tcp options) */ 1173 /* Clamp it (mss_clamp does not include tcp options) */
1164 if (mss_now > tp->rx_opt.mss_clamp) 1174 if (mss_now > tp->rx_opt.mss_clamp)
1165 mss_now = tp->rx_opt.mss_clamp; 1175 mss_now = tp->rx_opt.mss_clamp;
@@ -1178,7 +1188,7 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
1178} 1188}
1179 1189
1180/* Inverse of above */ 1190/* Inverse of above */
1181int tcp_mss_to_mtu(const struct sock *sk, int mss) 1191int tcp_mss_to_mtu(struct sock *sk, int mss)
1182{ 1192{
1183 const struct tcp_sock *tp = tcp_sk(sk); 1193 const struct tcp_sock *tp = tcp_sk(sk);
1184 const struct inet_connection_sock *icsk = inet_csk(sk); 1194 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1189,6 +1199,13 @@ int tcp_mss_to_mtu(const struct sock *sk, int mss)
1189 icsk->icsk_ext_hdr_len + 1199 icsk->icsk_ext_hdr_len +
1190 icsk->icsk_af_ops->net_header_len; 1200 icsk->icsk_af_ops->net_header_len;
1191 1201
1202 /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
1203 if (icsk->icsk_af_ops->net_frag_header_len) {
1204 const struct dst_entry *dst = __sk_dst_get(sk);
1205
1206 if (dst && dst_allfrag(dst))
1207 mtu += icsk->icsk_af_ops->net_frag_header_len;
1208 }
1192 return mtu; 1209 return mtu;
1193} 1210}
1194 1211
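The two MSS/MTU hunks above add mirrored adjustments: when the address family supplies net_frag_header_len (IPv6) and the cached route has RTAX_FEATURE_ALLFRAG set, every packet carries a fragment header, so it is subtracted when deriving the MSS from the path MTU and added back when converting an MSS to an MTU; this is presumably also why the const qualifier is dropped from the sock argument of both helpers. The shared pattern:

        /* IPv6 adds a frag_hdr when RTAX_FEATURE_ALLFRAG is set on the route.
         * tcp_mtu_to_mss() subtracts it; tcp_mss_to_mtu() adds it back.
         */
        if (icsk->icsk_af_ops->net_frag_header_len) {
                const struct dst_entry *dst = __sk_dst_get(sk);

                if (dst && dst_allfrag(dst))
                        mss_now -= icsk->icsk_af_ops->net_frag_header_len;
        }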
@@ -1258,7 +1275,7 @@ unsigned int tcp_current_mss(struct sock *sk)
1258 const struct tcp_sock *tp = tcp_sk(sk); 1275 const struct tcp_sock *tp = tcp_sk(sk);
1259 const struct dst_entry *dst = __sk_dst_get(sk); 1276 const struct dst_entry *dst = __sk_dst_get(sk);
1260 u32 mss_now; 1277 u32 mss_now;
1261 unsigned header_len; 1278 unsigned int header_len;
1262 struct tcp_out_options opts; 1279 struct tcp_out_options opts;
1263 struct tcp_md5sig_key *md5; 1280 struct tcp_md5sig_key *md5;
1264 1281
@@ -1374,33 +1391,33 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
1374} 1391}
1375 1392
1376/* Minshall's variant of the Nagle send check. */ 1393/* Minshall's variant of the Nagle send check. */
1377static inline int tcp_minshall_check(const struct tcp_sock *tp) 1394static inline bool tcp_minshall_check(const struct tcp_sock *tp)
1378{ 1395{
1379 return after(tp->snd_sml, tp->snd_una) && 1396 return after(tp->snd_sml, tp->snd_una) &&
1380 !after(tp->snd_sml, tp->snd_nxt); 1397 !after(tp->snd_sml, tp->snd_nxt);
1381} 1398}
1382 1399
1383/* Return 0, if packet can be sent now without violation Nagle's rules: 1400/* Return false, if packet can be sent now without violation Nagle's rules:
1384 * 1. It is full sized. 1401 * 1. It is full sized.
1385 * 2. Or it contains FIN. (already checked by caller) 1402 * 2. Or it contains FIN. (already checked by caller)
1386 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. 1403 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
1387 * 4. Or TCP_CORK is not set, and all sent packets are ACKed. 1404 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1388 * With Minshall's modification: all sent small packets are ACKed. 1405 * With Minshall's modification: all sent small packets are ACKed.
1389 */ 1406 */
1390static inline int tcp_nagle_check(const struct tcp_sock *tp, 1407static inline bool tcp_nagle_check(const struct tcp_sock *tp,
1391 const struct sk_buff *skb, 1408 const struct sk_buff *skb,
1392 unsigned mss_now, int nonagle) 1409 unsigned int mss_now, int nonagle)
1393{ 1410{
1394 return skb->len < mss_now && 1411 return skb->len < mss_now &&
1395 ((nonagle & TCP_NAGLE_CORK) || 1412 ((nonagle & TCP_NAGLE_CORK) ||
1396 (!nonagle && tp->packets_out && tcp_minshall_check(tp))); 1413 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1397} 1414}
1398 1415
1399/* Return non-zero if the Nagle test allows this packet to be 1416/* Return true if the Nagle test allows this packet to be
1400 * sent now. 1417 * sent now.
1401 */ 1418 */
1402static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, 1419static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1403 unsigned int cur_mss, int nonagle) 1420 unsigned int cur_mss, int nonagle)
1404{ 1421{
1405 /* Nagle rule does not apply to frames, which sit in the middle of the 1422 /* Nagle rule does not apply to frames, which sit in the middle of the
1406 * write_queue (they have no chances to get new data). 1423 * write_queue (they have no chances to get new data).
@@ -1409,24 +1426,25 @@ static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff
1409 * argument based upon the location of SKB in the send queue. 1426 * argument based upon the location of SKB in the send queue.
1410 */ 1427 */
1411 if (nonagle & TCP_NAGLE_PUSH) 1428 if (nonagle & TCP_NAGLE_PUSH)
1412 return 1; 1429 return true;
1413 1430
1414 /* Don't use the nagle rule for urgent data (or for the final FIN). 1431 /* Don't use the nagle rule for urgent data (or for the final FIN).
1415 * Nagle can be ignored during F-RTO too (see RFC4138). 1432 * Nagle can be ignored during F-RTO too (see RFC4138).
1416 */ 1433 */
1417 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || 1434 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
1418 (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) 1435 (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1419 return 1; 1436 return true;
1420 1437
1421 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) 1438 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
1422 return 1; 1439 return true;
1423 1440
1424 return 0; 1441 return false;
1425} 1442}
1426 1443
1427/* Does at least the first segment of SKB fit into the send window? */ 1444/* Does at least the first segment of SKB fit into the send window? */
1428static inline int tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, 1445static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1429 unsigned int cur_mss) 1446 const struct sk_buff *skb,
1447 unsigned int cur_mss)
1430{ 1448{
1431 u32 end_seq = TCP_SKB_CB(skb)->end_seq; 1449 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1432 1450
@@ -1459,7 +1477,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
1459} 1477}
1460 1478
1461/* Test if sending is allowed right now. */ 1479/* Test if sending is allowed right now. */
1462int tcp_may_send_now(struct sock *sk) 1480bool tcp_may_send_now(struct sock *sk)
1463{ 1481{
1464 const struct tcp_sock *tp = tcp_sk(sk); 1482 const struct tcp_sock *tp = tcp_sk(sk);
1465 struct sk_buff *skb = tcp_send_head(sk); 1483 struct sk_buff *skb = tcp_send_head(sk);
@@ -1529,7 +1547,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1529 * 1547 *
1530 * This algorithm is from John Heffner. 1548 * This algorithm is from John Heffner.
1531 */ 1549 */
1532static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) 1550static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1533{ 1551{
1534 struct tcp_sock *tp = tcp_sk(sk); 1552 struct tcp_sock *tp = tcp_sk(sk);
1535 const struct inet_connection_sock *icsk = inet_csk(sk); 1553 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1589,11 +1607,11 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1589 /* Ok, it looks like it is advisable to defer. */ 1607 /* Ok, it looks like it is advisable to defer. */
1590 tp->tso_deferred = 1 | (jiffies << 1); 1608 tp->tso_deferred = 1 | (jiffies << 1);
1591 1609
1592 return 1; 1610 return true;
1593 1611
1594send_now: 1612send_now:
1595 tp->tso_deferred = 0; 1613 tp->tso_deferred = 0;
1596 return 0; 1614 return false;
1597} 1615}
1598 1616
1599/* Create a new MTU probe if we are ready. 1617/* Create a new MTU probe if we are ready.
@@ -1735,11 +1753,11 @@ static int tcp_mtu_probe(struct sock *sk)
1735 * snd_up-64k-mss .. snd_up cannot be large. However, taking into 1753 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
1736 * account rare use of URG, this is not a big flaw. 1754 * account rare use of URG, this is not a big flaw.
1737 * 1755 *
1738 * Returns 1, if no segments are in flight and we have queued segments, but 1756 * Returns true, if no segments are in flight and we have queued segments,
1739 * cannot send anything now because of SWS or another problem. 1757 * but cannot send anything now because of SWS or another problem.
1740 */ 1758 */
1741static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 1759static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1742 int push_one, gfp_t gfp) 1760 int push_one, gfp_t gfp)
1743{ 1761{
1744 struct tcp_sock *tp = tcp_sk(sk); 1762 struct tcp_sock *tp = tcp_sk(sk);
1745 struct sk_buff *skb; 1763 struct sk_buff *skb;
@@ -1753,7 +1771,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1753 /* Do MTU probing. */ 1771 /* Do MTU probing. */
1754 result = tcp_mtu_probe(sk); 1772 result = tcp_mtu_probe(sk);
1755 if (!result) { 1773 if (!result) {
1756 return 0; 1774 return false;
1757 } else if (result > 0) { 1775 } else if (result > 0) {
1758 sent_pkts = 1; 1776 sent_pkts = 1;
1759 } 1777 }
@@ -1812,7 +1830,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1812 1830
1813 if (likely(sent_pkts)) { 1831 if (likely(sent_pkts)) {
1814 tcp_cwnd_validate(sk); 1832 tcp_cwnd_validate(sk);
1815 return 0; 1833 return false;
1816 } 1834 }
1817 return !tp->packets_out && tcp_send_head(sk); 1835 return !tp->packets_out && tcp_send_head(sk);
1818} 1836}
@@ -2011,22 +2029,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2011} 2029}
2012 2030
2013/* Check if coalescing SKBs is legal. */ 2031/* Check if coalescing SKBs is legal. */
2014static int tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) 2032static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2015{ 2033{
2016 if (tcp_skb_pcount(skb) > 1) 2034 if (tcp_skb_pcount(skb) > 1)
2017 return 0; 2035 return false;
2018 /* TODO: SACK collapsing could be used to remove this condition */ 2036 /* TODO: SACK collapsing could be used to remove this condition */
2019 if (skb_shinfo(skb)->nr_frags != 0) 2037 if (skb_shinfo(skb)->nr_frags != 0)
2020 return 0; 2038 return false;
2021 if (skb_cloned(skb)) 2039 if (skb_cloned(skb))
2022 return 0; 2040 return false;
2023 if (skb == tcp_send_head(sk)) 2041 if (skb == tcp_send_head(sk))
2024 return 0; 2042 return false;
2025 /* Some heurestics for collapsing over SACK'd could be invented */ 2043 /* Some heurestics for collapsing over SACK'd could be invented */
2026 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 2044 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2027 return 0; 2045 return false;
2028 2046
2029 return 1; 2047 return true;
2030} 2048}
2031 2049
2032/* Collapse packets in the retransmit queue to make to create 2050/* Collapse packets in the retransmit queue to make to create
@@ -2037,7 +2055,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2037{ 2055{
2038 struct tcp_sock *tp = tcp_sk(sk); 2056 struct tcp_sock *tp = tcp_sk(sk);
2039 struct sk_buff *skb = to, *tmp; 2057 struct sk_buff *skb = to, *tmp;
2040 int first = 1; 2058 bool first = true;
2041 2059
2042 if (!sysctl_tcp_retrans_collapse) 2060 if (!sysctl_tcp_retrans_collapse)
2043 return; 2061 return;
@@ -2051,7 +2069,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2051 space -= skb->len; 2069 space -= skb->len;
2052 2070
2053 if (first) { 2071 if (first) {
2054 first = 0; 2072 first = false;
2055 continue; 2073 continue;
2056 } 2074 }
2057 2075
@@ -2060,7 +2078,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2060 /* Punt if not enough space exists in the first SKB for 2078 /* Punt if not enough space exists in the first SKB for
2061 * the data in the second 2079 * the data in the second
2062 */ 2080 */
2063 if (skb->len > skb_tailroom(to)) 2081 if (skb->len > skb_availroom(to))
2064 break; 2082 break;
2065 2083
2066 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) 2084 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
@@ -2166,8 +2184,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2166 2184
2167#if FASTRETRANS_DEBUG > 0 2185#if FASTRETRANS_DEBUG > 0
2168 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { 2186 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2169 if (net_ratelimit()) 2187 net_dbg_ratelimited("retrans_out leaked\n");
2170 printk(KERN_DEBUG "retrans_out leaked.\n");
2171 } 2188 }
2172#endif 2189#endif
2173 if (!tp->retrans_out) 2190 if (!tp->retrans_out)
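With pr_fmt defined as "TCP: " at the top of the file (before any header that pulls in printk.h), the printk conversions in this file gain a consistent prefix automatically, and net_dbg_ratelimited() replaces the open-coded net_ratelimit()/printk(KERN_DEBUG ...) pair with a single rate-limited debug call (still subject to the usual pr_debug/dynamic-debug gating). For example:

#define pr_fmt(fmt) "TCP: " fmt   /* prefixes every pr_* / net_*_ratelimited call */

        /* one rate-limited KERN_DEBUG line: "TCP: retrans_out leaked" */
        net_dbg_ratelimited("retrans_out leaked\n");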
@@ -2192,18 +2209,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2192/* Check if we forward retransmits are possible in the current 2209/* Check if we forward retransmits are possible in the current
2193 * window/congestion state. 2210 * window/congestion state.
2194 */ 2211 */
2195static int tcp_can_forward_retransmit(struct sock *sk) 2212static bool tcp_can_forward_retransmit(struct sock *sk)
2196{ 2213{
2197 const struct inet_connection_sock *icsk = inet_csk(sk); 2214 const struct inet_connection_sock *icsk = inet_csk(sk);
2198 const struct tcp_sock *tp = tcp_sk(sk); 2215 const struct tcp_sock *tp = tcp_sk(sk);
2199 2216
2200 /* Forward retransmissions are possible only during Recovery. */ 2217 /* Forward retransmissions are possible only during Recovery. */
2201 if (icsk->icsk_ca_state != TCP_CA_Recovery) 2218 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2202 return 0; 2219 return false;
2203 2220
2204 /* No forward retransmissions in Reno are possible. */ 2221 /* No forward retransmissions in Reno are possible. */
2205 if (tcp_is_reno(tp)) 2222 if (tcp_is_reno(tp))
2206 return 0; 2223 return false;
2207 2224
2208 /* Yeah, we have to make difficult choice between forward transmission 2225 /* Yeah, we have to make difficult choice between forward transmission
2209 * and retransmission... Both ways have their merits... 2226 * and retransmission... Both ways have their merits...
@@ -2214,9 +2231,9 @@ static int tcp_can_forward_retransmit(struct sock *sk)
2214 */ 2231 */
2215 2232
2216 if (tcp_may_send_now(sk)) 2233 if (tcp_may_send_now(sk))
2217 return 0; 2234 return false;
2218 2235
2219 return 1; 2236 return true;
2220} 2237}
2221 2238
2222/* This gets called after a retransmit timeout, and the initially 2239/* This gets called after a retransmit timeout, and the initially
@@ -2401,7 +2418,7 @@ int tcp_send_synack(struct sock *sk)
2401 2418
2402 skb = tcp_write_queue_head(sk); 2419 skb = tcp_write_queue_head(sk);
2403 if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 2420 if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2404 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); 2421 pr_debug("%s: wrong queue state\n", __func__);
2405 return -EFAULT; 2422 return -EFAULT;
2406 } 2423 }
2407 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { 2424 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -2561,7 +2578,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2561EXPORT_SYMBOL(tcp_make_synack); 2578EXPORT_SYMBOL(tcp_make_synack);
2562 2579
2563/* Do all connect socket setups that can be done AF independent. */ 2580/* Do all connect socket setups that can be done AF independent. */
2564static void tcp_connect_init(struct sock *sk) 2581void tcp_connect_init(struct sock *sk)
2565{ 2582{
2566 const struct dst_entry *dst = __sk_dst_get(sk); 2583 const struct dst_entry *dst = __sk_dst_get(sk);
2567 struct tcp_sock *tp = tcp_sk(sk); 2584 struct tcp_sock *tp = tcp_sk(sk);
@@ -2616,9 +2633,12 @@ static void tcp_connect_init(struct sock *sk)
2616 tp->snd_una = tp->write_seq; 2633 tp->snd_una = tp->write_seq;
2617 tp->snd_sml = tp->write_seq; 2634 tp->snd_sml = tp->write_seq;
2618 tp->snd_up = tp->write_seq; 2635 tp->snd_up = tp->write_seq;
2619 tp->rcv_nxt = 0; 2636 tp->snd_nxt = tp->write_seq;
2620 tp->rcv_wup = 0; 2637
2621 tp->copied_seq = 0; 2638 if (likely(!tp->repair))
2639 tp->rcv_nxt = 0;
2640 tp->rcv_wup = tp->rcv_nxt;
2641 tp->copied_seq = tp->rcv_nxt;
2622 2642
2623 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 2643 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2624 inet_csk(sk)->icsk_retransmits = 0; 2644 inet_csk(sk)->icsk_retransmits = 0;
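In tcp_connect_init(), snd_nxt is now seeded together with the other send-side counters, and the receive-side state is only zeroed when the socket is not in repair mode: a socket being restored through the TCP repair machinery (tp->repair) keeps its checkpointed rcv_nxt, and rcv_wup/copied_seq simply follow whatever rcv_nxt holds. Condensed:

        tp->snd_nxt = tp->write_seq;

        if (likely(!tp->repair))
                tp->rcv_nxt = 0;          /* normal connect: fresh receive state */
        tp->rcv_wup = tp->rcv_nxt;        /* repair: track the restored rcv_nxt */
        tp->copied_seq = tp->rcv_nxt;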
@@ -2641,7 +2661,6 @@ int tcp_connect(struct sock *sk)
2641 /* Reserve space for headers. */ 2661 /* Reserve space for headers. */
2642 skb_reserve(buff, MAX_TCP_HEADER); 2662 skb_reserve(buff, MAX_TCP_HEADER);
2643 2663
2644 tp->snd_nxt = tp->write_seq;
2645 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); 2664 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
2646 TCP_ECN_send_syn(sk, buff); 2665 TCP_ECN_send_syn(sk, buff);
2647 2666
@@ -2790,6 +2809,15 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
2790 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); 2809 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
2791} 2810}
2792 2811
2812void tcp_send_window_probe(struct sock *sk)
2813{
2814 if (sk->sk_state == TCP_ESTABLISHED) {
2815 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
2816 tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
2817 tcp_xmit_probe_skb(sk, 0);
2818 }
2819}
2820
2793/* Initiate keepalive or window probe from timer. */ 2821/* Initiate keepalive or window probe from timer. */
2794int tcp_write_wakeup(struct sock *sk) 2822int tcp_write_wakeup(struct sock *sk)
2795{ 2823{
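The new tcp_send_window_probe() helper only acts on an ESTABLISHED socket: it backs snd_wl1 off to rcv_nxt - 1 so the ACK elicited by the probe is treated as carrying a fresh window update, resyncs snd_nxt with write_seq, and sends a zero-length probe via tcp_xmit_probe_skb(). A lightly annotated copy:

void tcp_send_window_probe(struct sock *sk)
{
        if (sk->sk_state == TCP_ESTABLISHED) {
                /* let the next incoming segment pass the window-update check */
                tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
                tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
                /* zero-data segment below snd_una; forces the peer to ACK */
                tcp_xmit_probe_skb(sk, 0);
        }
}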
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index a981cdc0a6e9..4526fe68e60e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -91,7 +91,7 @@ static inline int tcp_probe_avail(void)
91 * Note: arguments must match tcp_rcv_established()! 91 * Note: arguments must match tcp_rcv_established()!
92 */ 92 */
93static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, 93static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
94 struct tcphdr *th, unsigned len) 94 struct tcphdr *th, unsigned int len)
95{ 95{
96 const struct tcp_sock *tp = tcp_sk(sk); 96 const struct tcp_sock *tp = tcp_sk(sk);
97 const struct inet_sock *inet = inet_sk(sk); 97 const struct inet_sock *inet = inet_sk(sk);
@@ -138,7 +138,7 @@ static struct jprobe tcp_jprobe = {
138 .entry = jtcp_rcv_established, 138 .entry = jtcp_rcv_established,
139}; 139};
140 140
141static int tcpprobe_open(struct inode * inode, struct file * file) 141static int tcpprobe_open(struct inode *inode, struct file *file)
142{ 142{
143 /* Reset (empty) log */ 143 /* Reset (empty) log */
144 spin_lock_bh(&tcp_probe.lock); 144 spin_lock_bh(&tcp_probe.lock);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 34d4a02c2f16..e911e6c523ec 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -319,6 +319,11 @@ void tcp_retransmit_timer(struct sock *sk)
319 struct tcp_sock *tp = tcp_sk(sk); 319 struct tcp_sock *tp = tcp_sk(sk);
320 struct inet_connection_sock *icsk = inet_csk(sk); 320 struct inet_connection_sock *icsk = inet_csk(sk);
321 321
322 if (tp->early_retrans_delayed) {
323 tcp_resume_early_retransmit(sk);
324 return;
325 }
326
322 if (!tp->packets_out) 327 if (!tp->packets_out)
323 goto out; 328 goto out;
324 329
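On the timer side, tcp_retransmit_timer() now checks early_retrans_delayed first: if the expiry that fired was the shortened delayed-early-retransmit timer rather than a real RTO, the handler defers to tcp_resume_early_retransmit() and skips the normal timeout processing (backoff, congestion response, full retransmission). The added dispatch:

        /* The timer may have been armed for a delayed early retransmit,
         * not a full RTO; let the ER path handle it and return early.
         */
        if (tp->early_retrans_delayed) {
                tcp_resume_early_retransmit(sk);
                return;
        }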
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fe141052a1be..609397ee78fb 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -107,6 +107,7 @@
107#include <net/checksum.h> 107#include <net/checksum.h>
108#include <net/xfrm.h> 108#include <net/xfrm.h>
109#include <trace/events/udp.h> 109#include <trace/events/udp.h>
110#include <linux/static_key.h>
110#include "udp_impl.h" 111#include "udp_impl.h"
111 112
112struct udp_table udp_table __read_mostly; 113struct udp_table udp_table __read_mostly;
@@ -206,7 +207,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
206 207
207 if (!snum) { 208 if (!snum) {
208 int low, high, remaining; 209 int low, high, remaining;
209 unsigned rand; 210 unsigned int rand;
210 unsigned short first, last; 211 unsigned short first, last;
211 DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); 212 DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
212 213
@@ -846,7 +847,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
846 * Get and verify the address. 847 * Get and verify the address.
847 */ 848 */
848 if (msg->msg_name) { 849 if (msg->msg_name) {
849 struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; 850 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
850 if (msg->msg_namelen < sizeof(*usin)) 851 if (msg->msg_namelen < sizeof(*usin))
851 return -EINVAL; 852 return -EINVAL;
852 if (usin->sin_family != AF_INET) { 853 if (usin->sin_family != AF_INET) {
@@ -1379,6 +1380,14 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1379 1380
1380} 1381}
1381 1382
1383static struct static_key udp_encap_needed __read_mostly;
1384void udp_encap_enable(void)
1385{
1386 if (!static_key_enabled(&udp_encap_needed))
1387 static_key_slow_inc(&udp_encap_needed);
1388}
1389EXPORT_SYMBOL(udp_encap_enable);
1390
1382/* returns: 1391/* returns:
1383 * -1: error 1392 * -1: error
1384 * 0: success 1393 * 0: success
@@ -1400,7 +1409,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1400 goto drop; 1409 goto drop;
1401 nf_reset(skb); 1410 nf_reset(skb);
1402 1411
1403 if (up->encap_type) { 1412 if (static_key_false(&udp_encap_needed) && up->encap_type) {
1404 int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); 1413 int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
1405 1414
1406 /* 1415 /*
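The encapsulation check in udp_queue_rcv_skb() now sits behind a static key: static_key_false() compiles to a patchable no-op branch, so sockets that never use UDP encapsulation pay nothing on the receive fast path, and udp_encap_enable() flips the branch on the first time an encap_type is configured. A minimal sketch of the same pattern in isolation (feature_needed, feature_enable, handle_feature and hot_path_example are illustrative names, not from the patch):

#include <linux/static_key.h>

static struct static_key feature_needed __read_mostly;   /* starts out false */
static void handle_feature(struct sk_buff *skb);          /* illustrative slow path */

void feature_enable(void)
{
        if (!static_key_enabled(&feature_needed))
                static_key_slow_inc(&feature_needed);     /* patch the branch in */
}

/* hot path: the test is a no-op jump until feature_enable() has run */
static void hot_path_example(struct sk_buff *skb, bool wants_feature)
{
        if (static_key_false(&feature_needed) && wants_feature)
                handle_feature(skb);
}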
@@ -1470,7 +1479,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1470 goto drop; 1479 goto drop;
1471 1480
1472 1481
1473 if (sk_rcvqueues_full(sk, skb)) 1482 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf))
1474 goto drop; 1483 goto drop;
1475 1484
1476 rc = 0; 1485 rc = 0;
@@ -1479,7 +1488,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1479 bh_lock_sock(sk); 1488 bh_lock_sock(sk);
1480 if (!sock_owned_by_user(sk)) 1489 if (!sock_owned_by_user(sk))
1481 rc = __udp_queue_rcv_skb(sk, skb); 1490 rc = __udp_queue_rcv_skb(sk, skb);
1482 else if (sk_add_backlog(sk, skb)) { 1491 else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
1483 bh_unlock_sock(sk); 1492 bh_unlock_sock(sk);
1484 goto drop; 1493 goto drop;
1485 } 1494 }
@@ -1760,6 +1769,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1760 /* FALLTHROUGH */ 1769 /* FALLTHROUGH */
1761 case UDP_ENCAP_L2TPINUDP: 1770 case UDP_ENCAP_L2TPINUDP:
1762 up->encap_type = val; 1771 up->encap_type = val;
1772 udp_encap_enable();
1763 break; 1773 break;
1764 default: 1774 default:
1765 err = -ENOPROTOOPT; 1775 err = -ENOPROTOOPT;
@@ -2163,9 +2173,15 @@ void udp4_proc_exit(void)
2163static __initdata unsigned long uhash_entries; 2173static __initdata unsigned long uhash_entries;
2164static int __init set_uhash_entries(char *str) 2174static int __init set_uhash_entries(char *str)
2165{ 2175{
2176 ssize_t ret;
2177
2166 if (!str) 2178 if (!str)
2167 return 0; 2179 return 0;
2168 uhash_entries = simple_strtoul(str, &str, 0); 2180
2181 ret = kstrtoul(str, 0, &uhash_entries);
2182 if (ret)
2183 return 0;
2184
2169 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) 2185 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
2170 uhash_entries = UDP_HTABLE_SIZE_MIN; 2186 uhash_entries = UDP_HTABLE_SIZE_MIN;
2171 return 1; 2187 return 1;
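set_uhash_entries() switches from simple_strtoul(), which cannot report failure, to kstrtoul(), which returns a negative errno for malformed input (including trailing garbage) and leaves the parameter at its default in that case; base 0 still accepts decimal, octal and hex forms. The resulting parser:

static int __init set_uhash_entries(char *str)
{
        ssize_t ret;

        if (!str)
                return 0;

        ret = kstrtoul(str, 0, &uhash_entries);   /* base 0: 123, 0777, 0x1f all accepted */
        if (ret)
                return 0;                         /* malformed value: keep the default */

        if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
                uhash_entries = UDP_HTABLE_SIZE_MIN;
        return 1;
}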
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 8a949f19deb6..a7f86a3cd502 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -146,9 +146,17 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
146 return udp_dump_one(&udp_table, in_skb, nlh, req); 146 return udp_dump_one(&udp_table, in_skb, nlh, req);
147} 147}
148 148
149static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
150 void *info)
151{
152 r->idiag_rqueue = sk_rmem_alloc_get(sk);
153 r->idiag_wqueue = sk_wmem_alloc_get(sk);
154}
155
149static const struct inet_diag_handler udp_diag_handler = { 156static const struct inet_diag_handler udp_diag_handler = {
150 .dump = udp_diag_dump, 157 .dump = udp_diag_dump,
151 .dump_one = udp_diag_dump_one, 158 .dump_one = udp_diag_dump_one,
159 .idiag_get_info = udp_diag_get_info,
152 .idiag_type = IPPROTO_UDP, 160 .idiag_type = IPPROTO_UDP,
153}; 161};
154 162
@@ -167,6 +175,7 @@ static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *
167static const struct inet_diag_handler udplite_diag_handler = { 175static const struct inet_diag_handler udplite_diag_handler = {
168 .dump = udplite_diag_dump, 176 .dump = udplite_diag_dump,
169 .dump_one = udplite_diag_dump_one, 177 .dump_one = udplite_diag_dump_one,
178 .idiag_get_info = udp_diag_get_info,
170 .idiag_type = IPPROTO_UDPLITE, 179 .idiag_type = IPPROTO_UDPLITE,
171}; 180};
172 181
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index aaad650d47d9..5a681e298b90 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
25 size_t len, int noblock, int flags, int *addr_len); 25 size_t len, int noblock, int flags, int *addr_len);
26extern int udp_sendpage(struct sock *sk, struct page *page, int offset, 26extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
27 size_t size, int flags); 27 size_t size, int flags);
28extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); 28extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
29extern void udp_destroy_sock(struct sock *sk); 29extern void udp_destroy_sock(struct sock *sk);
30 30
31#ifdef CONFIG_PROC_FS 31#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a0b4c5da8d43..0d3426cb5c4f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -152,7 +152,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
152 152
153 case IPPROTO_AH: 153 case IPPROTO_AH:
154 if (pskb_may_pull(skb, xprth + 8 - skb->data)) { 154 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
155 __be32 *ah_hdr = (__be32*)xprth; 155 __be32 *ah_hdr = (__be32 *)xprth;
156 156
157 fl4->fl4_ipsec_spi = ah_hdr[1]; 157 fl4->fl4_ipsec_spi = ah_hdr[1];
158 } 158 }
@@ -298,8 +298,8 @@ void __init xfrm4_init(int rt_max_size)
298 xfrm4_state_init(); 298 xfrm4_state_init();
299 xfrm4_policy_init(); 299 xfrm4_policy_init();
300#ifdef CONFIG_SYSCTL 300#ifdef CONFIG_SYSCTL
301 sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, 301 sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4",
302 xfrm4_policy_table); 302 xfrm4_policy_table);
303#endif 303#endif
304} 304}
305 305
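Finally, xfrm4_init() registers its sysctl table through register_net_sysctl(), which takes the directory as a plain path string; this replaces register_net_sysctl_table() plus the separate net_ipv4_ctl_path array of struct ctl_path entries that used to spell out the "net"/"ipv4" components. The registration now reads:

#ifdef CONFIG_SYSCTL
        /* path is a single string instead of a struct ctl_path array */
        sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4",
                                         xfrm4_policy_table);
#endif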