Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c | 110
-rw-r--r--  net/ipv4/datagram.c | 20
-rw-r--r--  net/ipv4/devinet.c | 9
-rw-r--r--  net/ipv4/gre_demux.c | 27
-rw-r--r--  net/ipv4/gre_offload.c | 16
-rw-r--r--  net/ipv4/icmp.c | 23
-rw-r--r--  net/ipv4/igmp.c | 16
-rw-r--r--  net/ipv4/inet_connection_sock.c | 11
-rw-r--r--  net/ipv4/inet_hashtables.c | 6
-rw-r--r--  net/ipv4/inetpeer.c | 20
-rw-r--r--  net/ipv4/ip_forward.c | 2
-rw-r--r--  net/ipv4/ip_gre.c | 7
-rw-r--r--  net/ipv4/ip_options.c | 6
-rw-r--r--  net/ipv4/ip_output.c | 22
-rw-r--r--  net/ipv4/ip_tunnel.c | 25
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 10
-rw-r--r--  net/ipv4/ip_vti.c | 8
-rw-r--r--  net/ipv4/ipip.c | 5
-rw-r--r--  net/ipv4/ipmr.c | 4
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c | 14
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 2
-rw-r--r--  net/ipv4/netfilter/nft_chain_nat_ipv4.c | 12
-rw-r--r--  net/ipv4/proc.c | 24
-rw-r--r--  net/ipv4/raw.c | 2
-rw-r--r--  net/ipv4/route.c | 52
-rw-r--r--  net/ipv4/syncookies.c | 3
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 45
-rw-r--r--  net/ipv4/tcp.c | 8
-rw-r--r--  net/ipv4/tcp_bic.c | 5
-rw-r--r--  net/ipv4/tcp_cong.c | 24
-rw-r--r--  net/ipv4/tcp_cubic.c | 5
-rw-r--r--  net/ipv4/tcp_fastopen.c | 219
-rw-r--r--  net/ipv4/tcp_highspeed.c | 4
-rw-r--r--  net/ipv4/tcp_htcp.c | 4
-rw-r--r--  net/ipv4/tcp_hybla.c | 7
-rw-r--r--  net/ipv4/tcp_illinois.c | 5
-rw-r--r--  net/ipv4/tcp_input.c | 36
-rw-r--r--  net/ipv4/tcp_ipv4.c | 303
-rw-r--r--  net/ipv4/tcp_lp.c | 5
-rw-r--r--  net/ipv4/tcp_metrics.c | 5
-rw-r--r--  net/ipv4/tcp_minisocks.c | 31
-rw-r--r--  net/ipv4/tcp_offload.c | 9
-rw-r--r--  net/ipv4/tcp_output.c | 126
-rw-r--r--  net/ipv4/tcp_scalable.c | 5
-rw-r--r--  net/ipv4/tcp_vegas.c | 7
-rw-r--r--  net/ipv4/tcp_veno.c | 9
-rw-r--r--  net/ipv4/tcp_yeah.c | 5
-rw-r--r--  net/ipv4/udp.c | 135
-rw-r--r--  net/ipv4/udp_offload.c | 8
-rw-r--r--  net/ipv4/udplite.c | 1
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 2
-rw-r--r--  net/ipv4/xfrm4_output.c | 2
52 files changed, 715 insertions, 756 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6d6dd345bc4d..d5e6836cf772 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -254,7 +254,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
     struct inet_sock *inet;
     struct proto *answer_prot;
     unsigned char answer_flags;
-    char answer_no_check;
     int try_loading_module = 0;
     int err;
 
@@ -312,7 +311,6 @@ lookup_protocol:
 
     sock->ops = answer->ops;
     answer_prot = answer->prot;
-    answer_no_check = answer->no_check;
     answer_flags = answer->flags;
     rcu_read_unlock();
 
@@ -324,7 +322,6 @@ lookup_protocol:
        goto out;
 
     err = 0;
-    sk->sk_no_check = answer_no_check;
     if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = SK_CAN_REUSE;
 
@@ -1002,7 +999,6 @@ static struct inet_protosw inetsw_array[] =
        .protocol = IPPROTO_TCP,
        .prot = &tcp_prot,
        .ops = &inet_stream_ops,
-       .no_check = 0,
        .flags = INET_PROTOSW_PERMANENT |
                 INET_PROTOSW_ICSK,
     },
@@ -1012,7 +1008,6 @@ static struct inet_protosw inetsw_array[] =
        .protocol = IPPROTO_UDP,
        .prot = &udp_prot,
        .ops = &inet_dgram_ops,
-       .no_check = UDP_CSUM_DEFAULT,
        .flags = INET_PROTOSW_PERMANENT,
     },
 
@@ -1021,7 +1016,6 @@ static struct inet_protosw inetsw_array[] =
        .protocol = IPPROTO_ICMP,
        .prot = &ping_prot,
        .ops = &inet_dgram_ops,
-       .no_check = UDP_CSUM_DEFAULT,
        .flags = INET_PROTOSW_REUSE,
     },
 
@@ -1030,7 +1024,6 @@ static struct inet_protosw inetsw_array[] =
        .protocol = IPPROTO_IP, /* wild card */
        .prot = &raw_prot,
        .ops = &inet_sockraw_ops,
-       .no_check = UDP_CSUM_DEFAULT,
        .flags = INET_PROTOSW_REUSE,
     }
 };
@@ -1261,10 +1254,12 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
               SKB_GSO_DODGY |
               SKB_GSO_TCP_ECN |
               SKB_GSO_GRE |
+              SKB_GSO_GRE_CSUM |
               SKB_GSO_IPIP |
               SKB_GSO_SIT |
               SKB_GSO_TCPV6 |
               SKB_GSO_UDP_TUNNEL |
+              SKB_GSO_UDP_TUNNEL_CSUM |
               SKB_GSO_MPLS |
               0)))
        goto out;
@@ -1476,22 +1471,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 }
 EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
 
-unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
 {
     unsigned long res = 0;
-    int i, j;
+    int i;
 
-    for_each_possible_cpu(i) {
-       for (j = 0; j < SNMP_ARRAY_SZ; j++)
-           res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt);
-    }
+    for_each_possible_cpu(i)
+       res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
     return res;
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
 #if BITS_PER_LONG==32
 
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
 {
     u64 res = 0;
     int cpu;
@@ -1502,7 +1495,7 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
        u64 v;
        unsigned int start;
 
-       bhptr = per_cpu_ptr(mib[0], cpu);
+       bhptr = per_cpu_ptr(mib, cpu);
        syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
        do {
            start = u64_stats_fetch_begin_irq(syncp);
@@ -1516,25 +1509,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
 EXPORT_SYMBOL_GPL(snmp_fold_field64);
 #endif
 
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
-{
-    BUG_ON(ptr == NULL);
-    ptr[0] = __alloc_percpu(mibsize, align);
-    if (!ptr[0])
-       return -ENOMEM;
-
-#if SNMP_ARRAY_SZ == 2
-    ptr[1] = __alloc_percpu(mibsize, align);
-    if (!ptr[1]) {
-       free_percpu(ptr[0]);
-       ptr[0] = NULL;
-       return -ENOMEM;
-    }
-#endif
-    return 0;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_init);
-
 #ifdef CONFIG_IP_MULTICAST
 static const struct net_protocol igmp_protocol = {
     .handler = igmp_rcv,
@@ -1570,40 +1544,30 @@ static __net_init int ipv4_mib_init_net(struct net *net)
 {
     int i;
 
-    if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
-                      sizeof(struct tcp_mib),
-                      __alignof__(struct tcp_mib)) < 0)
+    net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+    if (!net->mib.tcp_statistics)
        goto err_tcp_mib;
-    if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
-                      sizeof(struct ipstats_mib),
-                      __alignof__(struct ipstats_mib)) < 0)
+    net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+    if (!net->mib.ip_statistics)
        goto err_ip_mib;
 
     for_each_possible_cpu(i) {
        struct ipstats_mib *af_inet_stats;
-       af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i);
+       af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
        u64_stats_init(&af_inet_stats->syncp);
-#if SNMP_ARRAY_SZ == 2
-       af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i);
-       u64_stats_init(&af_inet_stats->syncp);
-#endif
     }
 
-    if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
-                      sizeof(struct linux_mib),
-                      __alignof__(struct linux_mib)) < 0)
+    net->mib.net_statistics = alloc_percpu(struct linux_mib);
+    if (!net->mib.net_statistics)
        goto err_net_mib;
-    if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
-                      sizeof(struct udp_mib),
-                      __alignof__(struct udp_mib)) < 0)
+    net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+    if (!net->mib.udp_statistics)
        goto err_udp_mib;
-    if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
-                      sizeof(struct udp_mib),
-                      __alignof__(struct udp_mib)) < 0)
+    net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+    if (!net->mib.udplite_statistics)
        goto err_udplite_mib;
-    if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
-                      sizeof(struct icmp_mib),
-                      __alignof__(struct icmp_mib)) < 0)
+    net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+    if (!net->mib.icmp_statistics)
        goto err_icmp_mib;
     net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
                                           GFP_KERNEL);
@@ -1614,17 +1578,17 @@ static __net_init int ipv4_mib_init_net(struct net *net)
     return 0;
 
 err_icmpmsg_mib:
-    snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+    free_percpu(net->mib.icmp_statistics);
 err_icmp_mib:
-    snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+    free_percpu(net->mib.udplite_statistics);
 err_udplite_mib:
-    snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+    free_percpu(net->mib.udp_statistics);
 err_udp_mib:
-    snmp_mib_free((void __percpu **)net->mib.net_statistics);
+    free_percpu(net->mib.net_statistics);
 err_net_mib:
-    snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+    free_percpu(net->mib.ip_statistics);
 err_ip_mib:
-    snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+    free_percpu(net->mib.tcp_statistics);
 err_tcp_mib:
     return -ENOMEM;
 }
@@ -1632,12 +1596,12 @@ err_tcp_mib:
 static __net_exit void ipv4_mib_exit_net(struct net *net)
 {
     kfree(net->mib.icmpmsg_statistics);
-    snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
-    snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
-    snmp_mib_free((void __percpu **)net->mib.udp_statistics);
-    snmp_mib_free((void __percpu **)net->mib.net_statistics);
-    snmp_mib_free((void __percpu **)net->mib.ip_statistics);
-    snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+    free_percpu(net->mib.icmp_statistics);
+    free_percpu(net->mib.udplite_statistics);
+    free_percpu(net->mib.udp_statistics);
+    free_percpu(net->mib.net_statistics);
+    free_percpu(net->mib.ip_statistics);
+    free_percpu(net->mib.tcp_statistics);
 }
 
 static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1736,13 +1700,9 @@ static int __init inet_init(void)
 
     BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
 
-    sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
-    if (!sysctl_local_reserved_ports)
-       goto out;
-
     rc = proto_register(&tcp_prot, 1);
     if (rc)
-       goto out_free_reserved_ports;
+       goto out;
 
     rc = proto_register(&udp_prot, 1);
     if (rc)
@@ -1852,8 +1812,6 @@ out_unregister_udp_proto:
     proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
     proto_unregister(&tcp_prot);
-out_free_reserved_ports:
-    kfree(sysctl_local_reserved_ports);
     goto out;
 }
 
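The common thread in the af_inet.c hunks above: SNMP MIB counters move from an array of two percpu blocks (the old SNMP_ARRAY_SZ scheme) to a single percpu allocation, so snmp_mib_init()/snmp_mib_free() disappear in favour of plain alloc_percpu()/free_percpu(). A minimal self-contained sketch of the resulting lifecycle, using a hypothetical two-counter MIB rather than the kernel's real structs:

    #include <linux/percpu.h>

    struct demo_mib {
        unsigned long mibs[2];  /* hypothetical: in-packets, out-packets */
    };

    static struct demo_mib __percpu *demo;

    static int demo_init(void)
    {
        demo = alloc_percpu(struct demo_mib);   /* one block covers all CPUs */
        return demo ? 0 : -ENOMEM;
    }

    /* exactly the loop snmp_fold_field() is reduced to in the hunk above */
    static unsigned long demo_fold(int offt)
    {
        unsigned long res = 0;
        int i;

        for_each_possible_cpu(i)
            res += *((unsigned long *)per_cpu_ptr(demo, i) + offt);
        return res;
    }

    static void demo_exit(void)
    {
        free_percpu(demo);      /* replaces snmp_mib_free() */
    }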
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 8b5134c582f1..a3095fdefbed 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -86,18 +86,26 @@ out:
 }
 EXPORT_SYMBOL(ip4_datagram_connect);
 
+/* Because UDP xmit path can manipulate sk_dst_cache without holding
+ * socket lock, we need to use sk_dst_set() here,
+ * even if we own the socket lock.
+ */
 void ip4_datagram_release_cb(struct sock *sk)
 {
     const struct inet_sock *inet = inet_sk(sk);
     const struct ip_options_rcu *inet_opt;
     __be32 daddr = inet->inet_daddr;
+    struct dst_entry *dst;
     struct flowi4 fl4;
     struct rtable *rt;
 
-    if (! __sk_dst_get(sk) || __sk_dst_check(sk, 0))
-       return;
-
     rcu_read_lock();
+
+    dst = __sk_dst_get(sk);
+    if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+       rcu_read_unlock();
+       return;
+    }
     inet_opt = rcu_dereference(inet->inet_opt);
     if (inet_opt && inet_opt->opt.srr)
        daddr = inet_opt->opt.faddr;
@@ -105,8 +113,10 @@ void ip4_datagram_release_cb(struct sock *sk)
                inet->inet_saddr, inet->inet_dport,
                inet->inet_sport, sk->sk_protocol,
                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
-    if (!IS_ERR(rt))
-       __sk_dst_set(sk, &rt->dst);
+
+    dst = !IS_ERR(rt) ? &rt->dst : NULL;
+    sk_dst_set(sk, dst);
+
     rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
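The functional fix here is the switch from __sk_dst_set() to sk_dst_set(): as the new comment says, the UDP transmit path can replace sk->sk_dst_cache without the socket lock, so even a locked caller must use the atomic variant. A simplified sketch of what sk_dst_set() does (condensed from include/net/sock.h of this era, not a verbatim copy):

    static void sketch_sk_dst_set(struct sock *sk, struct dst_entry *dst)
    {
        struct dst_entry *old;

        /* atomic exchange: safe against concurrent lockless setters */
        old = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
        dst_release(old);   /* drop the reference of the entry we displaced */
    }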
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index bdbf68bb2e2d..e9449376b58e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -106,7 +106,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
 
 static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
-static DEFINE_SPINLOCK(inet_addr_hash_lock);
 
 static u32 inet_addr_hash(struct net *net, __be32 addr)
 {
@@ -119,16 +118,14 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
 {
     u32 hash = inet_addr_hash(net, ifa->ifa_local);
 
-    spin_lock(&inet_addr_hash_lock);
+    ASSERT_RTNL();
     hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
-    spin_unlock(&inet_addr_hash_lock);
 }
 
 static void inet_hash_remove(struct in_ifaddr *ifa)
 {
-    spin_lock(&inet_addr_hash_lock);
+    ASSERT_RTNL();
     hlist_del_init_rcu(&ifa->hash);
-    spin_unlock(&inet_addr_hash_lock);
 }
 
 /**
@@ -830,7 +827,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
     ifa_existing = find_matching_ifa(ifa);
     if (!ifa_existing) {
        /* It would be best to check for !NLM_F_CREATE here but
-        * userspace alreay relies on not having to provide this.
+        * userspace already relies on not having to provide this.
         */
        set_ifa_lifetime(ifa, valid_lft, prefered_lft);
        return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
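Dropping inet_addr_hash_lock is safe because every writer of inet_addr_lst already runs under the RTNL mutex and readers walk the hash under RCU; the spinlock protected nothing RTNL did not. ASSERT_RTNL() turns that assumption into a runtime check; a conceptual stand-in (hypothetical helper name, not the kernel macro verbatim):

    static inline void sketch_assert_rtnl(void)
    {
        /* writers must hold rtnl_mutex or the RCU hash update races */
        WARN_ONCE(!rtnl_is_locked(), "assertion failed: RTNL not held\n");
    }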
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 250be7421ab3..4e9619bca732 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -84,7 +84,8 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
            ptr--;
        }
        if (tpi->flags&TUNNEL_CSUM &&
-           !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
+           !(skb_shinfo(skb)->gso_type &
+             (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
            *ptr = 0;
            *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
                                                     skb->len, 0));
@@ -93,28 +94,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 }
 EXPORT_SYMBOL_GPL(gre_build_header);
 
-static __sum16 check_checksum(struct sk_buff *skb)
-{
-    __sum16 csum = 0;
-
-    switch (skb->ip_summed) {
-    case CHECKSUM_COMPLETE:
-       csum = csum_fold(skb->csum);
-
-       if (!csum)
-           break;
-       /* Fall through. */
-
-    case CHECKSUM_NONE:
-       skb->csum = 0;
-       csum = __skb_checksum_complete(skb);
-       skb->ip_summed = CHECKSUM_COMPLETE;
-       break;
-    }
-
-    return csum;
-}
-
 static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
                             bool *csum_err)
 {
@@ -141,7 +120,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 
     options = (__be32 *)(greh + 1);
     if (greh->flags & GRE_CSUM) {
-       if (check_checksum(skb)) {
+       if (skb_checksum_simple_validate(skb)) {
            *csum_err = true;
            return -EINVAL;
        }
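The same conversion repeats in icmp.c and igmp.c below: the open-coded CHECKSUM_COMPLETE/CHECKSUM_NONE switch becomes a call to the new core helper skb_checksum_simple_validate(). In rough outline the helper does what the deleted check_checksum() did (simplified sketch; the real inline in linux/skbuff.h is built on __skb_checksum_validate() and also caches the computed sum):

    /* returns 0 when the full-packet checksum is valid, non-zero otherwise */
    static __sum16 sketch_checksum_simple_validate(struct sk_buff *skb)
    {
        /* a hardware CHECKSUM_COMPLETE sum that folds to zero is proof */
        if (skb->ip_summed == CHECKSUM_COMPLETE && !csum_fold(skb->csum))
            return 0;

        /* otherwise compute the sum in software over the whole packet */
        skb->csum = 0;
        return __skb_checksum_complete(skb);
    }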
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index f1d32280cb54..eb92deb12666 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -42,6 +42,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
                   SKB_GSO_DODGY |
                   SKB_GSO_TCP_ECN |
                   SKB_GSO_GRE |
+                  SKB_GSO_GRE_CSUM |
                   SKB_GSO_IPIP)))
        goto out;
 
@@ -55,6 +56,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
        goto out;
 
     csum = !!(greh->flags & GRE_CSUM);
+    if (csum)
+       skb->encap_hdr_csum = 1;
 
     if (unlikely(!pskb_may_pull(skb, ghl)))
        goto out;
@@ -94,10 +97,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
            }
        }
 
-       greh = (struct gre_base_hdr *)(skb->data);
+       skb_reset_transport_header(skb);
+
+       greh = (struct gre_base_hdr *)
+           skb_transport_header(skb);
        pcsum = (__be32 *)(greh + 1);
        *pcsum = 0;
-       *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0));
+       *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
     }
     __skb_push(skb, tnl_hlen - ghl);
 
@@ -125,10 +131,12 @@ static __sum16 gro_skb_checksum(struct sk_buff *skb)
                    csum_partial(skb->data, skb_gro_offset(skb), 0));
     sum = csum_fold(NAPI_GRO_CB(skb)->csum);
     if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
-       if (unlikely(!sum))
+       if (unlikely(!sum) && !skb->csum_complete_sw)
            netdev_rx_csum_fault(skb->dev);
-    } else
+    } else {
        skb->ip_summed = CHECKSUM_COMPLETE;
+       skb->csum_complete_sw = 1;
+    }
 
     return sum;
 }
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0134663fdbce..79c3d947a481 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
     struct sock *sk;
     struct inet_sock *inet;
     __be32 daddr, saddr;
+    u32 mark = IP4_REPLY_MARK(net, skb->mark);
 
     if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
        return;
@@ -349,6 +350,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
     icmp_param->data.icmph.checksum = 0;
 
     inet->tos = ip_hdr(skb)->tos;
+    sk->sk_mark = mark;
     daddr = ipc.addr = ip_hdr(skb)->saddr;
     saddr = fib_compute_spec_dst(skb);
     ipc.opt = NULL;
@@ -364,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = daddr;
        fl4.saddr = saddr;
+       fl4.flowi4_mark = mark;
        fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
        fl4.flowi4_proto = IPPROTO_ICMP;
        security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -382,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
                                        struct flowi4 *fl4,
                                        struct sk_buff *skb_in,
                                        const struct iphdr *iph,
-                                       __be32 saddr, u8 tos,
+                                       __be32 saddr, u8 tos, u32 mark,
                                        int type, int code,
                                        struct icmp_bxm *param)
 {
@@ -394,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
     fl4->daddr = (param->replyopts.opt.opt.srr ?
                   param->replyopts.opt.opt.faddr : iph->saddr);
     fl4->saddr = saddr;
+    fl4->flowi4_mark = mark;
     fl4->flowi4_tos = RT_TOS(tos);
     fl4->flowi4_proto = IPPROTO_ICMP;
     fl4->fl4_icmp_type = type;
@@ -491,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
     struct flowi4 fl4;
     __be32 saddr;
     u8 tos;
+    u32 mark;
     struct net *net;
     struct sock *sk;
 
@@ -592,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
     tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
                                        IPTOS_PREC_INTERNETCONTROL) :
                                       iph->tos;
+    mark = IP4_REPLY_MARK(net, skb_in->mark);
 
     if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
        goto out_unlock;
@@ -608,13 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
     icmp_param->skb = skb_in;
     icmp_param->offset = skb_network_offset(skb_in);
     inet_sk(sk)->tos = tos;
+    sk->sk_mark = mark;
     ipc.addr = iph->saddr;
     ipc.opt = &icmp_param->replyopts.opt;
     ipc.tx_flags = 0;
     ipc.ttl = 0;
     ipc.tos = -1;
 
-    rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
+    rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
                            type, code, icmp_param);
     if (IS_ERR(rt))
        goto out_unlock;
@@ -908,16 +915,8 @@ int icmp_rcv(struct sk_buff *skb)
 
     ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
 
-    switch (skb->ip_summed) {
-    case CHECKSUM_COMPLETE:
-       if (!csum_fold(skb->csum))
-           break;
-       /* fall through */
-    case CHECKSUM_NONE:
-       skb->csum = 0;
-       if (__skb_checksum_complete(skb))
-           goto csum_error;
-    }
+    if (skb_checksum_simple_validate(skb))
+       goto csum_error;
 
     if (!pskb_pull(skb, sizeof(*icmph)))
        goto error;
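The mark plumbing above implements fwmark reflection for ICMP: when the per-netns fwmark_reflect sysctl is enabled, replies and errors inherit the mark of the packet that triggered them, so mark-based routing policy applies in both directions. The IP4_REPLY_MARK() macro these hunks rely on amounts to (as added in this series, include/net/ip.h):

    #define IP4_REPLY_MARK(net, mark) \
        ((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0)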
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 97e4d1655d26..6748d420f714 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
     pip->saddr = fl4.saddr;
     pip->protocol = IPPROTO_IGMP;
     pip->tot_len = 0;   /* filled in later */
-    ip_select_ident(skb, &rt->dst, NULL);
+    ip_select_ident(skb, NULL);
     ((u8 *)&pip[1])[0] = IPOPT_RA;
     ((u8 *)&pip[1])[1] = 4;
     ((u8 *)&pip[1])[2] = 0;
@@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
     iph->daddr = dst;
     iph->saddr = fl4.saddr;
     iph->protocol = IPPROTO_IGMP;
-    ip_select_ident(skb, &rt->dst, NULL);
+    ip_select_ident(skb, NULL);
     ((u8 *)&iph[1])[0] = IPOPT_RA;
     ((u8 *)&iph[1])[1] = 4;
     ((u8 *)&iph[1])[2] = 0;
@@ -988,16 +988,8 @@ int igmp_rcv(struct sk_buff *skb)
     if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
        goto drop;
 
-    switch (skb->ip_summed) {
-    case CHECKSUM_COMPLETE:
-       if (!csum_fold(skb->csum))
-           break;
-       /* fall through */
-    case CHECKSUM_NONE:
-       skb->csum = 0;
-       if (__skb_checksum_complete(skb))
-           goto drop;
-    }
+    if (skb_checksum_simple_validate(skb))
+       goto drop;
 
     ih = igmp_hdr(skb);
     switch (ih->type) {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index a56b8e6e866a..14d02ea905b6 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -29,9 +29,6 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
 EXPORT_SYMBOL(inet_csk_timer_bug_msg);
 #endif
 
-unsigned long *sysctl_local_reserved_ports;
-EXPORT_SYMBOL(sysctl_local_reserved_ports);
-
 void inet_get_local_port_range(struct net *net, int *low, int *high)
 {
     unsigned int seq;
@@ -113,7 +110,7 @@ again:
 
        smallest_size = -1;
        do {
-           if (inet_is_reserved_local_port(rover))
+           if (inet_is_local_reserved_port(net, rover))
                goto next_nolock;
            head = &hashinfo->bhash[inet_bhashfn(net, rover,
                    hashinfo->bhash_size)];
@@ -408,7 +405,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
     struct net *net = sock_net(sk);
     int flags = inet_sk_flowi_flags(sk);
 
-    flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+    flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
                        RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                        sk->sk_protocol,
                        flags,
@@ -445,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 
     rcu_read_lock();
     opt = rcu_dereference(newinet->inet_opt);
-    flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+    flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
                        RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                        sk->sk_protocol, inet_sk_flowi_flags(sk),
                        (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
@@ -680,6 +677,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
        inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
        newsk->sk_write_space = sk_stream_write_space;
 
+       newsk->sk_mark = inet_rsk(req)->ir_mark;
+
        newicsk->icsk_retransmits = 0;
        newicsk->icsk_backoff = 0;
        newicsk->icsk_probes_out = 0;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8b9cf279450d..43116e8c8e13 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -274,7 +274,7 @@ struct sock *__inet_lookup_established(struct net *net,
                                   const __be32 daddr, const u16 hnum,
                                   const int dif)
 {
-    INET_ADDR_COOKIE(acookie, saddr, daddr)
+    INET_ADDR_COOKIE(acookie, saddr, daddr);
     const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
     struct sock *sk;
     const struct hlist_nulls_node *node;
@@ -327,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
     __be32 daddr = inet->inet_rcv_saddr;
     __be32 saddr = inet->inet_daddr;
     int dif = sk->sk_bound_dev_if;
-    INET_ADDR_COOKIE(acookie, saddr, daddr)
+    INET_ADDR_COOKIE(acookie, saddr, daddr);
     const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
     struct net *net = sock_net(sk);
     unsigned int hash = inet_ehashfn(net, daddr, lport,
@@ -500,7 +500,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        local_bh_disable();
        for (i = 1; i <= remaining; i++) {
            port = low + (i + offset) % remaining;
-           if (inet_is_reserved_local_port(port))
+           if (inet_is_local_reserved_port(net, port))
                continue;
            head = &hinfo->bhash[inet_bhashfn(net, port,
                    hinfo->bhash_size)];
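inet_is_reserved_local_port() becoming inet_is_local_reserved_port(net, port) pairs with the removal of the global sysctl_local_reserved_ports bitmap in af_inet.c and inet_connection_sock.c above: the reserved-ports bitmap moves into struct netns_ipv4 so each network namespace gets its own. A sketch of the per-netns test the call sites switch to (field name per this series; the bitmap may be NULL until the sysctl is first written):

    static inline bool sketch_is_local_reserved_port(struct net *net, int port)
    {
        if (!net->ipv4.sysctl_local_reserved_ports)
            return false;
        return test_bit(port, net->ipv4.sysctl_local_reserved_ports);
    }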
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 56cd458a1b8c..bd5f5928167d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -26,20 +26,7 @@
  * Theory of operations.
  * We keep one entry for each peer IP address. The nodes contains long-living
  * information about the peer which doesn't depend on routes.
- * At this moment this information consists only of ID field for the next
- * outgoing IP packet. This field is incremented with each packet as encoded
- * in inet_getid() function (include/net/inetpeer.h).
- * At the moment of writing this notes identifier of IP packets is generated
- * to be unpredictable using this code only for packets subjected
- * (actually or potentially) to defragmentation. I.e. DF packets less than
- * PMTU in size when local fragmentation is disabled use a constant ID and do
- * not use this code (see ip_select_ident() in include/net/ip.h).
  *
- * Route cache entries hold references to our nodes.
- * New cache entries get references via lookup by destination IP address in
- * the avl tree. The reference is grabbed only when it's needed i.e. only
- * when we try to output IP packet which needs an unpredictable ID (see
- * __ip_select_ident() in net/ipv4/route.c).
  * Nodes are removed only when reference counter goes to 0.
  * When it's happened the node may be removed when a sufficient amount of
  * time has been passed since its last use. The less-recently-used entry can
@@ -62,7 +49,6 @@
  *     refcnt: atomically against modifications on other CPU;
  *        usually under some other lock to prevent node disappearing
  *     daddr: unchangeable
- *     ip_id_count: atomic value (no lock needed)
  */
 
 static struct kmem_cache *peer_cachep __read_mostly;
@@ -120,7 +106,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
 static void inetpeer_gc_worker(struct work_struct *work)
 {
     struct inet_peer *p, *n, *c;
-    LIST_HEAD(list);
+    struct list_head list;
 
     spin_lock_bh(&gc_lock);
     list_replace_init(&gc_list, &list);
@@ -497,10 +483,6 @@ relookup:
        p->daddr = *daddr;
        atomic_set(&p->refcnt, 1);
        atomic_set(&p->rid, 0);
-       atomic_set(&p->ip_id_count,
-               (daddr->family == AF_INET) ?
-                   secure_ip_id(daddr->addr.a4) :
-                   secure_ipv6_id(daddr->addr.a6));
        p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
        p->rate_tokens = 0;
        /* 60*HZ is arbitrary, but chosen enough high so that the first
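These inetpeer deletions pair with the ip_select_ident() signature changes in igmp.c above and in ip_output.c, ipmr.c and ip_tunnel_core.c below: the per-destination ip_id_count kept in the peer cache is gone, and IP IDs are instead drawn from a hashed array of counters, with a GSO packet reserving gso_segs consecutive IDs in one step. A simplified model of that reservation (the real generator in this series lives in net/ipv4/route.c and also mixes in a random jump based on idle time):

    static atomic_t sketch_idents[2048];    /* bucket count is illustrative */

    /* reserve 'segs' consecutive IDs for one (GSO) packet; return the first */
    static u32 sketch_ident_reserve(u32 hash, int segs)
    {
        atomic_t *id = &sketch_idents[hash % 2048];

        return atomic_add_return(segs, id) - segs;
    }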
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 6f111e48e11c..3a83ce5efa80 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -42,7 +42,7 @@
 static bool ip_may_fragment(const struct sk_buff *skb)
 {
     return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
-       skb->local_df;
+       skb->ignore_df;
 }
 
 static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 94213c891565..9b842544aea3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -410,7 +410,7 @@ static int ipgre_open(struct net_device *dev)
        struct flowi4 fl4;
        struct rtable *rt;
 
-       rt = ip_route_output_gre(dev_net(dev), &fl4,
+       rt = ip_route_output_gre(t->net, &fl4,
                                 t->parms.iph.daddr,
                                 t->parms.iph.saddr,
                                 t->parms.o_key,
@@ -434,7 +434,7 @@ static int ipgre_close(struct net_device *dev)
 
     if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
        struct in_device *in_dev;
-       in_dev = inetdev_by_index(dev_net(dev), t->mlink);
+       in_dev = inetdev_by_index(t->net, t->mlink);
        if (in_dev)
            ip_mc_dec_group(in_dev, t->parms.iph.daddr);
     }
@@ -478,7 +478,7 @@ static void __gre_tunnel_init(struct net_device *dev)
     dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
     dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
 
-    dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES;
+    dev->features |= GRE_FEATURES;
     dev->hw_features |= GRE_FEATURES;
 
     if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
@@ -649,6 +649,7 @@ static void ipgre_tap_setup(struct net_device *dev)
 {
     ether_setup(dev);
     dev->netdev_ops = &gre_tap_netdev_ops;
+    dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
     ip_tunnel_setup(dev, gre_tap_net_id);
 }
 
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index f4ab72e19af9..5e7aecea05cd 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -364,7 +364,7 @@ int ip_options_compile(struct net *net,
            }
            if (optptr[2] <= optlen) {
                unsigned char *timeptr = NULL;
-               if (optptr[2]+3 > optptr[1]) {
+               if (optptr[2]+3 > optlen) {
                    pp_ptr = optptr + 2;
                    goto error;
                }
@@ -376,7 +376,7 @@ int ip_options_compile(struct net *net,
                    optptr[2] += 4;
                    break;
                case IPOPT_TS_TSANDADDR:
-                   if (optptr[2]+7 > optptr[1]) {
+                   if (optptr[2]+7 > optlen) {
                        pp_ptr = optptr + 2;
                        goto error;
                    }
@@ -390,7 +390,7 @@ int ip_options_compile(struct net *net,
                    optptr[2] += 8;
                    break;
                case IPOPT_TS_PRESPEC:
-                   if (optptr[2]+7 > optptr[1]) {
+                   if (optptr[2]+7 > optlen) {
                        pp_ptr = optptr + 2;
                        goto error;
                    }
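All three timestamp sub-cases now bound the record pointer against optlen, the amount of option space actually validated to be present, instead of optptr[1], the length byte read straight from the packet and therefore attacker-controlled. A worked illustration with hypothetical values:

    /* crafted option: the length byte claims 44, but only 8 bytes exist */
    unsigned char optptr[8] = { IPOPT_TIMESTAMP, 44, 5, 1 /* TSANDADDR */ };
    int optlen = 8;         /* space actually present */

    /* old check: optptr[2] + 7 > optptr[1] is 12 > 44, false -> the code
     * would read and write past the 8 real bytes.
     * new check: optptr[2] + 7 > optlen is 12 > 8, true -> rejected.
     */
    if (optptr[2] + 7 > optlen)
        /* goto error */;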
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a52f50187b54..8d3b6b0e9857 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
     iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
     iph->saddr = saddr;
     iph->protocol = sk->sk_protocol;
-    ip_select_ident(skb, &rt->dst, sk);
+    ip_select_ident(skb, sk);
 
     if (opt && opt->opt.optlen) {
        iph->ihl += opt->opt.optlen>>2;
@@ -415,7 +415,7 @@ packet_routed:
     skb_reset_network_header(skb);
     iph = ip_hdr(skb);
     *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
-    if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
+    if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
        iph->frag_off = htons(IP_DF);
     else
        iph->frag_off = 0;
@@ -430,8 +430,7 @@ packet_routed:
        ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
     }
 
-    ip_select_ident_more(skb, &rt->dst, sk,
-                         (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+    ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
 
     /* TODO : should we use skb->sk here instead of sk ? */
     skb->priority = sk->sk_priority;
@@ -501,7 +500,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
     iph = ip_hdr(skb);
 
     mtu = ip_skb_dst_mtu(skb);
-    if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
+    if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
                  (IPCB(skb)->frag_max_size &&
                   IPCB(skb)->frag_max_size > mtu))) {
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -866,7 +865,7 @@ static int __ip_append_data(struct sock *sk,
 
     fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
     maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-    maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
+    maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
 
     if (cork->length + length > maxnonfragsize - fragheaderlen) {
        ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1189,7 +1188,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 
     fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
     maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-    maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
+    maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
 
     if (cork->length + size > maxnonfragsize - fragheaderlen) {
        ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1350,10 +1349,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
      * to fragment the frame generated here. No matter, what transforms
      * how transforms change size of the packet, it will come out.
      */
-    skb->local_df = ip_sk_local_df(sk);
+    skb->ignore_df = ip_sk_ignore_df(sk);
 
     /* DF bit is set when we want to see DF on outgoing frames.
-     * If local_df is set too, we still allow to fragment this frame
+     * If ignore_df is set too, we still allow to fragment this frame
      * locally. */
     if (inet->pmtudisc == IP_PMTUDISC_DO ||
         inet->pmtudisc == IP_PMTUDISC_PROBE ||
@@ -1379,7 +1378,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
     iph->ttl = ttl;
     iph->protocol = sk->sk_protocol;
     ip_copy_addrs(iph, fl4);
-    ip_select_ident(skb, &rt->dst, sk);
+    ip_select_ident(skb, sk);
 
     if (opt) {
        iph->ihl += opt->optlen>>2;
@@ -1546,7 +1545,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
        daddr = replyopts.opt.opt.faddr;
     }
 
-    flowi4_init_output(&fl4, arg->bound_dev_if, 0,
+    flowi4_init_output(&fl4, arg->bound_dev_if,
+                       IP4_REPLY_MARK(net, skb->mark),
                        RT_TOS(arg->tos),
                        RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
                        ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 2acc2337d38b..097b3e7c1e8f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -268,6 +268,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
     __be32 remote = parms->iph.daddr;
     __be32 local = parms->iph.saddr;
     __be32 key = parms->i_key;
+    __be16 flags = parms->i_flags;
     int link = parms->link;
     struct ip_tunnel *t = NULL;
     struct hlist_head *head = ip_bucket(itn, parms);
@@ -275,9 +276,9 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
     hlist_for_each_entry_rcu(t, head, hash_node) {
        if (local == t->parms.iph.saddr &&
            remote == t->parms.iph.daddr &&
-           key == t->parms.i_key &&
            link == t->parms.link &&
-           type == t->dev->type)
+           type == t->dev->type &&
+           ip_tunnel_key_match(&t->parms, flags, key))
            break;
     }
     return t;
@@ -395,11 +396,10 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                           struct ip_tunnel_net *itn,
                                           struct ip_tunnel_parm *parms)
 {
-    struct ip_tunnel *nt, *fbt;
+    struct ip_tunnel *nt;
     struct net_device *dev;
 
     BUG_ON(!itn->fb_tunnel_dev);
-    fbt = netdev_priv(itn->fb_tunnel_dev);
     dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
     if (IS_ERR(dev))
        return ERR_CAST(dev);
@@ -668,6 +668,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
        dev->needed_headroom = max_headroom;
 
     if (skb_cow_head(skb, dev->needed_headroom)) {
+       ip_rt_put(rt);
        dev->stats.tx_dropped++;
        kfree_skb(skb);
        return;
@@ -747,19 +748,19 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
            goto done;
        if (p->iph.ttl)
            p->iph.frag_off |= htons(IP_DF);
-       if (!(p->i_flags&TUNNEL_KEY))
-           p->i_key = 0;
-       if (!(p->o_flags&TUNNEL_KEY))
-           p->o_key = 0;
+       if (!(p->i_flags & VTI_ISVTI)) {
+           if (!(p->i_flags & TUNNEL_KEY))
+               p->i_key = 0;
+           if (!(p->o_flags & TUNNEL_KEY))
+               p->o_key = 0;
+       }
 
        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 
        if (!t && (cmd == SIOCADDTUNNEL)) {
            t = ip_tunnel_create(net, itn, p);
-           if (IS_ERR(t)) {
-               err = PTR_ERR(t);
-               break;
-           }
+           err = PTR_ERR_OR_ZERO(t);
+           break;
        }
        if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
            if (t != NULL) {
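Two details worth noting above: the ip_rt_put() added on the skb_cow_head() failure path releases the route that would otherwise leak on every dropped packet, and the ioctl path's IS_ERR()/PTR_ERR() dance collapses into PTR_ERR_OR_ZERO(), which is essentially (sketch of the include/linux/err.h helper):

    /* 0 for a valid pointer, the encoded errno for an ERR_PTR() value */
    static inline int sketch_ptr_err_or_zero(const void *ptr)
    {
        if (IS_ERR(ptr))
            return PTR_ERR(ptr);
        return 0;
    }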
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index bcf206c79005..f4c987bb7e94 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -74,7 +74,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
     iph->daddr = dst;
     iph->saddr = src;
     iph->ttl = ttl;
-    __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+    __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
 
     err = ip_local_out_sk(sk, skb);
     if (unlikely(net_xmit_eval(err)))
@@ -135,6 +135,14 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
        return skb;
     }
 
+    /* If packet is not gso and we are resolving any partial checksum,
+     * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
+     * on the outer header without confusing devices that implement
+     * NETIF_F_IP_CSUM with encapsulation.
+     */
+    if (csum_help)
+       skb->encapsulation = 0;
+
     if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
        err = skb_checksum_help(skb);
        if (unlikely(err))
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 13ef00f1e17b..b8960f3527f3 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -313,7 +313,13 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
            return -EINVAL;
     }
 
-    p.i_flags |= VTI_ISVTI;
+    if (!(p.i_flags & GRE_KEY))
+       p.i_key = 0;
+    if (!(p.o_flags & GRE_KEY))
+       p.o_key = 0;
+
+    p.i_flags = VTI_ISVTI;
+
     err = ip_tunnel_ioctl(dev, &p, cmd);
     if (err)
        return err;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 812b18351462..62eaa005e146 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -149,13 +149,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 
     if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
        ipv4_update_pmtu(skb, dev_net(skb->dev), info,
-                        t->dev->ifindex, 0, IPPROTO_IPIP, 0);
+                        t->parms.link, 0, IPPROTO_IPIP, 0);
        err = 0;
        goto out;
     }
 
     if (type == ICMP_REDIRECT) {
-       ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
+       ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
                      IPPROTO_IPIP, 0);
        err = 0;
        goto out;
@@ -486,4 +486,5 @@ static void __exit ipip_fini(void)
 module_init(ipip_init);
 module_exit(ipip_fini);
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ipip");
 MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d84dc8d4c916..65bcaa789043 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -484,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev)
     dev->type = ARPHRD_PIMREG;
     dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
     dev->flags = IFF_NOARP;
-    dev->netdev_ops = &reg_vif_netdev_ops,
+    dev->netdev_ops = &reg_vif_netdev_ops;
     dev->destructor = free_netdev;
     dev->features |= NETIF_F_NETNS_LOCAL;
 }
@@ -1663,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
     iph->protocol = IPPROTO_IPIP;
     iph->ihl = 5;
     iph->tot_len = htons(skb->len);
-    ip_select_ident(skb, skb_dst(skb), NULL);
+    ip_select_ident(skb, NULL);
     ip_send_check(iph);
 
     memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ee2886126e3d..f1787c04a4dd 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -91,17 +91,9 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
     if (nf_ct_is_untracked(ct))
        return NF_ACCEPT;
 
-    nat = nfct_nat(ct);
-    if (!nat) {
-       /* NAT module was loaded late. */
-       if (nf_ct_is_confirmed(ct))
-           return NF_ACCEPT;
-       nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
-       if (nat == NULL) {
-           pr_debug("failed to add NAT extension\n");
-           return NF_ACCEPT;
-       }
-    }
+    nat = nf_ct_nat_ext_add(ct);
+    if (nat == NULL)
+       return NF_ACCEPT;
 
     switch (ctinfo) {
     case IP_CT_RELATED:
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index f40f321b41fc..b8f6381c7d0b 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -34,7 +34,7 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
 
     if (!err) {
        ip_send_check(ip_hdr(skb));
-       skb->local_df = 1;
+       skb->ignore_df = 1;
     }
 
     return err;
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index b5b256d45e67..3964157d826c 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -48,15 +48,9 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
 
     NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
 
-    nat = nfct_nat(ct);
-    if (nat == NULL) {
-       /* Conntrack module was loaded late, can't add extension. */
-       if (nf_ct_is_confirmed(ct))
-           return NF_ACCEPT;
-       nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
-       if (nat == NULL)
-           return NF_ACCEPT;
-    }
+    nat = nf_ct_nat_ext_add(ct);
+    if (nat == NULL)
+       return NF_ACCEPT;
 
     switch (ctinfo) {
     case IP_CT_RELATED:
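Both NAT hooks (iptable_nat.c above and this nftables chain) delete the same open-coded "NAT extension loaded late" dance in favour of nf_ct_nat_ext_add(). The helper factors out exactly the removed logic (sketch mirroring the deleted lines; the real function is added elsewhere in this series):

    static struct nf_conn_nat *sketch_nat_ext_add(struct nf_conn *ct)
    {
        struct nf_conn_nat *nat = nfct_nat(ct);

        if (nat)
            return nat;
        if (nf_ct_is_confirmed(ct))    /* too late to add an extension */
            return NULL;
        return nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
    }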
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ad737fad6d8b..ae0af9386f7c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -345,15 +345,15 @@ static void icmp_put(struct seq_file *seq)
     for (i = 0; icmpmibmap[i].name != NULL; i++)
        seq_printf(seq, " Out%s", icmpmibmap[i].name);
     seq_printf(seq, "\nIcmp: %lu %lu %lu",
-               snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
-               snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS),
-               snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+               snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+               snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+               snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
     for (i = 0; icmpmibmap[i].name != NULL; i++)
        seq_printf(seq, " %lu",
                   atomic_long_read(ptr + icmpmibmap[i].index));
     seq_printf(seq, " %lu %lu",
-               snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
-               snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+               snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+               snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
     for (i = 0; icmpmibmap[i].name != NULL; i++)
        seq_printf(seq, " %lu",
                   atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
@@ -379,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
     BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
     for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
        seq_printf(seq, " %llu",
-                  snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+                  snmp_fold_field64(net->mib.ip_statistics,
                                     snmp4_ipstats_list[i].entry,
                                     offsetof(struct ipstats_mib, syncp)));
 
@@ -395,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
        /* MaxConn field is signed, RFC 2012 */
        if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
            seq_printf(seq, " %ld",
-                      snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+                      snmp_fold_field(net->mib.tcp_statistics,
                                       snmp4_tcp_list[i].entry));
        else
            seq_printf(seq, " %lu",
-                      snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+                      snmp_fold_field(net->mib.tcp_statistics,
                                       snmp4_tcp_list[i].entry));
     }
 
@@ -410,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
     seq_puts(seq, "\nUdp:");
     for (i = 0; snmp4_udp_list[i].name != NULL; i++)
        seq_printf(seq, " %lu",
-                  snmp_fold_field((void __percpu **)net->mib.udp_statistics,
+                  snmp_fold_field(net->mib.udp_statistics,
                                   snmp4_udp_list[i].entry));
 
     /* the UDP and UDP-Lite MIBs are the same */
@@ -421,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
     seq_puts(seq, "\nUdpLite:");
     for (i = 0; snmp4_udp_list[i].name != NULL; i++)
        seq_printf(seq, " %lu",
-                  snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
+                  snmp_fold_field(net->mib.udplite_statistics,
                                   snmp4_udp_list[i].entry));
 
     seq_putc(seq, '\n');
@@ -458,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
     seq_puts(seq, "\nTcpExt:");
     for (i = 0; snmp4_net_list[i].name != NULL; i++)
460 seq_printf(seq, " %lu", 460 seq_printf(seq, " %lu",
461 snmp_fold_field((void __percpu **)net->mib.net_statistics, 461 snmp_fold_field(net->mib.net_statistics,
462 snmp4_net_list[i].entry)); 462 snmp4_net_list[i].entry));
463 463
464 seq_puts(seq, "\nIpExt:"); 464 seq_puts(seq, "\nIpExt:");
@@ -468,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
468 seq_puts(seq, "\nIpExt:"); 468 seq_puts(seq, "\nIpExt:");
469 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) 469 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
470 seq_printf(seq, " %llu", 470 seq_printf(seq, " %llu",
471 snmp_fold_field64((void __percpu **)net->mib.ip_statistics, 471 snmp_fold_field64(net->mib.ip_statistics,
472 snmp4_ipextstats_list[i].entry, 472 snmp4_ipextstats_list[i].entry,
473 offsetof(struct ipstats_mib, syncp))); 473 offsetof(struct ipstats_mib, syncp)));
474 474
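
The dropped (void __percpu **) casts reflect the reworked SNMP API: the per-net MIBs are now plain percpu pointers and snmp_fold_field() takes a void __percpu * directly. A sketch of the folding loop under that assumption:

unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
	unsigned long res = 0;
	int i;

	/* sum one counter slot across all possible CPUs */
	for_each_possible_cpu(i)
		res += *(((unsigned long *)per_cpu_ptr(mib, i)) + offt);
	return res;
}
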
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a9dbe58bdfe7..2c65160565e1 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
389 iph->check = 0; 389 iph->check = 0;
390 iph->tot_len = htons(length); 390 iph->tot_len = htons(length);
391 if (!iph->id) 391 if (!iph->id)
392 ip_select_ident(skb, &rt->dst, NULL); 392 ip_select_ident(skb, NULL);
393 393
394 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 394 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
395 } 395 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5e676be3daeb..082239ffe34a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -89,6 +89,7 @@
89#include <linux/rcupdate.h> 89#include <linux/rcupdate.h>
90#include <linux/times.h> 90#include <linux/times.h>
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/jhash.h>
92#include <net/dst.h> 93#include <net/dst.h>
93#include <net/net_namespace.h> 94#include <net/net_namespace.h>
94#include <net/protocol.h> 95#include <net/protocol.h>
@@ -456,39 +457,19 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
456 return neigh_create(&arp_tbl, pkey, dev); 457 return neigh_create(&arp_tbl, pkey, dev);
457} 458}
458 459
459/* 460atomic_t *ip_idents __read_mostly;
460 * Peer allocation may fail only in serious out-of-memory conditions. However 461EXPORT_SYMBOL(ip_idents);
461 * we still can generate some output.
462 * Random ID selection looks a bit dangerous because we have no chances to
463 * select ID being unique in a reasonable period of time.
464 * But broken packet identifier may be better than no packet at all.
465 */
466static void ip_select_fb_ident(struct iphdr *iph)
467{
468 static DEFINE_SPINLOCK(ip_fb_id_lock);
469 static u32 ip_fallback_id;
470 u32 salt;
471
472 spin_lock_bh(&ip_fb_id_lock);
473 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
474 iph->id = htons(salt & 0xFFFF);
475 ip_fallback_id = salt;
476 spin_unlock_bh(&ip_fb_id_lock);
477}
478 462
479void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 463void __ip_select_ident(struct iphdr *iph, int segs)
480{ 464{
481 struct net *net = dev_net(dst->dev); 465 static u32 ip_idents_hashrnd __read_mostly;
482 struct inet_peer *peer; 466 u32 hash, id;
483 467
484 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); 468 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
485 if (peer) {
486 iph->id = htons(inet_getid(peer, more));
487 inet_putpeer(peer);
488 return;
489 }
490 469
491 ip_select_fb_ident(iph); 470 hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
471 id = ip_idents_reserve(hash, segs);
472 iph->id = htons(id);
492} 473}
493EXPORT_SYMBOL(__ip_select_ident); 474EXPORT_SYMBOL(__ip_select_ident);
494 475
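
__ip_select_ident() now hashes the destination into a shared array of ID generators and reserves one identifier per GSO segment in a single step. ip_idents_reserve() lives outside this hunk; a minimal sketch, assuming an IP_IDENTS_SZ-bucket array and a plain atomic reservation:

#define IP_IDENTS_SZ 2048u	/* assumed bucket count */

static inline u32 ip_idents_reserve(u32 hash, int segs)
{
	atomic_t *id_ptr = ip_idents + hash % IP_IDENTS_SZ;

	/* reserve 'segs' consecutive IDs and return the first one */
	return atomic_add_return(segs, id_ptr) - segs;
}
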
@@ -993,6 +974,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
993 struct flowi4 fl4; 974 struct flowi4 fl4;
994 struct rtable *rt; 975 struct rtable *rt;
995 976
977 if (!mark)
978 mark = IP4_REPLY_MARK(net, skb->mark);
979
996 __build_flow_key(&fl4, NULL, iph, oif, 980 __build_flow_key(&fl4, NULL, iph, oif,
997 RT_TOS(iph->tos), protocol, mark, flow_flags); 981 RT_TOS(iph->tos), protocol, mark, flow_flags);
998 rt = __ip_route_output_key(net, &fl4); 982 rt = __ip_route_output_key(net, &fl4);
@@ -1010,6 +994,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1010 struct rtable *rt; 994 struct rtable *rt;
1011 995
1012 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 996 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
997
998 if (!fl4.flowi4_mark)
999 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1000
1013 rt = __ip_route_output_key(sock_net(sk), &fl4); 1001 rt = __ip_route_output_key(sock_net(sk), &fl4);
1014 if (!IS_ERR(rt)) { 1002 if (!IS_ERR(rt)) {
1015 __ip_rt_update_pmtu(rt, &fl4, mtu); 1003 __ip_rt_update_pmtu(rt, &fl4, mtu);
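
Both PMTU paths fall back to a reflected mark when the flow carries none. IP4_REPLY_MARK() is presumably gated on the fwmark_reflect knob added to the sysctl table below:

/* assumed definition: reflect the mark only when the netns opts in */
#define IP4_REPLY_MARK(net, mark) \
	((net)->ipv4.sysctl_fwmark_reflect ? (mark) : 0)
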
@@ -2704,6 +2692,12 @@ int __init ip_rt_init(void)
2704{ 2692{
2705 int rc = 0; 2693 int rc = 0;
2706 2694
2695 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2696 if (!ip_idents)
2697 panic("IP: failed to allocate ip_idents\n");
2698
2699 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2700
2707#ifdef CONFIG_IP_ROUTE_CLASSID 2701#ifdef CONFIG_IP_ROUTE_CLASSID
2708 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 2702 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2709 if (!ip_rt_acct) 2703 if (!ip_rt_acct)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index f2ed13c2125f..c86624b36a62 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -303,6 +303,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
303 ireq->ir_rmt_port = th->source; 303 ireq->ir_rmt_port = th->source;
304 ireq->ir_loc_addr = ip_hdr(skb)->daddr; 304 ireq->ir_loc_addr = ip_hdr(skb)->daddr;
305 ireq->ir_rmt_addr = ip_hdr(skb)->saddr; 305 ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
306 ireq->ir_mark = inet_request_mark(sk, skb);
306 ireq->ecn_ok = ecn_ok; 307 ireq->ecn_ok = ecn_ok;
307 ireq->snd_wscale = tcp_opt.snd_wscale; 308 ireq->snd_wscale = tcp_opt.snd_wscale;
308 ireq->sack_ok = tcp_opt.sack_ok; 309 ireq->sack_ok = tcp_opt.sack_ok;
@@ -339,7 +340,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
339 * hasn't changed since we received the original syn, but I see 340 * hasn't changed since we received the original syn, but I see
340 * no easy way to do this. 341 * no easy way to do this.
341 */ 342 */
342 flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, 343 flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
343 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, 344 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
344 inet_sk_flowi_flags(sk), 345 inet_sk_flowi_flags(sk),
345 (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, 346 (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
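
inet_request_mark() picks the mark a passively-opened connection inherits. A sketch, assuming it prefers the listener's own mark and falls back to the SYN's mark only when tcp_fwmark_accept is enabled:

static inline u32 inet_request_mark(struct sock *sk, struct sk_buff *skb)
{
	if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)
		return skb->mark;	/* inherit the SYN's fwmark */

	return sk->sk_mark;
}
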
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5cde8f263d40..79a007c52558 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = {
437 .proc_handler = proc_dointvec 437 .proc_handler = proc_dointvec
438 }, 438 },
439 { 439 {
440 .procname = "ip_local_reserved_ports",
441 .data = NULL, /* initialized in sysctl_ipv4_init */
442 .maxlen = 65536,
443 .mode = 0644,
444 .proc_handler = proc_do_large_bitmap,
445 },
446 {
447 .procname = "igmp_max_memberships", 440 .procname = "igmp_max_memberships",
448 .data = &sysctl_igmp_max_memberships, 441 .data = &sysctl_igmp_max_memberships,
449 .maxlen = sizeof(int), 442 .maxlen = sizeof(int),
@@ -825,6 +818,13 @@ static struct ctl_table ipv4_net_table[] = {
825 .proc_handler = ipv4_local_port_range, 818 .proc_handler = ipv4_local_port_range,
826 }, 819 },
827 { 820 {
821 .procname = "ip_local_reserved_ports",
822 .data = &init_net.ipv4.sysctl_local_reserved_ports,
823 .maxlen = 65536,
824 .mode = 0644,
825 .proc_handler = proc_do_large_bitmap,
826 },
827 {
828 .procname = "ip_no_pmtu_disc", 828 .procname = "ip_no_pmtu_disc",
829 .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, 829 .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
830 .maxlen = sizeof(int), 830 .maxlen = sizeof(int),
@@ -838,6 +838,20 @@ static struct ctl_table ipv4_net_table[] = {
838 .mode = 0644, 838 .mode = 0644,
839 .proc_handler = proc_dointvec, 839 .proc_handler = proc_dointvec,
840 }, 840 },
841 {
842 .procname = "fwmark_reflect",
843 .data = &init_net.ipv4.sysctl_fwmark_reflect,
844 .maxlen = sizeof(int),
845 .mode = 0644,
846 .proc_handler = proc_dointvec,
847 },
848 {
849 .procname = "tcp_fwmark_accept",
850 .data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
851 .maxlen = sizeof(int),
852 .mode = 0644,
853 .proc_handler = proc_dointvec,
854 },
841 { } 855 { }
842}; 856};
843 857
@@ -862,8 +876,14 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
862 if (net->ipv4.ipv4_hdr == NULL) 876 if (net->ipv4.ipv4_hdr == NULL)
863 goto err_reg; 877 goto err_reg;
864 878
879 net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
880 if (!net->ipv4.sysctl_local_reserved_ports)
881 goto err_ports;
882
865 return 0; 883 return 0;
866 884
885err_ports:
886 unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
867err_reg: 887err_reg:
868 if (!net_eq(net, &init_net)) 888 if (!net_eq(net, &init_net))
869 kfree(table); 889 kfree(table);
@@ -875,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)
875{ 895{
876 struct ctl_table *table; 896 struct ctl_table *table;
877 897
898 kfree(net->ipv4.sysctl_local_reserved_ports);
878 table = net->ipv4.ipv4_hdr->ctl_table_arg; 899 table = net->ipv4.ipv4_hdr->ctl_table_arg;
879 unregister_net_sysctl_table(net->ipv4.ipv4_hdr); 900 unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
880 kfree(table); 901 kfree(table);
@@ -888,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
888static __init int sysctl_ipv4_init(void) 909static __init int sysctl_ipv4_init(void)
889{ 910{
890 struct ctl_table_header *hdr; 911 struct ctl_table_header *hdr;
891 struct ctl_table *i;
892
893 for (i = ipv4_table; i->procname; i++) {
894 if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
895 i->data = sysctl_local_reserved_ports;
896 break;
897 }
898 }
899 if (!i->procname)
900 return -EINVAL;
901 912
902 hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); 913 hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
903 if (hdr == NULL) 914 if (hdr == NULL)
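
With ip_local_reserved_ports now per network namespace, each netns carries its own 64 KiB bitmap. proc_do_large_bitmap accepts comma-separated values and ranges, so a hypothetical userspace writer looks like:

#include <stdio.h>

/* keep the ephemeral allocator away from 8080 and 9000-9100
 * in the current namespace (hypothetical example)
 */
int reserve_local_ports(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_reserved_ports", "w");

	if (!f)
		return -1;
	fputs("8080,9000-9100", f);
	return fclose(f);
}
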
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4bd6d52eeffb..eb1dde37e678 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2916,6 +2916,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2916 case TCP_USER_TIMEOUT: 2916 case TCP_USER_TIMEOUT:
2917 val = jiffies_to_msecs(icsk->icsk_user_timeout); 2917 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2918 break; 2918 break;
2919
2920 case TCP_FASTOPEN:
2921 if (icsk->icsk_accept_queue.fastopenq != NULL)
2922 val = icsk->icsk_accept_queue.fastopenq->max_qlen;
2923 else
2924 val = 0;
2925 break;
2926
2919 case TCP_TIMESTAMP: 2927 case TCP_TIMESTAMP:
2920 val = tcp_time_stamp + tp->tsoffset; 2928 val = tcp_time_stamp + tp->tsoffset;
2921 break; 2929 break;
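
The new branch makes the TFO server queue length readable, pairing with the existing setsockopt(TCP_FASTOPEN). A hypothetical caller:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* returns the listener's max pending-TFO queue length, 0 if unset */
static int tfo_max_qlen(int fd)
{
	int qlen = 0;
	socklen_t len = sizeof(qlen);

	if (getsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, &len) < 0)
		return -1;
	return qlen;
}
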
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 821846fb0a7e..d5de69bc04f5 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -140,13 +140,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
140 ca->cnt = 1; 140 ca->cnt = 1;
141} 141}
142 142
143static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, 143static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
144 u32 in_flight)
145{ 144{
146 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
147 struct bictcp *ca = inet_csk_ca(sk); 146 struct bictcp *ca = inet_csk_ca(sk);
148 147
149 if (!tcp_is_cwnd_limited(sk, in_flight)) 148 if (!tcp_is_cwnd_limited(sk))
150 return; 149 return;
151 150
152 if (tp->snd_cwnd <= tp->snd_ssthresh) 151 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2b9464c93b88..7b09d8b49fa5 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
276 return err; 276 return err;
277} 277}
278 278
279/* RFC2861 Check whether we are limited by application or congestion window
280 * This is the inverse of cwnd check in tcp_tso_should_defer
281 */
282bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
283{
284 const struct tcp_sock *tp = tcp_sk(sk);
285 u32 left;
286
287 if (in_flight >= tp->snd_cwnd)
288 return true;
289
290 left = tp->snd_cwnd - in_flight;
291 if (sk_can_gso(sk) &&
292 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
293 left < tp->xmit_size_goal_segs)
294 return true;
295 return left <= tcp_max_tso_deferred_mss(tp);
296}
297EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
298
299/* Slow start is used when congestion window is no greater than the slow start 279/* Slow start is used when congestion window is no greater than the slow start
300 * threshold. We base on RFC2581 and also handle stretch ACKs properly. 280 * threshold. We base on RFC2581 and also handle stretch ACKs properly.
301 * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but 281 * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
@@ -337,11 +317,11 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
337/* This is Jacobson's slow start and congestion avoidance. 317/* This is Jacobson's slow start and congestion avoidance.
338 * SIGCOMM '88, p. 328. 318 * SIGCOMM '88, p. 328.
339 */ 319 */
340void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 320void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
341{ 321{
342 struct tcp_sock *tp = tcp_sk(sk); 322 struct tcp_sock *tp = tcp_sk(sk);
343 323
344 if (!tcp_is_cwnd_limited(sk, in_flight)) 324 if (!tcp_is_cwnd_limited(sk))
345 return; 325 return;
346 326
347 /* In "safe" area, increase. */ 327 /* In "safe" area, increase. */
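
With the RFC2861 window arithmetic gone, congestion modules call a parameterless tcp_is_cwnd_limited() that trusts state recorded at transmit time. A sketch of the replacement predicate (field names assumed from this series):

static inline bool tcp_is_cwnd_limited(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* in slow start, keep growing until cwnd covers twice
	 * what the last window actually had in flight
	 */
	if (tp->snd_cwnd <= tp->snd_ssthresh)
		return tp->snd_cwnd < 2 * tp->max_packets_out;

	/* otherwise trust the flag set by the output path */
	return tp->is_cwnd_limited;
}
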
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index b4f1b29b08bd..a9bd8a4828a9 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -304,13 +304,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
304 ca->cnt = 1; 304 ca->cnt = 1;
305} 305}
306 306
307static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, 307static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
308 u32 in_flight)
309{ 308{
310 struct tcp_sock *tp = tcp_sk(sk); 309 struct tcp_sock *tp = tcp_sk(sk);
311 struct bictcp *ca = inet_csk_ca(sk); 310 struct bictcp *ca = inet_csk_ca(sk);
312 311
313 if (!tcp_is_cwnd_limited(sk, in_flight)) 312 if (!tcp_is_cwnd_limited(sk))
314 return; 313 return;
315 314
316 if (tp->snd_cwnd <= tp->snd_ssthresh) { 315 if (tp->snd_cwnd <= tp->snd_ssthresh) {
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f195d9316e55..62e48cf84e60 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -72,25 +72,224 @@ error: kfree(ctx);
72 return err; 72 return err;
73} 73}
74 74
75/* Computes the fastopen cookie for the IP path. 75static bool __tcp_fastopen_cookie_gen(const void *path,
76 * The path is a 128 bits long (pad with zeros for IPv4). 76 struct tcp_fastopen_cookie *foc)
77 *
78 * The caller must check foc->len to determine if a valid cookie
79 * has been generated successfully.
80*/
81void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
82 struct tcp_fastopen_cookie *foc)
83{ 77{
84 __be32 path[4] = { src, dst, 0, 0 };
85 struct tcp_fastopen_context *ctx; 78 struct tcp_fastopen_context *ctx;
79 bool ok = false;
86 80
87 tcp_fastopen_init_key_once(true); 81 tcp_fastopen_init_key_once(true);
88 82
89 rcu_read_lock(); 83 rcu_read_lock();
90 ctx = rcu_dereference(tcp_fastopen_ctx); 84 ctx = rcu_dereference(tcp_fastopen_ctx);
91 if (ctx) { 85 if (ctx) {
92 crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); 86 crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
93 foc->len = TCP_FASTOPEN_COOKIE_SIZE; 87 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
88 ok = true;
94 } 89 }
95 rcu_read_unlock(); 90 rcu_read_unlock();
91 return ok;
92}
93
94/* Generate the fastopen cookie by doing aes128 encryption on both
95 * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
96 * addresses. For the longer IPv6 addresses use CBC-MAC.
97 *
98 * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
99 */
100static bool tcp_fastopen_cookie_gen(struct request_sock *req,
101 struct sk_buff *syn,
102 struct tcp_fastopen_cookie *foc)
103{
104 if (req->rsk_ops->family == AF_INET) {
105 const struct iphdr *iph = ip_hdr(syn);
106
107 __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
108 return __tcp_fastopen_cookie_gen(path, foc);
109 }
110
111#if IS_ENABLED(CONFIG_IPV6)
112 if (req->rsk_ops->family == AF_INET6) {
113 const struct ipv6hdr *ip6h = ipv6_hdr(syn);
114 struct tcp_fastopen_cookie tmp;
115
116 if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
117 struct in6_addr *buf = (struct in6_addr *) tmp.val;
118 int i;
119
120 for (i = 0; i < 4; i++)
121 buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
122 return __tcp_fastopen_cookie_gen(buf, foc);
123 }
124 }
125#endif
126 return false;
127}
128
129static bool tcp_fastopen_create_child(struct sock *sk,
130 struct sk_buff *skb,
131 struct dst_entry *dst,
132 struct request_sock *req)
133{
134 struct tcp_sock *tp = tcp_sk(sk);
135 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
136 struct sock *child;
137
138 req->num_retrans = 0;
139 req->num_timeout = 0;
140 req->sk = NULL;
141
142 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
143 if (child == NULL)
144 return false;
145
146 spin_lock(&queue->fastopenq->lock);
147 queue->fastopenq->qlen++;
148 spin_unlock(&queue->fastopenq->lock);
149
150 /* Initialize the child socket. Have to fix some values to take
151 * into account that the child is a Fast Open socket and is created
152 * only out of the bits carried in the SYN packet.
153 */
154 tp = tcp_sk(child);
155
156 tp->fastopen_rsk = req;
157 /* Do a hold on the listener sk so that if the listener is being
158 * closed, the child that has been accepted can live on and still
159 * access listen_lock.
160 */
161 sock_hold(sk);
162 tcp_rsk(req)->listener = sk;
163
164 /* RFC1323: The window in SYN & SYN/ACK segments is never
165 * scaled. So correct it appropriately.
166 */
167 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
168
169 /* Activate the retrans timer so that SYNACK can be retransmitted.
170 * The request socket is not added to the SYN table of the parent
171 * because it's been added to the accept queue directly.
172 */
173 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
174 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
175
176 /* Add the child socket directly into the accept queue */
177 inet_csk_reqsk_queue_add(sk, req, child);
178
179 /* Now finish processing the fastopen child socket. */
180 inet_csk(child)->icsk_af_ops->rebuild_header(child);
181 tcp_init_congestion_control(child);
182 tcp_mtup_init(child);
183 tcp_init_metrics(child);
184 tcp_init_buffer_space(child);
185
186 /* Queue the data carried in the SYN packet. We need to first
187 * bump skb's refcnt because the caller will attempt to free it.
188 *
189 * XXX (TFO) - we honor a zero-payload TFO request for now
190 * (any reason not to?), but there is no need to queue the skb
191 * since there is no data. How about SYN+FIN?
192 */
193 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) {
194 skb = skb_get(skb);
195 skb_dst_drop(skb);
196 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
197 skb_set_owner_r(skb, child);
198 __skb_queue_tail(&child->sk_receive_queue, skb);
199 tp->syn_data_acked = 1;
200 }
201 tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
202 sk->sk_data_ready(sk);
203 bh_unlock_sock(child);
204 sock_put(child);
205 WARN_ON(req->sk == NULL);
206 return true;
207}
208EXPORT_SYMBOL(tcp_fastopen_create_child);
209
210static bool tcp_fastopen_queue_check(struct sock *sk)
211{
212 struct fastopen_queue *fastopenq;
213
214 /* Make sure the listener has enabled fastopen, and we don't
215 * exceed the max # of pending TFO requests allowed before trying
216 * to validate the cookie, in order to avoid burning CPU cycles
217 * unnecessarily.
218 *
219 * XXX (TFO) - The implication of checking the max_qlen before
220 * processing a cookie request is that clients can't differentiate
221 * between qlen overflow causing Fast Open to be disabled
222 * temporarily vs a server not supporting Fast Open at all.
223 */
224 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
225 if (fastopenq == NULL || fastopenq->max_qlen == 0)
226 return false;
227
228 if (fastopenq->qlen >= fastopenq->max_qlen) {
229 struct request_sock *req1;
230 spin_lock(&fastopenq->lock);
231 req1 = fastopenq->rskq_rst_head;
232 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
233 spin_unlock(&fastopenq->lock);
234 NET_INC_STATS_BH(sock_net(sk),
235 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
236 return false;
237 }
238 fastopenq->rskq_rst_head = req1->dl_next;
239 fastopenq->qlen--;
240 spin_unlock(&fastopenq->lock);
241 reqsk_free(req1);
242 }
243 return true;
244}
245
246/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
247 * may be updated and returned to the client in the SYN-ACK later, e.g. for a
248 * Fast Open cookie request (foc->len == 0).
249 */
250bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
251 struct request_sock *req,
252 struct tcp_fastopen_cookie *foc,
253 struct dst_entry *dst)
254{
255 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
256 bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
257
258 if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
259 (syn_data || foc->len >= 0) &&
260 tcp_fastopen_queue_check(sk))) {
261 foc->len = -1;
262 return false;
263 }
264
265 if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
266 goto fastopen;
267
268 if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
269 foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
270 foc->len == valid_foc.len &&
271 !memcmp(foc->val, valid_foc.val, foc->len)) {
272 /* Cookie is valid. Create a (full) child socket to accept
273 * the data in SYN before returning a SYN-ACK to ack the
274 * data. If we fail to create the socket, fall back and
275 * ack the ISN only but include the same cookie.
276 *
277 * Note: Data-less SYN with valid cookie is allowed to send
278 * data in SYN_RECV state.
279 */
280fastopen:
281 if (tcp_fastopen_create_child(sk, skb, dst, req)) {
282 foc->len = -1;
283 NET_INC_STATS_BH(sock_net(sk),
284 LINUX_MIB_TCPFASTOPENPASSIVE);
285 return true;
286 }
287 }
288
289 NET_INC_STATS_BH(sock_net(sk), foc->len ?
290 LINUX_MIB_TCPFASTOPENPASSIVEFAIL :
291 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
292 *foc = valid_foc;
293 return false;
96} 294}
295EXPORT_SYMBOL(tcp_try_fastopen);
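
The server path above pairs with the client-side MSG_FASTOPEN API, which carries data in the SYN and falls back to a normal handshake when no valid cookie is cached. A minimal client sketch:

#include <netinet/in.h>
#include <sys/socket.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

/* connect and send in one call; the kernel adds the TFO option */
static ssize_t tfo_send(int fd, const struct sockaddr_in *dst,
			const void *buf, size_t len)
{
	return sendto(fd, buf, len, MSG_FASTOPEN,
		      (const struct sockaddr *)dst, sizeof(*dst));
}
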
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b9e7bad77c0..1c4908280d92 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,12 +109,12 @@ static void hstcp_init(struct sock *sk)
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); 109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110} 110}
111 111
112static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 112static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
113{ 113{
114 struct tcp_sock *tp = tcp_sk(sk); 114 struct tcp_sock *tp = tcp_sk(sk);
115 struct hstcp *ca = inet_csk_ca(sk); 115 struct hstcp *ca = inet_csk_ca(sk);
116 116
117 if (!tcp_is_cwnd_limited(sk, in_flight)) 117 if (!tcp_is_cwnd_limited(sk))
118 return; 118 return;
119 119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) 120 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 4a194acfd923..031361311a8b 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,12 +227,12 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
227 return max((tp->snd_cwnd * ca->beta) >> 7, 2U); 227 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
228} 228}
229 229
230static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 230static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
231{ 231{
232 struct tcp_sock *tp = tcp_sk(sk); 232 struct tcp_sock *tp = tcp_sk(sk);
233 struct htcp *ca = inet_csk_ca(sk); 233 struct htcp *ca = inet_csk_ca(sk);
234 234
235 if (!tcp_is_cwnd_limited(sk, in_flight)) 235 if (!tcp_is_cwnd_limited(sk))
236 return; 236 return;
237 237
238 if (tp->snd_cwnd <= tp->snd_ssthresh) 238 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index a15a799bf768..d8f8f05a4951 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -87,8 +87,7 @@ static inline u32 hybla_fraction(u32 odds)
87 * o Give cwnd a new value based on the model proposed 87 * o Give cwnd a new value based on the model proposed
88 * o remember increments <1 88 * o remember increments <1
89 */ 89 */
90static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, 90static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
91 u32 in_flight)
92{ 91{
93 struct tcp_sock *tp = tcp_sk(sk); 92 struct tcp_sock *tp = tcp_sk(sk);
94 struct hybla *ca = inet_csk_ca(sk); 93 struct hybla *ca = inet_csk_ca(sk);
@@ -101,11 +100,11 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
101 ca->minrtt_us = tp->srtt_us; 100 ca->minrtt_us = tp->srtt_us;
102 } 101 }
103 102
104 if (!tcp_is_cwnd_limited(sk, in_flight)) 103 if (!tcp_is_cwnd_limited(sk))
105 return; 104 return;
106 105
107 if (!ca->hybla_en) { 106 if (!ca->hybla_en) {
108 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 107 tcp_reno_cong_avoid(sk, ack, acked);
109 return; 108 return;
110 } 109 }
111 110
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 863d105e3015..5999b3972e64 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -255,8 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
255/* 255/*
256 * Increase window in response to successful acknowledgment. 256 * Increase window in response to successful acknowledgment.
257 */ 257 */
258static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, 258static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
259 u32 in_flight)
260{ 259{
261 struct tcp_sock *tp = tcp_sk(sk); 260 struct tcp_sock *tp = tcp_sk(sk);
262 struct illinois *ca = inet_csk_ca(sk); 261 struct illinois *ca = inet_csk_ca(sk);
@@ -265,7 +264,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
265 update_params(sk); 264 update_params(sk);
266 265
267 /* RFC2861 only increase cwnd if fully utilized */ 266 /* RFC2861 only increase cwnd if fully utilized */
268 if (!tcp_is_cwnd_limited(sk, in_flight)) 267 if (!tcp_is_cwnd_limited(sk))
269 return; 268 return;
270 269
271 /* In slow start */ 270 /* In slow start */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3a26b3b23f16..40661fc1e233 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1167,7 +1167,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1167 } 1167 }
1168 pkt_len = new_len; 1168 pkt_len = new_len;
1169 } 1169 }
1170 err = tcp_fragment(sk, skb, pkt_len, mss); 1170 err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
1171 if (err < 0) 1171 if (err < 0)
1172 return err; 1172 return err;
1173 } 1173 }
@@ -2241,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2241 break; 2241 break;
2242 2242
2243 mss = skb_shinfo(skb)->gso_size; 2243 mss = skb_shinfo(skb)->gso_size;
2244 err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); 2244 err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
2245 mss, GFP_ATOMIC);
2245 if (err < 0) 2246 if (err < 0)
2246 break; 2247 break;
2247 cnt = packets; 2248 cnt = packets;
@@ -2937,10 +2938,11 @@ static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
2937 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); 2938 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
2938} 2939}
2939 2940
2940static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 2941static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
2941{ 2942{
2942 const struct inet_connection_sock *icsk = inet_csk(sk); 2943 const struct inet_connection_sock *icsk = inet_csk(sk);
2943 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight); 2944
2945 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
2944 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; 2946 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
2945} 2947}
2946 2948
@@ -3363,7 +3365,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3363 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3365 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3364 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3366 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3365 bool is_dupack = false; 3367 bool is_dupack = false;
3366 u32 prior_in_flight;
3367 u32 prior_fackets; 3368 u32 prior_fackets;
3368 int prior_packets = tp->packets_out; 3369 int prior_packets = tp->packets_out;
3369 const int prior_unsacked = tp->packets_out - tp->sacked_out; 3370 const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3396,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3396 flag |= FLAG_SND_UNA_ADVANCED; 3397 flag |= FLAG_SND_UNA_ADVANCED;
3397 3398
3398 prior_fackets = tp->fackets_out; 3399 prior_fackets = tp->fackets_out;
3399 prior_in_flight = tcp_packets_in_flight(tp);
3400 3400
3401 /* ts_recent update must be made after we are sure that the packet 3401 /* ts_recent update must be made after we are sure that the packet
3402 * is in window. 3402 * is in window.
@@ -3451,7 +3451,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3451 3451
3452 /* Advance cwnd if state allows */ 3452 /* Advance cwnd if state allows */
3453 if (tcp_may_raise_cwnd(sk, flag)) 3453 if (tcp_may_raise_cwnd(sk, flag))
3454 tcp_cong_avoid(sk, ack, acked, prior_in_flight); 3454 tcp_cong_avoid(sk, ack, acked);
3455 3455
3456 if (tcp_ack_is_dubious(sk, flag)) { 3456 if (tcp_ack_is_dubious(sk, flag)) {
3457 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3457 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -4702,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk)
4702 return -1; 4702 return -1;
4703} 4703}
4704 4704
4705/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
4706 * As additional protections, we do not touch cwnd in retransmission phases,
4707 * and if application hit its sndbuf limit recently.
4708 */
4709void tcp_cwnd_application_limited(struct sock *sk)
4710{
4711 struct tcp_sock *tp = tcp_sk(sk);
4712
4713 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
4714 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
4715 /* Limited by application or receiver window. */
4716 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
4717 u32 win_used = max(tp->snd_cwnd_used, init_win);
4718 if (win_used < tp->snd_cwnd) {
4719 tp->snd_ssthresh = tcp_current_ssthresh(sk);
4720 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
4721 }
4722 tp->snd_cwnd_used = 0;
4723 }
4724 tp->snd_cwnd_stamp = tcp_time_stamp;
4725}
4726
4727static bool tcp_should_expand_sndbuf(const struct sock *sk) 4705static bool tcp_should_expand_sndbuf(const struct sock *sk)
4728{ 4706{
4729 const struct tcp_sock *tp = tcp_sk(sk); 4707 const struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 438f3b95143d..77cccda1ad0c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -336,8 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
336 const int code = icmp_hdr(icmp_skb)->code; 336 const int code = icmp_hdr(icmp_skb)->code;
337 struct sock *sk; 337 struct sock *sk;
338 struct sk_buff *skb; 338 struct sk_buff *skb;
339 struct request_sock *req; 339 struct request_sock *fastopen;
340 __u32 seq; 340 __u32 seq, snd_una;
341 __u32 remaining; 341 __u32 remaining;
342 int err; 342 int err;
343 struct net *net = dev_net(icmp_skb->dev); 343 struct net *net = dev_net(icmp_skb->dev);
@@ -378,12 +378,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
378 378
379 icsk = inet_csk(sk); 379 icsk = inet_csk(sk);
380 tp = tcp_sk(sk); 380 tp = tcp_sk(sk);
381 req = tp->fastopen_rsk;
382 seq = ntohl(th->seq); 381 seq = ntohl(th->seq);
382 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
383 fastopen = tp->fastopen_rsk;
384 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
383 if (sk->sk_state != TCP_LISTEN && 385 if (sk->sk_state != TCP_LISTEN &&
384 !between(seq, tp->snd_una, tp->snd_nxt) && 386 !between(seq, snd_una, tp->snd_nxt)) {
385 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
386 /* For a Fast Open socket, allow seq to be snt_isn. */
387 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 387 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
388 goto out; 388 goto out;
389 } 389 }
@@ -426,11 +426,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
426 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 426 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
427 break; 427 break;
428 if (seq != tp->snd_una || !icsk->icsk_retransmits || 428 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
429 !icsk->icsk_backoff) 429 !icsk->icsk_backoff || fastopen)
430 break; 430 break;
431 431
432 /* XXX (TFO) - revisit the following logic for TFO */
433
434 if (sock_owned_by_user(sk)) 432 if (sock_owned_by_user(sk))
435 break; 433 break;
436 434
@@ -462,14 +460,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
462 goto out; 460 goto out;
463 } 461 }
464 462
465 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
466 * than following the TCP_SYN_RECV case and closing the socket,
467 * we ignore the ICMP error and keep trying like a fully established
468 * socket. Is this the right thing to do?
469 */
470 if (req && req->sk == NULL)
471 goto out;
472
473 switch (sk->sk_state) { 463 switch (sk->sk_state) {
474 struct request_sock *req, **prev; 464 struct request_sock *req, **prev;
475 case TCP_LISTEN: 465 case TCP_LISTEN:
@@ -502,10 +492,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
502 goto out; 492 goto out;
503 493
504 case TCP_SYN_SENT: 494 case TCP_SYN_SENT:
505 case TCP_SYN_RECV: /* Cannot happen. 495 case TCP_SYN_RECV:
506 It can f.e. if SYNs crossed, 496 /* Only in fast or simultaneous open. If a fast open socket is
507 or Fast Open. 497 * already accepted it is treated as a connected one below.
508 */ 498 */
499 if (fastopen && fastopen->sk == NULL)
500 break;
501
509 if (!sock_owned_by_user(sk)) { 502 if (!sock_owned_by_user(sk)) {
510 sk->sk_err = err; 503 sk->sk_err = err;
511 504
@@ -822,7 +815,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
822 */ 815 */
823static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 816static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
824 struct request_sock *req, 817 struct request_sock *req,
825 u16 queue_mapping) 818 u16 queue_mapping,
819 struct tcp_fastopen_cookie *foc)
826{ 820{
827 const struct inet_request_sock *ireq = inet_rsk(req); 821 const struct inet_request_sock *ireq = inet_rsk(req);
828 struct flowi4 fl4; 822 struct flowi4 fl4;
@@ -833,7 +827,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
833 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 827 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
834 return -1; 828 return -1;
835 829
836 skb = tcp_make_synack(sk, dst, req, NULL); 830 skb = tcp_make_synack(sk, dst, req, foc);
837 831
838 if (skb) { 832 if (skb) {
839 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 833 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
@@ -852,7 +846,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
852 846
853static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) 847static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
854{ 848{
855 int res = tcp_v4_send_synack(sk, NULL, req, 0); 849 int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
856 850
857 if (!res) { 851 if (!res) {
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 852 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -1260,187 +1254,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1260}; 1254};
1261#endif 1255#endif
1262 1256
1263static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1264 struct request_sock *req,
1265 struct tcp_fastopen_cookie *foc,
1266 struct tcp_fastopen_cookie *valid_foc)
1267{
1268 bool skip_cookie = false;
1269 struct fastopen_queue *fastopenq;
1270
1271 if (likely(!fastopen_cookie_present(foc))) {
1272 /* See include/net/tcp.h for the meaning of these knobs */
1273 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1274 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1275 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1276 skip_cookie = true; /* no cookie to validate */
1277 else
1278 return false;
1279 }
1280 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1281 /* A FO option is present; bump the counter. */
1282 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1283
1284 /* Make sure the listener has enabled fastopen, and we don't
1285 * exceed the max # of pending TFO requests allowed before trying
1286 * to validating the cookie in order to avoid burning CPU cycles
1287 * unnecessarily.
1288 *
1289 * XXX (TFO) - The implication of checking the max_qlen before
1290 * processing a cookie request is that clients can't differentiate
1291 * between qlen overflow causing Fast Open to be disabled
1292 * temporarily vs a server not supporting Fast Open at all.
1293 */
1294 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1295 fastopenq == NULL || fastopenq->max_qlen == 0)
1296 return false;
1297
1298 if (fastopenq->qlen >= fastopenq->max_qlen) {
1299 struct request_sock *req1;
1300 spin_lock(&fastopenq->lock);
1301 req1 = fastopenq->rskq_rst_head;
1302 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1303 spin_unlock(&fastopenq->lock);
1304 NET_INC_STATS_BH(sock_net(sk),
1305 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1306 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1307 foc->len = -1;
1308 return false;
1309 }
1310 fastopenq->rskq_rst_head = req1->dl_next;
1311 fastopenq->qlen--;
1312 spin_unlock(&fastopenq->lock);
1313 reqsk_free(req1);
1314 }
1315 if (skip_cookie) {
1316 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1317 return true;
1318 }
1319
1320 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1321 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1322 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1323 ip_hdr(skb)->daddr, valid_foc);
1324 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1325 memcmp(&foc->val[0], &valid_foc->val[0],
1326 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1327 return false;
1328 valid_foc->len = -1;
1329 }
1330 /* Acknowledge the data received from the peer. */
1331 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1332 return true;
1333 } else if (foc->len == 0) { /* Client requesting a cookie */
1334 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1335 ip_hdr(skb)->daddr, valid_foc);
1336 NET_INC_STATS_BH(sock_net(sk),
1337 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1338 } else {
1339 /* Client sent a cookie with wrong size. Treat it
1340 * the same as invalid and return a valid one.
1341 */
1342 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1343 ip_hdr(skb)->daddr, valid_foc);
1344 }
1345 return false;
1346}
1347
1348static int tcp_v4_conn_req_fastopen(struct sock *sk,
1349 struct sk_buff *skb,
1350 struct sk_buff *skb_synack,
1351 struct request_sock *req)
1352{
1353 struct tcp_sock *tp = tcp_sk(sk);
1354 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1355 const struct inet_request_sock *ireq = inet_rsk(req);
1356 struct sock *child;
1357 int err;
1358
1359 req->num_retrans = 0;
1360 req->num_timeout = 0;
1361 req->sk = NULL;
1362
1363 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1364 if (child == NULL) {
1365 NET_INC_STATS_BH(sock_net(sk),
1366 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1367 kfree_skb(skb_synack);
1368 return -1;
1369 }
1370 err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
1371 ireq->ir_rmt_addr, ireq->opt);
1372 err = net_xmit_eval(err);
1373 if (!err)
1374 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1375 /* XXX (TFO) - is it ok to ignore error and continue? */
1376
1377 spin_lock(&queue->fastopenq->lock);
1378 queue->fastopenq->qlen++;
1379 spin_unlock(&queue->fastopenq->lock);
1380
1381 /* Initialize the child socket. Have to fix some values to take
1382 * into account the child is a Fast Open socket and is created
1383 * only out of the bits carried in the SYN packet.
1384 */
1385 tp = tcp_sk(child);
1386
1387 tp->fastopen_rsk = req;
1388 /* Do a hold on the listner sk so that if the listener is being
1389 * closed, the child that has been accepted can live on and still
1390 * access listen_lock.
1391 */
1392 sock_hold(sk);
1393 tcp_rsk(req)->listener = sk;
1394
1395 /* RFC1323: The window in SYN & SYN/ACK segments is never
1396 * scaled. So correct it appropriately.
1397 */
1398 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1399
1400 /* Activate the retrans timer so that SYNACK can be retransmitted.
1401 * The request socket is not added to the SYN table of the parent
1402 * because it's been added to the accept queue directly.
1403 */
1404 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1405 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1406
1407 /* Add the child socket directly into the accept queue */
1408 inet_csk_reqsk_queue_add(sk, req, child);
1409
1410 /* Now finish processing the fastopen child socket. */
1411 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1412 tcp_init_congestion_control(child);
1413 tcp_mtup_init(child);
1414 tcp_init_metrics(child);
1415 tcp_init_buffer_space(child);
1416
1417 /* Queue the data carried in the SYN packet. We need to first
1418 * bump skb's refcnt because the caller will attempt to free it.
1419 *
1420 * XXX (TFO) - we honor a zero-payload TFO request for now.
1421 * (Any reason not to?)
1422 */
1423 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1424 /* Don't queue the skb if there is no payload in SYN.
1425 * XXX (TFO) - How about SYN+FIN?
1426 */
1427 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1428 } else {
1429 skb = skb_get(skb);
1430 skb_dst_drop(skb);
1431 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1432 skb_set_owner_r(skb, child);
1433 __skb_queue_tail(&child->sk_receive_queue, skb);
1434 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1435 tp->syn_data_acked = 1;
1436 }
1437 sk->sk_data_ready(sk);
1438 bh_unlock_sock(child);
1439 sock_put(child);
1440 WARN_ON(req->sk == NULL);
1441 return 0;
1442}
1443
1444int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1257int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1445{ 1258{
1446 struct tcp_options_received tmp_opt; 1259 struct tcp_options_received tmp_opt;
@@ -1451,12 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1451 __be32 saddr = ip_hdr(skb)->saddr; 1264 __be32 saddr = ip_hdr(skb)->saddr;
1452 __be32 daddr = ip_hdr(skb)->daddr; 1265 __be32 daddr = ip_hdr(skb)->daddr;
1453 __u32 isn = TCP_SKB_CB(skb)->when; 1266 __u32 isn = TCP_SKB_CB(skb)->when;
1454 bool want_cookie = false; 1267 bool want_cookie = false, fastopen;
1455 struct flowi4 fl4; 1268 struct flowi4 fl4;
1456 struct tcp_fastopen_cookie foc = { .len = -1 }; 1269 struct tcp_fastopen_cookie foc = { .len = -1 };
1457 struct tcp_fastopen_cookie valid_foc = { .len = -1 }; 1270 int err;
1458 struct sk_buff *skb_synack;
1459 int do_fastopen;
1460 1271
1461 /* Never answer to SYNs sent to broadcast or multicast */ 1272 /* Never answer to SYNs sent to broadcast or multicast */
1462 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1273 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1507,6 +1318,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1507 ireq->ir_rmt_addr = saddr; 1318 ireq->ir_rmt_addr = saddr;
1508 ireq->no_srccheck = inet_sk(sk)->transparent; 1319 ireq->no_srccheck = inet_sk(sk)->transparent;
1509 ireq->opt = tcp_v4_save_options(skb); 1320 ireq->opt = tcp_v4_save_options(skb);
1321 ireq->ir_mark = inet_request_mark(sk, skb);
1510 1322
1511 if (security_inet_conn_request(sk, skb, req)) 1323 if (security_inet_conn_request(sk, skb, req))
1512 goto drop_and_free; 1324 goto drop_and_free;
@@ -1555,52 +1367,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1555 1367
1556 isn = tcp_v4_init_sequence(skb); 1368 isn = tcp_v4_init_sequence(skb);
1557 } 1369 }
1558 tcp_rsk(req)->snt_isn = isn; 1370 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1559
1560 if (dst == NULL) {
1561 dst = inet_csk_route_req(sk, &fl4, req);
1562 if (dst == NULL)
1563 goto drop_and_free;
1564 }
1565 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1566
1567 /* We don't call tcp_v4_send_synack() directly because we need
1568 * to make sure a child socket can be created successfully before
1569 * sending back synack!
1570 *
1571 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1572 * (or better yet, call tcp_send_synack() in the child context
1573 * directly, but will have to fix bunch of other code first)
1574 * after syn_recv_sock() except one will need to first fix the
1575 * latter to remove its dependency on the current implementation
1576 * of tcp_v4_send_synack()->tcp_select_initial_window().
1577 */
1578 skb_synack = tcp_make_synack(sk, dst, req,
1579 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1580
1581 if (skb_synack) {
1582 __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1583 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1584 } else
1585 goto drop_and_free; 1371 goto drop_and_free;
1586 1372
1587 if (likely(!do_fastopen)) { 1373 tcp_rsk(req)->snt_isn = isn;
1588 int err; 1374 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1589 err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, 1375 tcp_openreq_init_rwin(req, sk, dst);
1590 ireq->ir_rmt_addr, ireq->opt); 1376 fastopen = !want_cookie &&
1591 err = net_xmit_eval(err); 1377 tcp_try_fastopen(sk, skb, req, &foc, dst);
1378 err = tcp_v4_send_synack(sk, dst, req,
1379 skb_get_queue_mapping(skb), &foc);
1380 if (!fastopen) {
1592 if (err || want_cookie) 1381 if (err || want_cookie)
1593 goto drop_and_free; 1382 goto drop_and_free;
1594 1383
1595 tcp_rsk(req)->snt_synack = tcp_time_stamp; 1384 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1596 tcp_rsk(req)->listener = NULL; 1385 tcp_rsk(req)->listener = NULL;
1597 /* Add the request_sock to the SYN table */
1598 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); 1386 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1599 if (fastopen_cookie_present(&foc) && foc.len != 0) 1387 }
1600 NET_INC_STATS_BH(sock_net(sk),
1601 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1602 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1603 goto drop_and_free;
1604 1388
1605 return 0; 1389 return 0;
1606 1390
@@ -1744,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1744 return sk; 1528 return sk;
1745} 1529}
1746 1530
1747static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1748{
1749 const struct iphdr *iph = ip_hdr(skb);
1750
1751 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1752 if (!tcp_v4_check(skb->len, iph->saddr,
1753 iph->daddr, skb->csum)) {
1754 skb->ip_summed = CHECKSUM_UNNECESSARY;
1755 return 0;
1756 }
1757 }
1758
1759 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1760 skb->len, IPPROTO_TCP, 0);
1761
1762 if (skb->len <= 76) {
1763 return __skb_checksum_complete(skb);
1764 }
1765 return 0;
1766}
1767
1768
1769/* The socket must have its spinlock held when we get 1531/* The socket must have its spinlock held when we get
1770 * here. 1532 * here.
1771 * 1533 *
@@ -1960,7 +1722,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
1960 * Packet length and doff are validated by header prediction, 1722 * Packet length and doff are validated by header prediction,
1961 * provided case of th->doff==0 is eliminated. 1723 * provided case of th->doff==0 is eliminated.
1962 * So, we defer the checks. */ 1724 * So, we defer the checks. */
1963 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) 1725
1726 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1964 goto csum_error; 1727 goto csum_error;
1965 1728
1966 th = tcp_hdr(skb); 1729 th = tcp_hdr(skb);
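
The open-coded tcp_v4_checksum_init() gives way to the generic skb_checksum_init(), parameterized by a pseudo-header routine. inet_compute_pseudo presumably wraps the same csum_tcpudp_nofold() call the removed code used:

static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto)
{
	return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
				  skb->len, proto, 0);
}
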
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index c9aecae31327..1e70fa8fa793 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk)
115 * Will only call newReno CA when away from inference. 115 * Will only call newReno CA when away from inference.
116 * From TCP-LP's paper, this will be handled in additive increasement. 116 * From TCP-LP's paper, this will be handled in additive increasement.
117 */ 117 */
118static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked, 118static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
119 u32 in_flight)
120{ 119{
121 struct lp *lp = inet_csk_ca(sk); 120 struct lp *lp = inet_csk_ca(sk);
122 121
123 if (!(lp->flag & LP_WITHIN_INF)) 122 if (!(lp->flag & LP_WITHIN_INF))
124 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 123 tcp_reno_cong_avoid(sk, ack, acked);
125} 124}
126 125
127/** 126/**
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index dcaf72f10216..4fe041805989 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1159,10 +1159,7 @@ static void __net_exit tcp_net_metrics_exit(struct net *net)
1159 tm = next; 1159 tm = next;
1160 } 1160 }
1161 } 1161 }
1162 if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) 1162 kvfree(net->ipv4.tcp_metrics_hash);
1163 vfree(net->ipv4.tcp_metrics_hash);
1164 else
1165 kfree(net->ipv4.tcp_metrics_hash);
1166} 1163}
1167 1164
1168static __net_initdata struct pernet_operations tcp_net_metrics_ops = { 1165static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
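
kvfree() collapses the vmalloc-versus-kmalloc branch into one call; the helper amounts to:

void kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}
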
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 05c1b155251d..e68e0d4af6c9 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -362,6 +362,37 @@ void tcp_twsk_destructor(struct sock *sk)
362} 362}
363EXPORT_SYMBOL_GPL(tcp_twsk_destructor); 363EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
364 364
365void tcp_openreq_init_rwin(struct request_sock *req,
366 struct sock *sk, struct dst_entry *dst)
367{
368 struct inet_request_sock *ireq = inet_rsk(req);
369 struct tcp_sock *tp = tcp_sk(sk);
370 __u8 rcv_wscale;
371 int mss = dst_metric_advmss(dst);
372
373 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
374 mss = tp->rx_opt.user_mss;
375
376 /* Set this up on the first call only */
377 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
378
379 /* limit the window selection if the user enforces a smaller rx buffer */
380 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
381 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
382 req->window_clamp = tcp_full_space(sk);
383
384 /* tcp_full_space because it is guaranteed to be the first packet */
385 tcp_select_initial_window(tcp_full_space(sk),
386 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
387 &req->rcv_wnd,
388 &req->window_clamp,
389 ireq->wscale_ok,
390 &rcv_wscale,
391 dst_metric(dst, RTAX_INITRWND));
392 ireq->rcv_wscale = rcv_wscale;
393}
394EXPORT_SYMBOL(tcp_openreq_init_rwin);
395
365static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, 396static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
366 struct request_sock *req) 397 struct request_sock *req)
367{ 398{
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index b92b81718ca4..4e86c59ec7f7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -57,10 +57,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
57 SKB_GSO_TCP_ECN | 57 SKB_GSO_TCP_ECN |
58 SKB_GSO_TCPV6 | 58 SKB_GSO_TCPV6 |
59 SKB_GSO_GRE | 59 SKB_GSO_GRE |
60 SKB_GSO_GRE_CSUM |
60 SKB_GSO_IPIP | 61 SKB_GSO_IPIP |
61 SKB_GSO_SIT | 62 SKB_GSO_SIT |
62 SKB_GSO_MPLS | 63 SKB_GSO_MPLS |
63 SKB_GSO_UDP_TUNNEL | 64 SKB_GSO_UDP_TUNNEL |
65 SKB_GSO_UDP_TUNNEL_CSUM |
64 0) || 66 0) ||
65 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) 67 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
66 goto out; 68 goto out;
@@ -97,9 +99,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
97 th->check = newcheck; 99 th->check = newcheck;
98 100
99 if (skb->ip_summed != CHECKSUM_PARTIAL) 101 if (skb->ip_summed != CHECKSUM_PARTIAL)
100 th->check = 102 th->check = gso_make_checksum(skb, ~th->check);
101 csum_fold(csum_partial(skb_transport_header(skb),
102 thlen, skb->csum));
103 103
104 seq += mss; 104 seq += mss;
105 if (copy_destructor) { 105 if (copy_destructor) {
@@ -133,8 +133,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
133 th->check = ~csum_fold((__force __wsum)((__force u32)th->check + 133 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
134 (__force u32)delta)); 134 (__force u32)delta));
135 if (skb->ip_summed != CHECKSUM_PARTIAL) 135 if (skb->ip_summed != CHECKSUM_PARTIAL)
136 th->check = csum_fold(csum_partial(skb_transport_header(skb), 136 th->check = gso_make_checksum(skb, ~th->check);
137 thlen, skb->csum));
138out: 137out:
139 return segs; 138 return segs;
140} 139}
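gso_make_checksum() computes the segment checksum from the offset recorded by the GSO layer, taking ~th->check as its seed so the pseudo-header value already written into the header is folded in; this works because ones'-complement sums are associative. A standalone illustration of the fold step, not the kernel implementation:

#include <stdint.h>

static uint16_t csum_fold32(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carry into low 16 bits */
	sum = (sum & 0xffff) + (sum >> 16);	/* second fold catches the last carry */
	return (uint16_t)~sum;
}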
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2d340bd2cd3d..d92bce0ea24e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
627 if (unlikely(!ireq->tstamp_ok)) 627 if (unlikely(!ireq->tstamp_ok))
628 remaining -= TCPOLEN_SACKPERM_ALIGNED; 628 remaining -= TCPOLEN_SACKPERM_ALIGNED;
629 } 629 }
630 if (foc != NULL) { 630 if (foc != NULL && foc->len >= 0) {
631 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; 631 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
632 need = (need + 3) & ~3U; /* Align to 32 bits */ 632 need = (need + 3) & ~3U; /* Align to 32 bits */
633 if (remaining >= need) { 633 if (remaining >= need) {
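The alignment step above rounds the Fast Open option up to a 32-bit boundary because TCP options occupy whole words in the header. Assuming the usual 4-byte experimental-option base (kind, length, 2-byte magic), an 8-byte cookie needs 4 + 8 = 12 bytes, already aligned, while a 7-byte cookie would round 11 up to 12 and carry one byte of padding.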
@@ -878,15 +878,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
878 BUG_ON(!skb || !tcp_skb_pcount(skb)); 878 BUG_ON(!skb || !tcp_skb_pcount(skb));
879 879
880 if (clone_it) { 880 if (clone_it) {
881 const struct sk_buff *fclone = skb + 1;
882
883 skb_mstamp_get(&skb->skb_mstamp); 881 skb_mstamp_get(&skb->skb_mstamp);
884 882
885 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
886 fclone->fclone == SKB_FCLONE_CLONE))
887 NET_INC_STATS(sock_net(sk),
888 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
889
890 if (unlikely(skb_cloned(skb))) 883 if (unlikely(skb_cloned(skb)))
891 skb = pskb_copy(skb, gfp_mask); 884 skb = pskb_copy(skb, gfp_mask);
892 else 885 else
@@ -1081,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
1081 * Remember, these are still headerless SKBs at this point. 1074 * Remember, these are still headerless SKBs at this point.
1082 */ 1075 */
1083int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, 1076int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1084 unsigned int mss_now) 1077 unsigned int mss_now, gfp_t gfp)
1085{ 1078{
1086 struct tcp_sock *tp = tcp_sk(sk); 1079 struct tcp_sock *tp = tcp_sk(sk);
1087 struct sk_buff *buff; 1080 struct sk_buff *buff;
@@ -1096,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1096 if (nsize < 0) 1089 if (nsize < 0)
1097 nsize = 0; 1090 nsize = 0;
1098 1091
1099 if (skb_unclone(skb, GFP_ATOMIC)) 1092 if (skb_unclone(skb, gfp))
1100 return -ENOMEM; 1093 return -ENOMEM;
1101 1094
1102 /* Get a new skb... force flag on. */ 1095 /* Get a new skb... force flag on. */
1103 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); 1096 buff = sk_stream_alloc_skb(sk, nsize, gfp);
1104 if (buff == NULL) 1097 if (buff == NULL)
1105 return -ENOMEM; /* We'll just try again later. */ 1098 return -ENOMEM; /* We'll just try again later. */
1106 1099
@@ -1387,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)
1387 return mss_now; 1380 return mss_now;
1388} 1381}
1389 1382
1390/* Congestion window validation. (RFC2861) */ 1383/* RFC2861, slow part. Adjust cwnd after it has not been full for one RTO.
1391static void tcp_cwnd_validate(struct sock *sk) 1384 * As additional protections, we do not touch cwnd in retransmission phases,
1385 * or if the application has hit its sndbuf limit recently.
1386 */
1387static void tcp_cwnd_application_limited(struct sock *sk)
1388{
1389 struct tcp_sock *tp = tcp_sk(sk);
1390
1391 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1392 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1393 /* Limited by application or receiver window. */
1394 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
1395 u32 win_used = max(tp->snd_cwnd_used, init_win);
1396 if (win_used < tp->snd_cwnd) {
1397 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1398 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
1399 }
1400 tp->snd_cwnd_used = 0;
1401 }
1402 tp->snd_cwnd_stamp = tcp_time_stamp;
1403}
1404
1405static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1392{ 1406{
1393 struct tcp_sock *tp = tcp_sk(sk); 1407 struct tcp_sock *tp = tcp_sk(sk);
1394 1408
1395 if (tp->packets_out >= tp->snd_cwnd) { 1409 /* Track the maximum number of outstanding packets in each
1410 * window, and remember whether we were cwnd-limited then.
1411 */
1412 if (!before(tp->snd_una, tp->max_packets_seq) ||
1413 tp->packets_out > tp->max_packets_out) {
1414 tp->max_packets_out = tp->packets_out;
1415 tp->max_packets_seq = tp->snd_nxt;
1416 tp->is_cwnd_limited = is_cwnd_limited;
1417 }
1418
1419 if (tcp_is_cwnd_limited(sk)) {
1396 /* Network is fed fully. */ 1420 /* Network is fed fully. */
1397 tp->snd_cwnd_used = 0; 1421 tp->snd_cwnd_used = 0;
1398 tp->snd_cwnd_stamp = tcp_time_stamp; 1422 tp->snd_cwnd_stamp = tcp_time_stamp;
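A quick worked example of the averaging in tcp_cwnd_application_limited(): with snd_cwnd = 100 but only win_used = 40 segments ever in flight, ssthresh is saved first and cwnd shrinks to (100 + 40) >> 1 = 70, decaying the unused headroom by half per RTO rather than collapsing it outright.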
@@ -1601,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1601 1625
1602 /* All of a TSO frame must be composed of paged data. */ 1626 /* All of a TSO frame must be composed of paged data. */
1603 if (skb->len != skb->data_len) 1627 if (skb->len != skb->data_len)
1604 return tcp_fragment(sk, skb, len, mss_now); 1628 return tcp_fragment(sk, skb, len, mss_now, gfp);
1605 1629
1606 buff = sk_stream_alloc_skb(sk, 0, gfp); 1630 buff = sk_stream_alloc_skb(sk, 0, gfp);
1607 if (unlikely(buff == NULL)) 1631 if (unlikely(buff == NULL))
@@ -1644,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1644 * 1668 *
1645 * This algorithm is from John Heffner. 1669 * This algorithm is from John Heffner.
1646 */ 1670 */
1647static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) 1671static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1672 bool *is_cwnd_limited)
1648{ 1673{
1649 struct tcp_sock *tp = tcp_sk(sk); 1674 struct tcp_sock *tp = tcp_sk(sk);
1650 const struct inet_connection_sock *icsk = inet_csk(sk); 1675 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1708,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1708 if (!tp->tso_deferred) 1733 if (!tp->tso_deferred)
1709 tp->tso_deferred = 1 | (jiffies << 1); 1734 tp->tso_deferred = 1 | (jiffies << 1);
1710 1735
1736 if (cong_win < send_win && cong_win < skb->len)
1737 *is_cwnd_limited = true;
1738
1711 return true; 1739 return true;
1712 1740
1713send_now: 1741send_now:
@@ -1868,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1868 unsigned int tso_segs, sent_pkts; 1896 unsigned int tso_segs, sent_pkts;
1869 int cwnd_quota; 1897 int cwnd_quota;
1870 int result; 1898 int result;
1899 bool is_cwnd_limited = false;
1871 1900
1872 sent_pkts = 0; 1901 sent_pkts = 0;
1873 1902
@@ -1892,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1892 1921
1893 cwnd_quota = tcp_cwnd_test(tp, skb); 1922 cwnd_quota = tcp_cwnd_test(tp, skb);
1894 if (!cwnd_quota) { 1923 if (!cwnd_quota) {
1924 is_cwnd_limited = true;
1895 if (push_one == 2) 1925 if (push_one == 2)
1896 /* Force out a loss probe pkt. */ 1926 /* Force out a loss probe pkt. */
1897 cwnd_quota = 1; 1927 cwnd_quota = 1;
@@ -1908,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1908 nonagle : TCP_NAGLE_PUSH)))) 1938 nonagle : TCP_NAGLE_PUSH))))
1909 break; 1939 break;
1910 } else { 1940 } else {
1911 if (!push_one && tcp_tso_should_defer(sk, skb)) 1941 if (!push_one &&
1942 tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
1912 break; 1943 break;
1913 } 1944 }
1914 1945
@@ -1973,7 +2004,7 @@ repair:
1973 /* Send one loss probe per tail loss episode. */ 2004 /* Send one loss probe per tail loss episode. */
1974 if (push_one != 2) 2005 if (push_one != 2)
1975 tcp_schedule_loss_probe(sk); 2006 tcp_schedule_loss_probe(sk);
1976 tcp_cwnd_validate(sk); 2007 tcp_cwnd_validate(sk, is_cwnd_limited);
1977 return false; 2008 return false;
1978 } 2009 }
1979 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); 2010 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -2037,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2037 return true; 2068 return true;
2038} 2069}
2039 2070
2071/* Thanks to skb fast clones, we can detect if a prior transmit of
2072 * a packet is still in a qdisc or driver queue.
2073 * In this case, there is very little point in doing a retransmit!
2074 * Note: This is called from BH context only.
2075 */
2076static bool skb_still_in_host_queue(const struct sock *sk,
2077 const struct sk_buff *skb)
2078{
2079 const struct sk_buff *fclone = skb + 1;
2080
2081 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
2082 fclone->fclone == SKB_FCLONE_CLONE)) {
2083 NET_INC_STATS_BH(sock_net(sk),
2084 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2085 return true;
2086 }
2087 return false;
2088}
2089
2040/* When probe timeout (PTO) fires, send a new segment if one exists, else 2090/* When probe timeout (PTO) fires, send a new segment if one exists, else
2041 * retransmit the last segment. 2091 * retransmit the last segment.
2042 */ 2092 */
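skb_still_in_host_queue() leans on the fast-clone allocator's layout: the original skb and its clone come from one back-to-back allocation, so skb + 1 addresses the companion. A conceptual view of that pairing, not the kernel's exact struct definition:

struct example_fclone_pair {	/* illustrative only */
	struct sk_buff skb1;	/* SKB_FCLONE_ORIG: owned by TCP */
	struct sk_buff skb2;	/* SKB_FCLONE_CLONE while sitting in a qdisc */
};

While skb2 still carries SKB_FCLONE_CLONE, the prior transmit has not left the host, so retransmitting now would only queue a duplicate behind it.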
@@ -2062,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk)
2062 if (WARN_ON(!skb)) 2112 if (WARN_ON(!skb))
2063 goto rearm_timer; 2113 goto rearm_timer;
2064 2114
2115 if (skb_still_in_host_queue(sk, skb))
2116 goto rearm_timer;
2117
2065 pcount = tcp_skb_pcount(skb); 2118 pcount = tcp_skb_pcount(skb);
2066 if (WARN_ON(!pcount)) 2119 if (WARN_ON(!pcount))
2067 goto rearm_timer; 2120 goto rearm_timer;
2068 2121
2069 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { 2122 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2070 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) 2123 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
2124 GFP_ATOMIC)))
2071 goto rearm_timer; 2125 goto rearm_timer;
2072 skb = tcp_write_queue_tail(sk); 2126 skb = tcp_write_queue_tail(sk);
2073 } 2127 }
@@ -2075,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk)
2075 if (WARN_ON(!skb || !tcp_skb_pcount(skb))) 2129 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2076 goto rearm_timer; 2130 goto rearm_timer;
2077 2131
2078 /* Probe with zero data doesn't trigger fast recovery. */ 2132 err = __tcp_retransmit_skb(sk, skb);
2079 if (skb->len > 0)
2080 err = __tcp_retransmit_skb(sk, skb);
2081 2133
2082 /* Record snd_nxt for loss detection. */ 2134 /* Record snd_nxt for loss detection. */
2083 if (likely(!err)) 2135 if (likely(!err))
@@ -2383,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2383 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) 2435 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
2384 return -EAGAIN; 2436 return -EAGAIN;
2385 2437
2438 if (skb_still_in_host_queue(sk, skb))
2439 return -EBUSY;
2440
2386 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { 2441 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
2387 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) 2442 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
2388 BUG(); 2443 BUG();
@@ -2405,7 +2460,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2405 return -EAGAIN; 2460 return -EAGAIN;
2406 2461
2407 if (skb->len > cur_mss) { 2462 if (skb->len > cur_mss) {
2408 if (tcp_fragment(sk, skb, cur_mss, cur_mss)) 2463 if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
2409 return -ENOMEM; /* We'll try again later. */ 2464 return -ENOMEM; /* We'll try again later. */
2410 } else { 2465 } else {
2411 int oldpcount = tcp_skb_pcount(skb); 2466 int oldpcount = tcp_skb_pcount(skb);
@@ -2476,7 +2531,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2476 * see tcp_input.c tcp_sacktag_write_queue(). 2531 * see tcp_input.c tcp_sacktag_write_queue().
2477 */ 2532 */
2478 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; 2533 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
2479 } else { 2534 } else if (err != -EBUSY) {
2480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); 2535 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2481 } 2536 }
2482 return err; 2537 return err;
@@ -2754,27 +2809,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2754 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2809 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2755 mss = tp->rx_opt.user_mss; 2810 mss = tp->rx_opt.user_mss;
2756 2811
2757 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2758 __u8 rcv_wscale;
2759 /* Set this up on the first call only */
2760 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2761
2762 /* limit the window selection if the user enforce a smaller rx buffer */
2763 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2764 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2765 req->window_clamp = tcp_full_space(sk);
2766
2767 /* tcp_full_space because it is guaranteed to be the first packet */
2768 tcp_select_initial_window(tcp_full_space(sk),
2769 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2770 &req->rcv_wnd,
2771 &req->window_clamp,
2772 ireq->wscale_ok,
2773 &rcv_wscale,
2774 dst_metric(dst, RTAX_INITRWND));
2775 ireq->rcv_wscale = rcv_wscale;
2776 }
2777
2778 memset(&opts, 0, sizeof(opts)); 2812 memset(&opts, 0, sizeof(opts));
2779#ifdef CONFIG_SYN_COOKIES 2813#ifdef CONFIG_SYN_COOKIES
2780 if (unlikely(req->cookie_ts)) 2814 if (unlikely(req->cookie_ts))
@@ -3207,7 +3241,7 @@ int tcp_write_wakeup(struct sock *sk)
3207 skb->len > mss) { 3241 skb->len > mss) {
3208 seg_size = min(seg_size, mss); 3242 seg_size = min(seg_size, mss);
3209 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 3243 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3210 if (tcp_fragment(sk, skb, seg_size, mss)) 3244 if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
3211 return -1; 3245 return -1;
3212 } else if (!tcp_skb_pcount(skb)) 3246 } else if (!tcp_skb_pcount(skb))
3213 tcp_set_skb_tso_segs(sk, skb, mss); 3247 tcp_set_skb_tso_segs(sk, skb, mss);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 0ac50836da4d..8250949b8853 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,12 +15,11 @@
15#define TCP_SCALABLE_AI_CNT 50U 15#define TCP_SCALABLE_AI_CNT 50U
16#define TCP_SCALABLE_MD_SCALE 3 16#define TCP_SCALABLE_MD_SCALE 3
17 17
18static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked, 18static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
19 u32 in_flight)
20{ 19{
21 struct tcp_sock *tp = tcp_sk(sk); 20 struct tcp_sock *tp = tcp_sk(sk);
22 21
23 if (!tcp_is_cwnd_limited(sk, in_flight)) 22 if (!tcp_is_cwnd_limited(sk))
24 return; 23 return;
25 24
26 if (tp->snd_cwnd <= tp->snd_ssthresh) 25 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 48539fff6357..9a5e05f27f4f 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,14 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
163 return min(tp->snd_ssthresh, tp->snd_cwnd-1); 163 return min(tp->snd_ssthresh, tp->snd_cwnd-1);
164} 164}
165 165
166static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, 166static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
167 u32 in_flight)
168{ 167{
169 struct tcp_sock *tp = tcp_sk(sk); 168 struct tcp_sock *tp = tcp_sk(sk);
170 struct vegas *vegas = inet_csk_ca(sk); 169 struct vegas *vegas = inet_csk_ca(sk);
171 170
172 if (!vegas->doing_vegas_now) { 171 if (!vegas->doing_vegas_now) {
173 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 172 tcp_reno_cong_avoid(sk, ack, acked);
174 return; 173 return;
175 } 174 }
176 175
@@ -195,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
195 /* We don't have enough RTT samples to do the Vegas 194 /* We don't have enough RTT samples to do the Vegas
196 * calculation, so we'll behave like Reno. 195 * calculation, so we'll behave like Reno.
197 */ 196 */
198 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 197 tcp_reno_cong_avoid(sk, ack, acked);
199 } else { 198 } else {
200 u32 rtt, diff; 199 u32 rtt, diff;
201 u64 target_cwnd; 200 u64 target_cwnd;
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 1b8e28fcd7e1..27b9825753d1 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,19 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
114 tcp_veno_init(sk); 114 tcp_veno_init(sk);
115} 115}
116 116
117static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, 117static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
118 u32 in_flight)
119{ 118{
120 struct tcp_sock *tp = tcp_sk(sk); 119 struct tcp_sock *tp = tcp_sk(sk);
121 struct veno *veno = inet_csk_ca(sk); 120 struct veno *veno = inet_csk_ca(sk);
122 121
123 if (!veno->doing_veno_now) { 122 if (!veno->doing_veno_now) {
124 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 123 tcp_reno_cong_avoid(sk, ack, acked);
125 return; 124 return;
126 } 125 }
127 126
128 /* limited by applications */ 127 /* limited by applications */
129 if (!tcp_is_cwnd_limited(sk, in_flight)) 128 if (!tcp_is_cwnd_limited(sk))
130 return; 129 return;
131 130
132 /* We do the Veno calculations only if we got enough rtt samples */ 131 /* We do the Veno calculations only if we got enough rtt samples */
@@ -134,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
134 /* We don't have enough rtt samples to do the Veno 133 /* We don't have enough rtt samples to do the Veno
135 * calculation, so we'll behave like Reno. 134 * calculation, so we'll behave like Reno.
136 */ 135 */
137 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 136 tcp_reno_cong_avoid(sk, ack, acked);
138 } else { 137 } else {
139 u64 target_cwnd; 138 u64 target_cwnd;
140 u32 rtt; 139 u32 rtt;
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 5ede0e727945..599b79b8eac0 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -69,13 +69,12 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
69 tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); 69 tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
70} 70}
71 71
72static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked, 72static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
73 u32 in_flight)
74{ 73{
75 struct tcp_sock *tp = tcp_sk(sk); 74 struct tcp_sock *tp = tcp_sk(sk);
76 struct yeah *yeah = inet_csk_ca(sk); 75 struct yeah *yeah = inet_csk_ca(sk);
77 76
78 if (!tcp_is_cwnd_limited(sk, in_flight)) 77 if (!tcp_is_cwnd_limited(sk))
79 return; 78 return;
80 79
81 if (tp->snd_cwnd <= tp->snd_ssthresh) 80 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4468e1adc094..185ed3e59802 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
246 do { 246 do {
247 if (low <= snum && snum <= high && 247 if (low <= snum && snum <= high &&
248 !test_bit(snum >> udptable->log, bitmap) && 248 !test_bit(snum >> udptable->log, bitmap) &&
249 !inet_is_reserved_local_port(snum)) 249 !inet_is_local_reserved_port(net, snum))
250 goto found; 250 goto found;
251 snum += rand; 251 snum += rand;
252 } while (snum != first); 252 } while (snum != first);
@@ -727,13 +727,12 @@ EXPORT_SYMBOL(udp_flush_pending_frames);
727void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) 727void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
728{ 728{
729 struct udphdr *uh = udp_hdr(skb); 729 struct udphdr *uh = udp_hdr(skb);
730 struct sk_buff *frags = skb_shinfo(skb)->frag_list;
731 int offset = skb_transport_offset(skb); 730 int offset = skb_transport_offset(skb);
732 int len = skb->len - offset; 731 int len = skb->len - offset;
733 int hlen = len; 732 int hlen = len;
734 __wsum csum = 0; 733 __wsum csum = 0;
735 734
736 if (!frags) { 735 if (!skb_has_frag_list(skb)) {
737 /* 736 /*
738 * Only one fragment on the socket. 737 * Only one fragment on the socket.
739 */ 738 */
@@ -742,15 +741,17 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
742 uh->check = ~csum_tcpudp_magic(src, dst, len, 741 uh->check = ~csum_tcpudp_magic(src, dst, len,
743 IPPROTO_UDP, 0); 742 IPPROTO_UDP, 0);
744 } else { 743 } else {
744 struct sk_buff *frags;
745
745 /* 746 /*
746 * HW-checksum won't work as there are two or more 747 * HW-checksum won't work as there are two or more
747 * fragments on the socket so that all csums of sk_buffs 748 * fragments on the socket so that all csums of sk_buffs
748 * should be together 749 * should be together
749 */ 750 */
750 do { 751 skb_walk_frags(skb, frags) {
751 csum = csum_add(csum, frags->csum); 752 csum = csum_add(csum, frags->csum);
752 hlen -= frags->len; 753 hlen -= frags->len;
753 } while ((frags = frags->next)); 754 }
754 755
755 csum = skb_checksum(skb, offset, hlen, csum); 756 csum = skb_checksum(skb, offset, hlen, csum);
756 skb->ip_summed = CHECKSUM_NONE; 757 skb->ip_summed = CHECKSUM_NONE;
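skb_walk_frags() replaces the open-coded do/while over skb_shinfo(skb)->frag_list. Its expansion is essentially the loop below (paraphrased from skbuff.h; check the tree you build against):

#define skb_walk_frags(skb, iter) \
	for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)

Because the macro owns the iteration, the function-scope frags variable the old loop needed can move into the else branch, which is the other half of this hunk.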
@@ -762,6 +763,43 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
762} 763}
763EXPORT_SYMBOL_GPL(udp4_hwcsum); 764EXPORT_SYMBOL_GPL(udp4_hwcsum);
764 765
766/* Function to set the UDP checksum for an IPv4 UDP packet. This is intended
767 * for simple cases, such as setting the checksum for a UDP tunnel.
768 */
769void udp_set_csum(bool nocheck, struct sk_buff *skb,
770 __be32 saddr, __be32 daddr, int len)
771{
772 struct udphdr *uh = udp_hdr(skb);
773
774 if (nocheck)
775 uh->check = 0;
776 else if (skb_is_gso(skb))
777 uh->check = ~udp_v4_check(len, saddr, daddr, 0);
778 else if (skb_dst(skb) && skb_dst(skb)->dev &&
779 (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
780
781 BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
782
783 skb->ip_summed = CHECKSUM_PARTIAL;
784 skb->csum_start = skb_transport_header(skb) - skb->head;
785 skb->csum_offset = offsetof(struct udphdr, check);
786 uh->check = ~udp_v4_check(len, saddr, daddr, 0);
787 } else {
788 __wsum csum;
789
790 BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
791
792 uh->check = 0;
793 csum = skb_checksum(skb, 0, len, 0);
794 uh->check = udp_v4_check(len, saddr, daddr, csum);
795 if (uh->check == 0)
796 uh->check = CSUM_MANGLED_0;
797
798 skb->ip_summed = CHECKSUM_UNNECESSARY;
799 }
800}
801EXPORT_SYMBOL(udp_set_csum);
802
765static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) 803static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
766{ 804{
767 struct sock *sk = skb->sk; 805 struct sock *sk = skb->sk;
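A hedged sketch of how a UDP tunnel transmit path might use the new helper; the function and variable names here are hypothetical:

static void example_tunnel_xmit_csum(struct sk_buff *skb, __be32 saddr,
				     __be32 daddr, bool nocheck)
{
	int len = skb->len - skb_transport_offset(skb);

	udp_hdr(skb)->len = htons(len);
	/* Picks zero checksum, hardware offload, or a software sum. */
	udp_set_csum(nocheck, skb, saddr, daddr, len);
}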
@@ -785,7 +823,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
785 if (is_udplite) /* UDP-Lite */ 823 if (is_udplite) /* UDP-Lite */
786 csum = udplite_csum(skb); 824 csum = udplite_csum(skb);
787 825
788 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ 826 else if (sk->sk_no_check_tx) { /* UDP csum disabled */
789 827
790 skb->ip_summed = CHECKSUM_NONE; 828 skb->ip_summed = CHECKSUM_NONE;
791 goto send; 829 goto send;
@@ -1495,6 +1533,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1495 if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { 1533 if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) {
1496 int ret; 1534 int ret;
1497 1535
1536 /* Verify checksum before giving to encap */
1537 if (udp_lib_checksum_complete(skb))
1538 goto csum_error;
1539
1498 ret = encap_rcv(sk, skb); 1540 ret = encap_rcv(sk, skb);
1499 if (ret <= 0) { 1541 if (ret <= 0) {
1500 UDP_INC_STATS_BH(sock_net(sk), 1542 UDP_INC_STATS_BH(sock_net(sk),
@@ -1672,7 +1714,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1672static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, 1714static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1673 int proto) 1715 int proto)
1674{ 1716{
1675 const struct iphdr *iph;
1676 int err; 1717 int err;
1677 1718
1678 UDP_SKB_CB(skb)->partial_cov = 0; 1719 UDP_SKB_CB(skb)->partial_cov = 0;
@@ -1684,22 +1725,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1684 return err; 1725 return err;
1685 } 1726 }
1686 1727
1687 iph = ip_hdr(skb); 1728 return skb_checksum_init_zero_check(skb, proto, uh->check,
1688 if (uh->check == 0) { 1729 inet_compute_pseudo);
1689 skb->ip_summed = CHECKSUM_UNNECESSARY;
1690 } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
1691 if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
1692 proto, skb->csum))
1693 skb->ip_summed = CHECKSUM_UNNECESSARY;
1694 }
1695 if (!skb_csum_unnecessary(skb))
1696 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1697 skb->len, proto, 0);
1698 /* Probably, we should checksum udp header (it should be in cache
1699 * in any case) and data in tiny packets (< rx copybreak).
1700 */
1701
1702 return 0;
1703} 1730}
1704 1731
1705/* 1732/*
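skb_checksum_init_zero_check() centralizes what the removed block did by hand: treat uh->check == 0 as "no checksum" for UDP over IPv4, accept an already-verified CHECKSUM_COMPLETE value against the pseudo-header, and otherwise seed skb->csum from inet_compute_pseudo() so the later full verification can finish the job.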
@@ -1886,7 +1913,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
1886 unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); 1913 unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
1887 unsigned int slot2 = hash2 & udp_table.mask; 1914 unsigned int slot2 = hash2 & udp_table.mask;
1888 struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; 1915 struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
1889 INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr) 1916 INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
1890 const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); 1917 const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
1891 1918
1892 rcu_read_lock(); 1919 rcu_read_lock();
@@ -1979,7 +2006,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1979 int (*push_pending_frames)(struct sock *)) 2006 int (*push_pending_frames)(struct sock *))
1980{ 2007{
1981 struct udp_sock *up = udp_sk(sk); 2008 struct udp_sock *up = udp_sk(sk);
1982 int val; 2009 int val, valbool;
1983 int err = 0; 2010 int err = 0;
1984 int is_udplite = IS_UDPLITE(sk); 2011 int is_udplite = IS_UDPLITE(sk);
1985 2012
@@ -1989,6 +2016,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1989 if (get_user(val, (int __user *)optval)) 2016 if (get_user(val, (int __user *)optval))
1990 return -EFAULT; 2017 return -EFAULT;
1991 2018
2019 valbool = val ? 1 : 0;
2020
1992 switch (optname) { 2021 switch (optname) {
1993 case UDP_CORK: 2022 case UDP_CORK:
1994 if (val != 0) { 2023 if (val != 0) {
@@ -2018,6 +2047,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
2018 } 2047 }
2019 break; 2048 break;
2020 2049
2050 case UDP_NO_CHECK6_TX:
2051 up->no_check6_tx = valbool;
2052 break;
2053
2054 case UDP_NO_CHECK6_RX:
2055 up->no_check6_rx = valbool;
2056 break;
2057
2021 /* 2058 /*
2022 * UDP-Lite's partial checksum coverage (RFC 3828). 2059 * UDP-Lite's partial checksum coverage (RFC 3828).
2023 */ 2060 */
@@ -2100,6 +2137,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
2100 val = up->encap_type; 2137 val = up->encap_type;
2101 break; 2138 break;
2102 2139
2140 case UDP_NO_CHECK6_TX:
2141 val = up->no_check6_tx;
2142 break;
2143
2144 case UDP_NO_CHECK6_RX:
2145 val = up->no_check6_rx;
2146 break;
2147
2103 /* The following two cannot be changed on UDP sockets, the return is 2148 /* The following two cannot be changed on UDP sockets, the return is
2104 * always 0 (which corresponds to the full checksum coverage of UDP). */ 2149 * always 0 (which corresponds to the full checksum coverage of UDP). */
2105 case UDPLITE_SEND_CSCOV: 2150 case UDPLITE_SEND_CSCOV:
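The new options are readable back via getsockopt(), mirroring the setsockopt handler above. A hedged userspace sketch of switching transmit checksums off on a UDP tunnel socket; the fallback #define mirrors include/uapi/linux/udp.h as of this series:

#include <sys/socket.h>
#include <netinet/in.h>

#ifndef UDP_NO_CHECK6_TX
#define UDP_NO_CHECK6_TX 101	/* per include/uapi/linux/udp.h */
#endif

int disable_udp6_tx_csum(int fd)
{
	int one = 1;

	/* Any nonzero value sets the flag (valbool in the handler). */
	return setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_TX,
			  &one, sizeof(one));
}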
@@ -2484,7 +2529,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2484 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); 2529 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
2485 __be16 protocol = skb->protocol; 2530 __be16 protocol = skb->protocol;
2486 netdev_features_t enc_features; 2531 netdev_features_t enc_features;
2487 int outer_hlen; 2532 int udp_offset, outer_hlen;
2533 unsigned int oldlen;
2534 bool need_csum;
2535
2536 oldlen = (u16)~skb->len;
2488 2537
2489 if (unlikely(!pskb_may_pull(skb, tnl_hlen))) 2538 if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
2490 goto out; 2539 goto out;
@@ -2496,6 +2545,10 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2496 skb->mac_len = skb_inner_network_offset(skb); 2545 skb->mac_len = skb_inner_network_offset(skb);
2497 skb->protocol = htons(ETH_P_TEB); 2546 skb->protocol = htons(ETH_P_TEB);
2498 2547
2548 need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
2549 if (need_csum)
2550 skb->encap_hdr_csum = 1;
2551
2499 /* segment inner packet. */ 2552 /* segment inner packet. */
2500 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); 2553 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
2501 segs = skb_mac_gso_segment(skb, enc_features); 2554 segs = skb_mac_gso_segment(skb, enc_features);
@@ -2506,10 +2559,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2506 } 2559 }
2507 2560
2508 outer_hlen = skb_tnl_header_len(skb); 2561 outer_hlen = skb_tnl_header_len(skb);
2562 udp_offset = outer_hlen - tnl_hlen;
2509 skb = segs; 2563 skb = segs;
2510 do { 2564 do {
2511 struct udphdr *uh; 2565 struct udphdr *uh;
2512 int udp_offset = outer_hlen - tnl_hlen; 2566 int len;
2513 2567
2514 skb_reset_inner_headers(skb); 2568 skb_reset_inner_headers(skb);
2515 skb->encapsulation = 1; 2569 skb->encapsulation = 1;
@@ -2520,31 +2574,20 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2520 skb_reset_mac_header(skb); 2574 skb_reset_mac_header(skb);
2521 skb_set_network_header(skb, mac_len); 2575 skb_set_network_header(skb, mac_len);
2522 skb_set_transport_header(skb, udp_offset); 2576 skb_set_transport_header(skb, udp_offset);
2577 len = skb->len - udp_offset;
2523 uh = udp_hdr(skb); 2578 uh = udp_hdr(skb);
2524 uh->len = htons(skb->len - udp_offset); 2579 uh->len = htons(len);
2525
2526 /* csum segment if tunnel sets skb with csum. */
2527 if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) {
2528 struct iphdr *iph = ip_hdr(skb);
2529 2580
2530 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, 2581 if (need_csum) {
2531 skb->len - udp_offset, 2582 __be32 delta = htonl(oldlen + len);
2532 IPPROTO_UDP, 0);
2533 uh->check = csum_fold(skb_checksum(skb, udp_offset,
2534 skb->len - udp_offset, 0));
2535 if (uh->check == 0)
2536 uh->check = CSUM_MANGLED_0;
2537 2583
2538 } else if (protocol == htons(ETH_P_IPV6)) { 2584 uh->check = ~csum_fold((__force __wsum)
2539 struct ipv6hdr *ipv6h = ipv6_hdr(skb); 2585 ((__force u32)uh->check +
2540 u32 len = skb->len - udp_offset; 2586 (__force u32)delta));
2587 uh->check = gso_make_checksum(skb, ~uh->check);
2541 2588
2542 uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
2543 len, IPPROTO_UDP, 0);
2544 uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
2545 if (uh->check == 0) 2589 if (uh->check == 0)
2546 uh->check = CSUM_MANGLED_0; 2590 uh->check = CSUM_MANGLED_0;
2547 skb->ip_summed = CHECKSUM_NONE;
2548 } 2591 }
2549 2592
2550 skb->protocol = protocol; 2593 skb->protocol = protocol;
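The delta update above avoids re-summing each segment's payload: oldlen is the ones'-complement of the pre-segmentation length, so adding oldlen + len to the checksum swaps the old length contribution for the new one. The same incremental-update identity (RFC 1624) in standalone C, for illustration only:

#include <stdint.h>

static uint16_t csum_update16(uint16_t check, uint16_t old16, uint16_t new16)
{
	/* HC' = ~(~HC + ~m + m'), computed with end-around carry folds. */
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old16 + new16;
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}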
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 88b4023ecfcf..546d2d439dda 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -56,7 +56,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
56 __wsum csum; 56 __wsum csum;
57 57
58 if (skb->encapsulation && 58 if (skb->encapsulation &&
59 skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) { 59 (skb_shinfo(skb)->gso_type &
60 (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
60 segs = skb_udp_tunnel_segment(skb, features); 61 segs = skb_udp_tunnel_segment(skb, features);
61 goto out; 62 goto out;
62 } 63 }
@@ -71,8 +72,10 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
71 72
72 if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | 73 if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
73 SKB_GSO_UDP_TUNNEL | 74 SKB_GSO_UDP_TUNNEL |
75 SKB_GSO_UDP_TUNNEL_CSUM |
74 SKB_GSO_IPIP | 76 SKB_GSO_IPIP |
75 SKB_GSO_GRE | SKB_GSO_MPLS) || 77 SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
78 SKB_GSO_MPLS) ||
76 !(type & (SKB_GSO_UDP)))) 79 !(type & (SKB_GSO_UDP))))
77 goto out; 80 goto out;
78 81
@@ -197,6 +200,7 @@ unflush:
197 } 200 }
198 201
199 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ 202 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
203 skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
200 pp = uo_priv->offload->callbacks.gro_receive(head, skb); 204 pp = uo_priv->offload->callbacks.gro_receive(head, skb);
201 205
202out_unlock: 206out_unlock:
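For CHECKSUM_COMPLETE packets, skb_gro_postpull_rcsum() subtracts the pulled UDP header bytes from skb->csum, so the inner protocol's gro_receive callback still sees a running checksum that covers exactly the data it is handed.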
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 2c46acd4cc36..3b3efbda48e1 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = {
70 .protocol = IPPROTO_UDPLITE, 70 .protocol = IPPROTO_UDPLITE,
71 .prot = &udplite_prot, 71 .prot = &udplite_prot,
72 .ops = &inet_dgram_ops, 72 .ops = &inet_dgram_ops,
73 .no_check = 0, /* must checksum (RFC 3828) */
74 .flags = INET_PROTOSW_PERMANENT, 73 .flags = INET_PROTOSW_PERMANENT,
75}; 74};
76 75
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 05f2b484954f..91771a7c802f 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -58,12 +58,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
58 58
59 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 59 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
60 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); 60 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
61 ip_select_ident(skb, dst->child, NULL);
62 61
63 top_iph->ttl = ip4_dst_hoplimit(dst->child); 62 top_iph->ttl = ip4_dst_hoplimit(dst->child);
64 63
65 top_iph->saddr = x->props.saddr.a4; 64 top_iph->saddr = x->props.saddr.a4;
66 top_iph->daddr = x->id.daddr.a4; 65 top_iph->daddr = x->id.daddr.a4;
66 ip_select_ident(skb, NULL);
67 67
68 return 0; 68 return 0;
69} 69}
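Moving ip_select_ident() below the address assignments is not just the signature change: the new IP ID generator derives its slot from fields of the finished header (notably the destination address), so it must run after top_iph->saddr/daddr are written, and the dst argument is gone because the helper now takes what it needs from the skb itself.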
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 186a8ecf92fa..d5f6bd9a210a 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
25 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) 25 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
26 goto out; 26 goto out;
27 27
28 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) 28 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)
29 goto out; 29 goto out;
30 30
31 mtu = dst_mtu(skb_dst(skb)); 31 mtu = dst_mtu(skb_dst(skb));