author     Linus Torvalds <torvalds@linux-foundation.org>  2014-06-12 17:27:40 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-06-12 17:27:40 -0400
commit     f9da455b93f6ba076935b4ef4589f61e529ae046 (patch)
tree       3c4e69ce1ba1d6bf65915b97a76ca2172105b278 /net/ipv4
parent     0e04c641b199435f3779454055f6a7de258ecdfc (diff)
parent     e5eca6d41f53db48edd8cf88a3f59d2c30227f8e (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:

 1) Seccomp BPF filters can now be JIT'd, from Alexei Starovoitov.

 2) Multiqueue support in xen-netback and xen-netfront, from Andrew J Benniston.

 3) Allow tweaking of aggregation settings in cdc_ncm driver, from Bjørn Mork.

 4) BPF now has a "random" opcode, from Chema Gonzalez.

 5) Add more BPF documentation and improve test framework, from Daniel Borkmann.

 6) Support TCP fastopen over ipv6, from Daniel Lee.

 7) Add software TSO helper functions and use them to support software TSO in mvneta and mv643xx_eth drivers. From Ezequiel Garcia.

 8) Support software TSO in fec driver too, from Nimrod Andy.

 9) Add Broadcom SYSTEMPORT driver, from Florian Fainelli.

10) Handle broadcasts more gracefully over macvlan when there are large numbers of interfaces configured, from Herbert Xu.

11) Allow more control over fwmark used for non-socket based responses, from Lorenzo Colitti.

12) Do TCP congestion window limiting based upon measurements, from Neal Cardwell.

13) Support busy polling in SCTP, from Neal Horman.

14) Allow RSS key to be configured via ethtool, from Venkata Duvvuru.

15) Bridge promisc mode handling improvements from Vlad Yasevich.

16) Don't use inetpeer entries to implement ID generation any more, it performs poorly, from Eric Dumazet.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1522 commits)
  rtnetlink: fix userspace API breakage for iproute2 < v3.9.0
  tcp: fixing TLP's FIN recovery
  net: fec: Add software TSO support
  net: fec: Add Scatter/gather support
  net: fec: Increase buffer descriptor entry number
  net: fec: Factorize feature setting
  net: fec: Enable IP header hardware checksum
  net: fec: Factorize the .xmit transmit function
  bridge: fix compile error when compiling without IPv6 support
  bridge: fix smatch warning / potential null pointer dereference
  via-rhine: fix full-duplex with autoneg disable
  bnx2x: Enlarge the dorq threshold for VFs
  bnx2x: Check for UNDI in uncommon branch
  bnx2x: Fix 1G-baseT link
  bnx2x: Fix link for KR with swapped polarity lane
  sctp: Fix sk_ack_backlog wrap-around problem
  net/core: Add VF link state control policy
  net/fsl: xgmac_mdio is dependent on OF_MDIO
  net/fsl: Make xgmac_mdio read error message useful
  net_sched: drr: warn when qdisc is not work conserving
  ...
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c | 110
-rw-r--r--  net/ipv4/datagram.c | 20
-rw-r--r--  net/ipv4/devinet.c | 9
-rw-r--r--  net/ipv4/gre_demux.c | 27
-rw-r--r--  net/ipv4/gre_offload.c | 16
-rw-r--r--  net/ipv4/icmp.c | 23
-rw-r--r--  net/ipv4/igmp.c | 16
-rw-r--r--  net/ipv4/inet_connection_sock.c | 11
-rw-r--r--  net/ipv4/inet_hashtables.c | 6
-rw-r--r--  net/ipv4/inetpeer.c | 20
-rw-r--r--  net/ipv4/ip_forward.c | 2
-rw-r--r--  net/ipv4/ip_gre.c | 7
-rw-r--r--  net/ipv4/ip_options.c | 6
-rw-r--r--  net/ipv4/ip_output.c | 22
-rw-r--r--  net/ipv4/ip_tunnel.c | 25
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 10
-rw-r--r--  net/ipv4/ip_vti.c | 8
-rw-r--r--  net/ipv4/ipip.c | 5
-rw-r--r--  net/ipv4/ipmr.c | 4
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c | 14
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 2
-rw-r--r--  net/ipv4/netfilter/nft_chain_nat_ipv4.c | 12
-rw-r--r--  net/ipv4/proc.c | 24
-rw-r--r--  net/ipv4/raw.c | 2
-rw-r--r--  net/ipv4/route.c | 52
-rw-r--r--  net/ipv4/syncookies.c | 3
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 45
-rw-r--r--  net/ipv4/tcp.c | 8
-rw-r--r--  net/ipv4/tcp_bic.c | 5
-rw-r--r--  net/ipv4/tcp_cong.c | 24
-rw-r--r--  net/ipv4/tcp_cubic.c | 5
-rw-r--r--  net/ipv4/tcp_fastopen.c | 219
-rw-r--r--  net/ipv4/tcp_highspeed.c | 4
-rw-r--r--  net/ipv4/tcp_htcp.c | 4
-rw-r--r--  net/ipv4/tcp_hybla.c | 7
-rw-r--r--  net/ipv4/tcp_illinois.c | 5
-rw-r--r--  net/ipv4/tcp_input.c | 36
-rw-r--r--  net/ipv4/tcp_ipv4.c | 303
-rw-r--r--  net/ipv4/tcp_lp.c | 5
-rw-r--r--  net/ipv4/tcp_metrics.c | 5
-rw-r--r--  net/ipv4/tcp_minisocks.c | 31
-rw-r--r--  net/ipv4/tcp_offload.c | 9
-rw-r--r--  net/ipv4/tcp_output.c | 126
-rw-r--r--  net/ipv4/tcp_scalable.c | 5
-rw-r--r--  net/ipv4/tcp_vegas.c | 7
-rw-r--r--  net/ipv4/tcp_veno.c | 9
-rw-r--r--  net/ipv4/tcp_yeah.c | 5
-rw-r--r--  net/ipv4/udp.c | 135
-rw-r--r--  net/ipv4/udp_offload.c | 8
-rw-r--r--  net/ipv4/udplite.c | 1
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 2
-rw-r--r--  net/ipv4/xfrm4_output.c | 2
52 files changed, 715 insertions, 756 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6d6dd345bc4d..d5e6836cf772 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -254,7 +254,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
254 struct inet_sock *inet; 254 struct inet_sock *inet;
255 struct proto *answer_prot; 255 struct proto *answer_prot;
256 unsigned char answer_flags; 256 unsigned char answer_flags;
257 char answer_no_check;
258 int try_loading_module = 0; 257 int try_loading_module = 0;
259 int err; 258 int err;
260 259
@@ -312,7 +311,6 @@ lookup_protocol:
312 311
313 sock->ops = answer->ops; 312 sock->ops = answer->ops;
314 answer_prot = answer->prot; 313 answer_prot = answer->prot;
315 answer_no_check = answer->no_check;
316 answer_flags = answer->flags; 314 answer_flags = answer->flags;
317 rcu_read_unlock(); 315 rcu_read_unlock();
318 316
@@ -324,7 +322,6 @@ lookup_protocol:
324 goto out; 322 goto out;
325 323
326 err = 0; 324 err = 0;
327 sk->sk_no_check = answer_no_check;
328 if (INET_PROTOSW_REUSE & answer_flags) 325 if (INET_PROTOSW_REUSE & answer_flags)
329 sk->sk_reuse = SK_CAN_REUSE; 326 sk->sk_reuse = SK_CAN_REUSE;
330 327
@@ -1002,7 +999,6 @@ static struct inet_protosw inetsw_array[] =
1002 .protocol = IPPROTO_TCP, 999 .protocol = IPPROTO_TCP,
1003 .prot = &tcp_prot, 1000 .prot = &tcp_prot,
1004 .ops = &inet_stream_ops, 1001 .ops = &inet_stream_ops,
1005 .no_check = 0,
1006 .flags = INET_PROTOSW_PERMANENT | 1002 .flags = INET_PROTOSW_PERMANENT |
1007 INET_PROTOSW_ICSK, 1003 INET_PROTOSW_ICSK,
1008 }, 1004 },
@@ -1012,7 +1008,6 @@ static struct inet_protosw inetsw_array[] =
1012 .protocol = IPPROTO_UDP, 1008 .protocol = IPPROTO_UDP,
1013 .prot = &udp_prot, 1009 .prot = &udp_prot,
1014 .ops = &inet_dgram_ops, 1010 .ops = &inet_dgram_ops,
1015 .no_check = UDP_CSUM_DEFAULT,
1016 .flags = INET_PROTOSW_PERMANENT, 1011 .flags = INET_PROTOSW_PERMANENT,
1017 }, 1012 },
1018 1013
@@ -1021,7 +1016,6 @@ static struct inet_protosw inetsw_array[] =
1021 .protocol = IPPROTO_ICMP, 1016 .protocol = IPPROTO_ICMP,
1022 .prot = &ping_prot, 1017 .prot = &ping_prot,
1023 .ops = &inet_dgram_ops, 1018 .ops = &inet_dgram_ops,
1024 .no_check = UDP_CSUM_DEFAULT,
1025 .flags = INET_PROTOSW_REUSE, 1019 .flags = INET_PROTOSW_REUSE,
1026 }, 1020 },
1027 1021
@@ -1030,7 +1024,6 @@ static struct inet_protosw inetsw_array[] =
1030 .protocol = IPPROTO_IP, /* wild card */ 1024 .protocol = IPPROTO_IP, /* wild card */
1031 .prot = &raw_prot, 1025 .prot = &raw_prot,
1032 .ops = &inet_sockraw_ops, 1026 .ops = &inet_sockraw_ops,
1033 .no_check = UDP_CSUM_DEFAULT,
1034 .flags = INET_PROTOSW_REUSE, 1027 .flags = INET_PROTOSW_REUSE,
1035 } 1028 }
1036}; 1029};
@@ -1261,10 +1254,12 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1261 SKB_GSO_DODGY | 1254 SKB_GSO_DODGY |
1262 SKB_GSO_TCP_ECN | 1255 SKB_GSO_TCP_ECN |
1263 SKB_GSO_GRE | 1256 SKB_GSO_GRE |
1257 SKB_GSO_GRE_CSUM |
1264 SKB_GSO_IPIP | 1258 SKB_GSO_IPIP |
1265 SKB_GSO_SIT | 1259 SKB_GSO_SIT |
1266 SKB_GSO_TCPV6 | 1260 SKB_GSO_TCPV6 |
1267 SKB_GSO_UDP_TUNNEL | 1261 SKB_GSO_UDP_TUNNEL |
1262 SKB_GSO_UDP_TUNNEL_CSUM |
1268 SKB_GSO_MPLS | 1263 SKB_GSO_MPLS |
1269 0))) 1264 0)))
1270 goto out; 1265 goto out;
@@ -1476,22 +1471,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
1476} 1471}
1477EXPORT_SYMBOL_GPL(inet_ctl_sock_create); 1472EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
1478 1473
1479unsigned long snmp_fold_field(void __percpu *mib[], int offt) 1474unsigned long snmp_fold_field(void __percpu *mib, int offt)
1480{ 1475{
1481 unsigned long res = 0; 1476 unsigned long res = 0;
1482 int i, j; 1477 int i;
1483 1478
1484 for_each_possible_cpu(i) { 1479 for_each_possible_cpu(i)
1485 for (j = 0; j < SNMP_ARRAY_SZ; j++) 1480 res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
1486 res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt);
1487 }
1488 return res; 1481 return res;
1489} 1482}
1490EXPORT_SYMBOL_GPL(snmp_fold_field); 1483EXPORT_SYMBOL_GPL(snmp_fold_field);
1491 1484
1492#if BITS_PER_LONG==32 1485#if BITS_PER_LONG==32
1493 1486
1494u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) 1487u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
1495{ 1488{
1496 u64 res = 0; 1489 u64 res = 0;
1497 int cpu; 1490 int cpu;
@@ -1502,7 +1495,7 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
1502 u64 v; 1495 u64 v;
1503 unsigned int start; 1496 unsigned int start;
1504 1497
1505 bhptr = per_cpu_ptr(mib[0], cpu); 1498 bhptr = per_cpu_ptr(mib, cpu);
1506 syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); 1499 syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
1507 do { 1500 do {
1508 start = u64_stats_fetch_begin_irq(syncp); 1501 start = u64_stats_fetch_begin_irq(syncp);
@@ -1516,25 +1509,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
1516EXPORT_SYMBOL_GPL(snmp_fold_field64); 1509EXPORT_SYMBOL_GPL(snmp_fold_field64);
1517#endif 1510#endif
1518 1511
1519int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
1520{
1521 BUG_ON(ptr == NULL);
1522 ptr[0] = __alloc_percpu(mibsize, align);
1523 if (!ptr[0])
1524 return -ENOMEM;
1525
1526#if SNMP_ARRAY_SZ == 2
1527 ptr[1] = __alloc_percpu(mibsize, align);
1528 if (!ptr[1]) {
1529 free_percpu(ptr[0]);
1530 ptr[0] = NULL;
1531 return -ENOMEM;
1532 }
1533#endif
1534 return 0;
1535}
1536EXPORT_SYMBOL_GPL(snmp_mib_init);
1537
1538#ifdef CONFIG_IP_MULTICAST 1512#ifdef CONFIG_IP_MULTICAST
1539static const struct net_protocol igmp_protocol = { 1513static const struct net_protocol igmp_protocol = {
1540 .handler = igmp_rcv, 1514 .handler = igmp_rcv,
@@ -1570,40 +1544,30 @@ static __net_init int ipv4_mib_init_net(struct net *net)
1570{ 1544{
1571 int i; 1545 int i;
1572 1546
1573 if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, 1547 net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
1574 sizeof(struct tcp_mib), 1548 if (!net->mib.tcp_statistics)
1575 __alignof__(struct tcp_mib)) < 0)
1576 goto err_tcp_mib; 1549 goto err_tcp_mib;
1577 if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, 1550 net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
1578 sizeof(struct ipstats_mib), 1551 if (!net->mib.ip_statistics)
1579 __alignof__(struct ipstats_mib)) < 0)
1580 goto err_ip_mib; 1552 goto err_ip_mib;
1581 1553
1582 for_each_possible_cpu(i) { 1554 for_each_possible_cpu(i) {
1583 struct ipstats_mib *af_inet_stats; 1555 struct ipstats_mib *af_inet_stats;
1584 af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i); 1556 af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
1585 u64_stats_init(&af_inet_stats->syncp); 1557 u64_stats_init(&af_inet_stats->syncp);
1586#if SNMP_ARRAY_SZ == 2
1587 af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i);
1588 u64_stats_init(&af_inet_stats->syncp);
1589#endif
1590 } 1558 }
1591 1559
1592 if (snmp_mib_init((void __percpu **)net->mib.net_statistics, 1560 net->mib.net_statistics = alloc_percpu(struct linux_mib);
1593 sizeof(struct linux_mib), 1561 if (!net->mib.net_statistics)
1594 __alignof__(struct linux_mib)) < 0)
1595 goto err_net_mib; 1562 goto err_net_mib;
1596 if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, 1563 net->mib.udp_statistics = alloc_percpu(struct udp_mib);
1597 sizeof(struct udp_mib), 1564 if (!net->mib.udp_statistics)
1598 __alignof__(struct udp_mib)) < 0)
1599 goto err_udp_mib; 1565 goto err_udp_mib;
1600 if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, 1566 net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
1601 sizeof(struct udp_mib), 1567 if (!net->mib.udplite_statistics)
1602 __alignof__(struct udp_mib)) < 0)
1603 goto err_udplite_mib; 1568 goto err_udplite_mib;
1604 if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, 1569 net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
1605 sizeof(struct icmp_mib), 1570 if (!net->mib.icmp_statistics)
1606 __alignof__(struct icmp_mib)) < 0)
1607 goto err_icmp_mib; 1571 goto err_icmp_mib;
1608 net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), 1572 net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
1609 GFP_KERNEL); 1573 GFP_KERNEL);
@@ -1614,17 +1578,17 @@ static __net_init int ipv4_mib_init_net(struct net *net)
1614 return 0; 1578 return 0;
1615 1579
1616err_icmpmsg_mib: 1580err_icmpmsg_mib:
1617 snmp_mib_free((void __percpu **)net->mib.icmp_statistics); 1581 free_percpu(net->mib.icmp_statistics);
1618err_icmp_mib: 1582err_icmp_mib:
1619 snmp_mib_free((void __percpu **)net->mib.udplite_statistics); 1583 free_percpu(net->mib.udplite_statistics);
1620err_udplite_mib: 1584err_udplite_mib:
1621 snmp_mib_free((void __percpu **)net->mib.udp_statistics); 1585 free_percpu(net->mib.udp_statistics);
1622err_udp_mib: 1586err_udp_mib:
1623 snmp_mib_free((void __percpu **)net->mib.net_statistics); 1587 free_percpu(net->mib.net_statistics);
1624err_net_mib: 1588err_net_mib:
1625 snmp_mib_free((void __percpu **)net->mib.ip_statistics); 1589 free_percpu(net->mib.ip_statistics);
1626err_ip_mib: 1590err_ip_mib:
1627 snmp_mib_free((void __percpu **)net->mib.tcp_statistics); 1591 free_percpu(net->mib.tcp_statistics);
1628err_tcp_mib: 1592err_tcp_mib:
1629 return -ENOMEM; 1593 return -ENOMEM;
1630} 1594}
@@ -1632,12 +1596,12 @@ err_tcp_mib:
1632static __net_exit void ipv4_mib_exit_net(struct net *net) 1596static __net_exit void ipv4_mib_exit_net(struct net *net)
1633{ 1597{
1634 kfree(net->mib.icmpmsg_statistics); 1598 kfree(net->mib.icmpmsg_statistics);
1635 snmp_mib_free((void __percpu **)net->mib.icmp_statistics); 1599 free_percpu(net->mib.icmp_statistics);
1636 snmp_mib_free((void __percpu **)net->mib.udplite_statistics); 1600 free_percpu(net->mib.udplite_statistics);
1637 snmp_mib_free((void __percpu **)net->mib.udp_statistics); 1601 free_percpu(net->mib.udp_statistics);
1638 snmp_mib_free((void __percpu **)net->mib.net_statistics); 1602 free_percpu(net->mib.net_statistics);
1639 snmp_mib_free((void __percpu **)net->mib.ip_statistics); 1603 free_percpu(net->mib.ip_statistics);
1640 snmp_mib_free((void __percpu **)net->mib.tcp_statistics); 1604 free_percpu(net->mib.tcp_statistics);
1641} 1605}
1642 1606
1643static __net_initdata struct pernet_operations ipv4_mib_ops = { 1607static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1736,13 +1700,9 @@ static int __init inet_init(void)
1736 1700
1737 BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); 1701 BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
1738 1702
1739 sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
1740 if (!sysctl_local_reserved_ports)
1741 goto out;
1742
1743 rc = proto_register(&tcp_prot, 1); 1703 rc = proto_register(&tcp_prot, 1);
1744 if (rc) 1704 if (rc)
1745 goto out_free_reserved_ports; 1705 goto out;
1746 1706
1747 rc = proto_register(&udp_prot, 1); 1707 rc = proto_register(&udp_prot, 1);
1748 if (rc) 1708 if (rc)
@@ -1852,8 +1812,6 @@ out_unregister_udp_proto:
1852 proto_unregister(&udp_prot); 1812 proto_unregister(&udp_prot);
1853out_unregister_tcp_proto: 1813out_unregister_tcp_proto:
1854 proto_unregister(&tcp_prot); 1814 proto_unregister(&tcp_prot);
1855out_free_reserved_ports:
1856 kfree(sysctl_local_reserved_ports);
1857 goto out; 1815 goto out;
1858} 1816}
1859 1817
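
The snmp_fold_field() rework above drops the two-element mib[] arrays (SNMP_ARRAY_SZ) in favour of a single alloc_percpu() block that is summed per CPU at a given counter offset. A minimal userspace sketch of that folding pattern, with a plain array standing in for per_cpu_ptr() and the counter's index in longs standing in for the offt argument (an analogy, not kernel code):

#include <stdio.h>
#include <stddef.h>

#define NR_CPUS 4

/* Stand-in for a per-cpu MIB block such as struct udp_mib. */
struct mib {
	unsigned long indatagrams;
	unsigned long outdatagrams;
	unsigned long inerrors;
};

/* One block per CPU; the kernel obtains this from alloc_percpu(). */
static struct mib per_cpu_mib[NR_CPUS];

/* Sum one counter across CPUs; offt counts in unsigned longs, mirroring
 * how the new snmp_fold_field() indexes into each per-cpu block. */
static unsigned long fold_field(struct mib *mibs, int offt)
{
	unsigned long res = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		res += *((unsigned long *)&mibs[cpu] + offt);
	return res;
}

int main(void)
{
	per_cpu_mib[0].inerrors = 2;
	per_cpu_mib[3].inerrors = 5;

	printf("inerrors = %lu\n",
	       fold_field(per_cpu_mib,
			  offsetof(struct mib, inerrors) / sizeof(unsigned long)));
	return 0;
}
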
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 8b5134c582f1..a3095fdefbed 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -86,18 +86,26 @@ out:
86} 86}
87EXPORT_SYMBOL(ip4_datagram_connect); 87EXPORT_SYMBOL(ip4_datagram_connect);
88 88
89/* Because UDP xmit path can manipulate sk_dst_cache without holding
90 * socket lock, we need to use sk_dst_set() here,
91 * even if we own the socket lock.
92 */
89void ip4_datagram_release_cb(struct sock *sk) 93void ip4_datagram_release_cb(struct sock *sk)
90{ 94{
91 const struct inet_sock *inet = inet_sk(sk); 95 const struct inet_sock *inet = inet_sk(sk);
92 const struct ip_options_rcu *inet_opt; 96 const struct ip_options_rcu *inet_opt;
93 __be32 daddr = inet->inet_daddr; 97 __be32 daddr = inet->inet_daddr;
98 struct dst_entry *dst;
94 struct flowi4 fl4; 99 struct flowi4 fl4;
95 struct rtable *rt; 100 struct rtable *rt;
96 101
97 if (! __sk_dst_get(sk) || __sk_dst_check(sk, 0))
98 return;
99
100 rcu_read_lock(); 102 rcu_read_lock();
103
104 dst = __sk_dst_get(sk);
105 if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
106 rcu_read_unlock();
107 return;
108 }
101 inet_opt = rcu_dereference(inet->inet_opt); 109 inet_opt = rcu_dereference(inet->inet_opt);
102 if (inet_opt && inet_opt->opt.srr) 110 if (inet_opt && inet_opt->opt.srr)
103 daddr = inet_opt->opt.faddr; 111 daddr = inet_opt->opt.faddr;
@@ -105,8 +113,10 @@ void ip4_datagram_release_cb(struct sock *sk)
105 inet->inet_saddr, inet->inet_dport, 113 inet->inet_saddr, inet->inet_dport,
106 inet->inet_sport, sk->sk_protocol, 114 inet->inet_sport, sk->sk_protocol,
107 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); 115 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
108 if (!IS_ERR(rt)) 116
109 __sk_dst_set(sk, &rt->dst); 117 dst = !IS_ERR(rt) ? &rt->dst : NULL;
118 sk_dst_set(sk, dst);
119
110 rcu_read_unlock(); 120 rcu_read_unlock();
111} 121}
112EXPORT_SYMBOL_GPL(ip4_datagram_release_cb); 122EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index bdbf68bb2e2d..e9449376b58e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -106,7 +106,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
106#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) 106#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
107 107
108static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; 108static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
109static DEFINE_SPINLOCK(inet_addr_hash_lock);
110 109
111static u32 inet_addr_hash(struct net *net, __be32 addr) 110static u32 inet_addr_hash(struct net *net, __be32 addr)
112{ 111{
@@ -119,16 +118,14 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
119{ 118{
120 u32 hash = inet_addr_hash(net, ifa->ifa_local); 119 u32 hash = inet_addr_hash(net, ifa->ifa_local);
121 120
122 spin_lock(&inet_addr_hash_lock); 121 ASSERT_RTNL();
123 hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); 122 hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
124 spin_unlock(&inet_addr_hash_lock);
125} 123}
126 124
127static void inet_hash_remove(struct in_ifaddr *ifa) 125static void inet_hash_remove(struct in_ifaddr *ifa)
128{ 126{
129 spin_lock(&inet_addr_hash_lock); 127 ASSERT_RTNL();
130 hlist_del_init_rcu(&ifa->hash); 128 hlist_del_init_rcu(&ifa->hash);
131 spin_unlock(&inet_addr_hash_lock);
132} 129}
133 130
134/** 131/**
@@ -830,7 +827,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
830 ifa_existing = find_matching_ifa(ifa); 827 ifa_existing = find_matching_ifa(ifa);
831 if (!ifa_existing) { 828 if (!ifa_existing) {
832 /* It would be best to check for !NLM_F_CREATE here but 829 /* It would be best to check for !NLM_F_CREATE here but
833 * userspace alreay relies on not having to provide this. 830 * userspace already relies on not having to provide this.
834 */ 831 */
835 set_ifa_lifetime(ifa, valid_lft, prefered_lft); 832 set_ifa_lifetime(ifa, valid_lft, prefered_lft);
836 return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); 833 return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 250be7421ab3..4e9619bca732 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -84,7 +84,8 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
84 ptr--; 84 ptr--;
85 } 85 }
86 if (tpi->flags&TUNNEL_CSUM && 86 if (tpi->flags&TUNNEL_CSUM &&
87 !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { 87 !(skb_shinfo(skb)->gso_type &
88 (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
88 *ptr = 0; 89 *ptr = 0;
89 *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, 90 *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
90 skb->len, 0)); 91 skb->len, 0));
@@ -93,28 +94,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
93} 94}
94EXPORT_SYMBOL_GPL(gre_build_header); 95EXPORT_SYMBOL_GPL(gre_build_header);
95 96
96static __sum16 check_checksum(struct sk_buff *skb)
97{
98 __sum16 csum = 0;
99
100 switch (skb->ip_summed) {
101 case CHECKSUM_COMPLETE:
102 csum = csum_fold(skb->csum);
103
104 if (!csum)
105 break;
106 /* Fall through. */
107
108 case CHECKSUM_NONE:
109 skb->csum = 0;
110 csum = __skb_checksum_complete(skb);
111 skb->ip_summed = CHECKSUM_COMPLETE;
112 break;
113 }
114
115 return csum;
116}
117
118static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, 97static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
119 bool *csum_err) 98 bool *csum_err)
120{ 99{
@@ -141,7 +120,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
141 120
142 options = (__be32 *)(greh + 1); 121 options = (__be32 *)(greh + 1);
143 if (greh->flags & GRE_CSUM) { 122 if (greh->flags & GRE_CSUM) {
144 if (check_checksum(skb)) { 123 if (skb_checksum_simple_validate(skb)) {
145 *csum_err = true; 124 *csum_err = true;
146 return -EINVAL; 125 return -EINVAL;
147 } 126 }
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index f1d32280cb54..eb92deb12666 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -42,6 +42,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
42 SKB_GSO_DODGY | 42 SKB_GSO_DODGY |
43 SKB_GSO_TCP_ECN | 43 SKB_GSO_TCP_ECN |
44 SKB_GSO_GRE | 44 SKB_GSO_GRE |
45 SKB_GSO_GRE_CSUM |
45 SKB_GSO_IPIP))) 46 SKB_GSO_IPIP)))
46 goto out; 47 goto out;
47 48
@@ -55,6 +56,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
55 goto out; 56 goto out;
56 57
57 csum = !!(greh->flags & GRE_CSUM); 58 csum = !!(greh->flags & GRE_CSUM);
59 if (csum)
60 skb->encap_hdr_csum = 1;
58 61
59 if (unlikely(!pskb_may_pull(skb, ghl))) 62 if (unlikely(!pskb_may_pull(skb, ghl)))
60 goto out; 63 goto out;
@@ -94,10 +97,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
94 } 97 }
95 } 98 }
96 99
97 greh = (struct gre_base_hdr *)(skb->data); 100 skb_reset_transport_header(skb);
101
102 greh = (struct gre_base_hdr *)
103 skb_transport_header(skb);
98 pcsum = (__be32 *)(greh + 1); 104 pcsum = (__be32 *)(greh + 1);
99 *pcsum = 0; 105 *pcsum = 0;
100 *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0)); 106 *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
101 } 107 }
102 __skb_push(skb, tnl_hlen - ghl); 108 __skb_push(skb, tnl_hlen - ghl);
103 109
@@ -125,10 +131,12 @@ static __sum16 gro_skb_checksum(struct sk_buff *skb)
125 csum_partial(skb->data, skb_gro_offset(skb), 0)); 131 csum_partial(skb->data, skb_gro_offset(skb), 0));
126 sum = csum_fold(NAPI_GRO_CB(skb)->csum); 132 sum = csum_fold(NAPI_GRO_CB(skb)->csum);
127 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { 133 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
128 if (unlikely(!sum)) 134 if (unlikely(!sum) && !skb->csum_complete_sw)
129 netdev_rx_csum_fault(skb->dev); 135 netdev_rx_csum_fault(skb->dev);
130 } else 136 } else {
131 skb->ip_summed = CHECKSUM_COMPLETE; 137 skb->ip_summed = CHECKSUM_COMPLETE;
138 skb->csum_complete_sw = 1;
139 }
132 140
133 return sum; 141 return sum;
134} 142}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0134663fdbce..79c3d947a481 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
337 struct sock *sk; 337 struct sock *sk;
338 struct inet_sock *inet; 338 struct inet_sock *inet;
339 __be32 daddr, saddr; 339 __be32 daddr, saddr;
340 u32 mark = IP4_REPLY_MARK(net, skb->mark);
340 341
341 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) 342 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
342 return; 343 return;
@@ -349,6 +350,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
349 icmp_param->data.icmph.checksum = 0; 350 icmp_param->data.icmph.checksum = 0;
350 351
351 inet->tos = ip_hdr(skb)->tos; 352 inet->tos = ip_hdr(skb)->tos;
353 sk->sk_mark = mark;
352 daddr = ipc.addr = ip_hdr(skb)->saddr; 354 daddr = ipc.addr = ip_hdr(skb)->saddr;
353 saddr = fib_compute_spec_dst(skb); 355 saddr = fib_compute_spec_dst(skb);
354 ipc.opt = NULL; 356 ipc.opt = NULL;
@@ -364,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
364 memset(&fl4, 0, sizeof(fl4)); 366 memset(&fl4, 0, sizeof(fl4));
365 fl4.daddr = daddr; 367 fl4.daddr = daddr;
366 fl4.saddr = saddr; 368 fl4.saddr = saddr;
369 fl4.flowi4_mark = mark;
367 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); 370 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
368 fl4.flowi4_proto = IPPROTO_ICMP; 371 fl4.flowi4_proto = IPPROTO_ICMP;
369 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 372 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -382,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
382 struct flowi4 *fl4, 385 struct flowi4 *fl4,
383 struct sk_buff *skb_in, 386 struct sk_buff *skb_in,
384 const struct iphdr *iph, 387 const struct iphdr *iph,
385 __be32 saddr, u8 tos, 388 __be32 saddr, u8 tos, u32 mark,
386 int type, int code, 389 int type, int code,
387 struct icmp_bxm *param) 390 struct icmp_bxm *param)
388{ 391{
@@ -394,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
394 fl4->daddr = (param->replyopts.opt.opt.srr ? 397 fl4->daddr = (param->replyopts.opt.opt.srr ?
395 param->replyopts.opt.opt.faddr : iph->saddr); 398 param->replyopts.opt.opt.faddr : iph->saddr);
396 fl4->saddr = saddr; 399 fl4->saddr = saddr;
400 fl4->flowi4_mark = mark;
397 fl4->flowi4_tos = RT_TOS(tos); 401 fl4->flowi4_tos = RT_TOS(tos);
398 fl4->flowi4_proto = IPPROTO_ICMP; 402 fl4->flowi4_proto = IPPROTO_ICMP;
399 fl4->fl4_icmp_type = type; 403 fl4->fl4_icmp_type = type;
@@ -491,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
491 struct flowi4 fl4; 495 struct flowi4 fl4;
492 __be32 saddr; 496 __be32 saddr;
493 u8 tos; 497 u8 tos;
498 u32 mark;
494 struct net *net; 499 struct net *net;
495 struct sock *sk; 500 struct sock *sk;
496 501
@@ -592,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
592 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | 597 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
593 IPTOS_PREC_INTERNETCONTROL) : 598 IPTOS_PREC_INTERNETCONTROL) :
594 iph->tos; 599 iph->tos;
600 mark = IP4_REPLY_MARK(net, skb_in->mark);
595 601
596 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) 602 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
597 goto out_unlock; 603 goto out_unlock;
@@ -608,13 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
608 icmp_param->skb = skb_in; 614 icmp_param->skb = skb_in;
609 icmp_param->offset = skb_network_offset(skb_in); 615 icmp_param->offset = skb_network_offset(skb_in);
610 inet_sk(sk)->tos = tos; 616 inet_sk(sk)->tos = tos;
617 sk->sk_mark = mark;
611 ipc.addr = iph->saddr; 618 ipc.addr = iph->saddr;
612 ipc.opt = &icmp_param->replyopts.opt; 619 ipc.opt = &icmp_param->replyopts.opt;
613 ipc.tx_flags = 0; 620 ipc.tx_flags = 0;
614 ipc.ttl = 0; 621 ipc.ttl = 0;
615 ipc.tos = -1; 622 ipc.tos = -1;
616 623
617 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, 624 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
618 type, code, icmp_param); 625 type, code, icmp_param);
619 if (IS_ERR(rt)) 626 if (IS_ERR(rt))
620 goto out_unlock; 627 goto out_unlock;
@@ -908,16 +915,8 @@ int icmp_rcv(struct sk_buff *skb)
908 915
909 ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); 916 ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
910 917
911 switch (skb->ip_summed) { 918 if (skb_checksum_simple_validate(skb))
912 case CHECKSUM_COMPLETE: 919 goto csum_error;
913 if (!csum_fold(skb->csum))
914 break;
915 /* fall through */
916 case CHECKSUM_NONE:
917 skb->csum = 0;
918 if (__skb_checksum_complete(skb))
919 goto csum_error;
920 }
921 920
922 if (!pskb_pull(skb, sizeof(*icmph))) 921 if (!pskb_pull(skb, sizeof(*icmph)))
923 goto error; 922 goto error;
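
The icmp.c hunks above stamp ICMP replies and their route lookups with IP4_REPLY_MARK(net, skb->mark), part of the fwmark-reflect work listed as item 11 of the pull request. The macro itself lives outside this diff (include/net/ip.h); as introduced by that series, it reflects the incoming packet's mark only when the net.ipv4.fwmark_reflect sysctl is enabled and yields 0 otherwise. A standalone paraphrase of that behaviour (not copied from the tree):

#include <stdint.h>
#include <stdio.h>

/* Paraphrase of the IP4_REPLY_MARK() idea: copy the incoming fwmark to
 * the reply only when the per-netns fwmark_reflect knob is enabled. */
static uint32_t reply_mark(int fwmark_reflect_enabled, uint32_t incoming_mark)
{
	return fwmark_reflect_enabled ? incoming_mark : 0;
}

int main(void)
{
	printf("%u\n", reply_mark(0, 0x17));	/* 0: marks are not reflected */
	printf("%u\n", reply_mark(1, 0x17));	/* 23: mark copied to the reply */
	return 0;
}
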
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 97e4d1655d26..6748d420f714 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
369 pip->saddr = fl4.saddr; 369 pip->saddr = fl4.saddr;
370 pip->protocol = IPPROTO_IGMP; 370 pip->protocol = IPPROTO_IGMP;
371 pip->tot_len = 0; /* filled in later */ 371 pip->tot_len = 0; /* filled in later */
372 ip_select_ident(skb, &rt->dst, NULL); 372 ip_select_ident(skb, NULL);
373 ((u8 *)&pip[1])[0] = IPOPT_RA; 373 ((u8 *)&pip[1])[0] = IPOPT_RA;
374 ((u8 *)&pip[1])[1] = 4; 374 ((u8 *)&pip[1])[1] = 4;
375 ((u8 *)&pip[1])[2] = 0; 375 ((u8 *)&pip[1])[2] = 0;
@@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
714 iph->daddr = dst; 714 iph->daddr = dst;
715 iph->saddr = fl4.saddr; 715 iph->saddr = fl4.saddr;
716 iph->protocol = IPPROTO_IGMP; 716 iph->protocol = IPPROTO_IGMP;
717 ip_select_ident(skb, &rt->dst, NULL); 717 ip_select_ident(skb, NULL);
718 ((u8 *)&iph[1])[0] = IPOPT_RA; 718 ((u8 *)&iph[1])[0] = IPOPT_RA;
719 ((u8 *)&iph[1])[1] = 4; 719 ((u8 *)&iph[1])[1] = 4;
720 ((u8 *)&iph[1])[2] = 0; 720 ((u8 *)&iph[1])[2] = 0;
@@ -988,16 +988,8 @@ int igmp_rcv(struct sk_buff *skb)
988 if (!pskb_may_pull(skb, sizeof(struct igmphdr))) 988 if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
989 goto drop; 989 goto drop;
990 990
991 switch (skb->ip_summed) { 991 if (skb_checksum_simple_validate(skb))
992 case CHECKSUM_COMPLETE: 992 goto drop;
993 if (!csum_fold(skb->csum))
994 break;
995 /* fall through */
996 case CHECKSUM_NONE:
997 skb->csum = 0;
998 if (__skb_checksum_complete(skb))
999 goto drop;
1000 }
1001 993
1002 ih = igmp_hdr(skb); 994 ih = igmp_hdr(skb);
1003 switch (ih->type) { 995 switch (ih->type) {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index a56b8e6e866a..14d02ea905b6 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -29,9 +29,6 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
29EXPORT_SYMBOL(inet_csk_timer_bug_msg); 29EXPORT_SYMBOL(inet_csk_timer_bug_msg);
30#endif 30#endif
31 31
32unsigned long *sysctl_local_reserved_ports;
33EXPORT_SYMBOL(sysctl_local_reserved_ports);
34
35void inet_get_local_port_range(struct net *net, int *low, int *high) 32void inet_get_local_port_range(struct net *net, int *low, int *high)
36{ 33{
37 unsigned int seq; 34 unsigned int seq;
@@ -113,7 +110,7 @@ again:
113 110
114 smallest_size = -1; 111 smallest_size = -1;
115 do { 112 do {
116 if (inet_is_reserved_local_port(rover)) 113 if (inet_is_local_reserved_port(net, rover))
117 goto next_nolock; 114 goto next_nolock;
118 head = &hashinfo->bhash[inet_bhashfn(net, rover, 115 head = &hashinfo->bhash[inet_bhashfn(net, rover,
119 hashinfo->bhash_size)]; 116 hashinfo->bhash_size)];
@@ -408,7 +405,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
408 struct net *net = sock_net(sk); 405 struct net *net = sock_net(sk);
409 int flags = inet_sk_flowi_flags(sk); 406 int flags = inet_sk_flowi_flags(sk);
410 407
411 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 408 flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
412 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 409 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
413 sk->sk_protocol, 410 sk->sk_protocol,
414 flags, 411 flags,
@@ -445,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
445 442
446 rcu_read_lock(); 443 rcu_read_lock();
447 opt = rcu_dereference(newinet->inet_opt); 444 opt = rcu_dereference(newinet->inet_opt);
448 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 445 flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
449 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 446 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
450 sk->sk_protocol, inet_sk_flowi_flags(sk), 447 sk->sk_protocol, inet_sk_flowi_flags(sk),
451 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, 448 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
@@ -680,6 +677,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
680 inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); 677 inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
681 newsk->sk_write_space = sk_stream_write_space; 678 newsk->sk_write_space = sk_stream_write_space;
682 679
680 newsk->sk_mark = inet_rsk(req)->ir_mark;
681
683 newicsk->icsk_retransmits = 0; 682 newicsk->icsk_retransmits = 0;
684 newicsk->icsk_backoff = 0; 683 newicsk->icsk_backoff = 0;
685 newicsk->icsk_probes_out = 0; 684 newicsk->icsk_probes_out = 0;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8b9cf279450d..43116e8c8e13 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -274,7 +274,7 @@ struct sock *__inet_lookup_established(struct net *net,
274 const __be32 daddr, const u16 hnum, 274 const __be32 daddr, const u16 hnum,
275 const int dif) 275 const int dif)
276{ 276{
277 INET_ADDR_COOKIE(acookie, saddr, daddr) 277 INET_ADDR_COOKIE(acookie, saddr, daddr);
278 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 278 const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
279 struct sock *sk; 279 struct sock *sk;
280 const struct hlist_nulls_node *node; 280 const struct hlist_nulls_node *node;
@@ -327,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
327 __be32 daddr = inet->inet_rcv_saddr; 327 __be32 daddr = inet->inet_rcv_saddr;
328 __be32 saddr = inet->inet_daddr; 328 __be32 saddr = inet->inet_daddr;
329 int dif = sk->sk_bound_dev_if; 329 int dif = sk->sk_bound_dev_if;
330 INET_ADDR_COOKIE(acookie, saddr, daddr) 330 INET_ADDR_COOKIE(acookie, saddr, daddr);
331 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 331 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
332 struct net *net = sock_net(sk); 332 struct net *net = sock_net(sk);
333 unsigned int hash = inet_ehashfn(net, daddr, lport, 333 unsigned int hash = inet_ehashfn(net, daddr, lport,
@@ -500,7 +500,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
500 local_bh_disable(); 500 local_bh_disable();
501 for (i = 1; i <= remaining; i++) { 501 for (i = 1; i <= remaining; i++) {
502 port = low + (i + offset) % remaining; 502 port = low + (i + offset) % remaining;
503 if (inet_is_reserved_local_port(port)) 503 if (inet_is_local_reserved_port(net, port))
504 continue; 504 continue;
505 head = &hinfo->bhash[inet_bhashfn(net, port, 505 head = &hinfo->bhash[inet_bhashfn(net, port,
506 hinfo->bhash_size)]; 506 hinfo->bhash_size)];
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 56cd458a1b8c..bd5f5928167d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -26,20 +26,7 @@
26 * Theory of operations. 26 * Theory of operations.
27 * We keep one entry for each peer IP address. The nodes contains long-living 27 * We keep one entry for each peer IP address. The nodes contains long-living
28 * information about the peer which doesn't depend on routes. 28 * information about the peer which doesn't depend on routes.
29 * At this moment this information consists only of ID field for the next
30 * outgoing IP packet. This field is incremented with each packet as encoded
31 * in inet_getid() function (include/net/inetpeer.h).
32 * At the moment of writing this notes identifier of IP packets is generated
33 * to be unpredictable using this code only for packets subjected
34 * (actually or potentially) to defragmentation. I.e. DF packets less than
35 * PMTU in size when local fragmentation is disabled use a constant ID and do
36 * not use this code (see ip_select_ident() in include/net/ip.h).
37 * 29 *
38 * Route cache entries hold references to our nodes.
39 * New cache entries get references via lookup by destination IP address in
40 * the avl tree. The reference is grabbed only when it's needed i.e. only
41 * when we try to output IP packet which needs an unpredictable ID (see
42 * __ip_select_ident() in net/ipv4/route.c).
43 * Nodes are removed only when reference counter goes to 0. 30 * Nodes are removed only when reference counter goes to 0.
44 * When it's happened the node may be removed when a sufficient amount of 31 * When it's happened the node may be removed when a sufficient amount of
45 * time has been passed since its last use. The less-recently-used entry can 32 * time has been passed since its last use. The less-recently-used entry can
@@ -62,7 +49,6 @@
62 * refcnt: atomically against modifications on other CPU; 49 * refcnt: atomically against modifications on other CPU;
63 * usually under some other lock to prevent node disappearing 50 * usually under some other lock to prevent node disappearing
64 * daddr: unchangeable 51 * daddr: unchangeable
65 * ip_id_count: atomic value (no lock needed)
66 */ 52 */
67 53
68static struct kmem_cache *peer_cachep __read_mostly; 54static struct kmem_cache *peer_cachep __read_mostly;
@@ -120,7 +106,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
120static void inetpeer_gc_worker(struct work_struct *work) 106static void inetpeer_gc_worker(struct work_struct *work)
121{ 107{
122 struct inet_peer *p, *n, *c; 108 struct inet_peer *p, *n, *c;
123 LIST_HEAD(list); 109 struct list_head list;
124 110
125 spin_lock_bh(&gc_lock); 111 spin_lock_bh(&gc_lock);
126 list_replace_init(&gc_list, &list); 112 list_replace_init(&gc_list, &list);
@@ -497,10 +483,6 @@ relookup:
497 p->daddr = *daddr; 483 p->daddr = *daddr;
498 atomic_set(&p->refcnt, 1); 484 atomic_set(&p->refcnt, 1);
499 atomic_set(&p->rid, 0); 485 atomic_set(&p->rid, 0);
500 atomic_set(&p->ip_id_count,
501 (daddr->family == AF_INET) ?
502 secure_ip_id(daddr->addr.a4) :
503 secure_ipv6_id(daddr->addr.a6));
504 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 486 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
505 p->rate_tokens = 0; 487 p->rate_tokens = 0;
506 /* 60*HZ is arbitrary, but chosen enough high so that the first 488 /* 60*HZ is arbitrary, but chosen enough high so that the first
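
With the hunk above, inetpeer no longer documents or seeds ip_id_count; IP ID selection moves behind the reworked ip_select_ident(skb, sk) and __ip_select_ident(iph, segs) helpers seen throughout this diff (igmp.c, ip_output.c, ip_tunnel_core.c, ipmr.c). Conceptually, the replacement in net/ipv4/route.c keeps a small hashed array of counters and reserves one ID per GSO segment rather than one atomic counter per destination. A stripped-down userspace sketch of that idea follows; the real code uses jhash with a random seed and perturbs the increment, and the bucket count and hash here are invented for illustration:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define IDENT_BUCKETS 2048	/* power of two, illustrative only */

static atomic_uint ident_bucket[IDENT_BUCKETS];

/* Reserve 'segs' consecutive IDs from the bucket picked by a hash of the
 * flow; the caller stamps iph->id with the first value returned. */
static uint16_t ident_reserve(uint32_t daddr, uint32_t saddr, uint8_t proto,
			      unsigned int segs)
{
	uint32_t hash = (daddr ^ saddr ^ proto) & (IDENT_BUCKETS - 1);

	return (uint16_t)atomic_fetch_add(&ident_bucket[hash], segs);
}

int main(void)
{
	/* A GSO packet split into three segments consumes three IDs. */
	printf("id %u\n", ident_reserve(0x0a000001, 0x0a000002, 6, 3));
	printf("id %u\n", ident_reserve(0x0a000001, 0x0a000002, 6, 1));
	return 0;
}
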
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 6f111e48e11c..3a83ce5efa80 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -42,7 +42,7 @@
42static bool ip_may_fragment(const struct sk_buff *skb) 42static bool ip_may_fragment(const struct sk_buff *skb)
43{ 43{
44 return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || 44 return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
45 skb->local_df; 45 skb->ignore_df;
46} 46}
47 47
48static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) 48static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 94213c891565..9b842544aea3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -410,7 +410,7 @@ static int ipgre_open(struct net_device *dev)
410 struct flowi4 fl4; 410 struct flowi4 fl4;
411 struct rtable *rt; 411 struct rtable *rt;
412 412
413 rt = ip_route_output_gre(dev_net(dev), &fl4, 413 rt = ip_route_output_gre(t->net, &fl4,
414 t->parms.iph.daddr, 414 t->parms.iph.daddr,
415 t->parms.iph.saddr, 415 t->parms.iph.saddr,
416 t->parms.o_key, 416 t->parms.o_key,
@@ -434,7 +434,7 @@ static int ipgre_close(struct net_device *dev)
434 434
435 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 435 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
436 struct in_device *in_dev; 436 struct in_device *in_dev;
437 in_dev = inetdev_by_index(dev_net(dev), t->mlink); 437 in_dev = inetdev_by_index(t->net, t->mlink);
438 if (in_dev) 438 if (in_dev)
439 ip_mc_dec_group(in_dev, t->parms.iph.daddr); 439 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
440 } 440 }
@@ -478,7 +478,7 @@ static void __gre_tunnel_init(struct net_device *dev)
478 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 478 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
479 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; 479 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
480 480
481 dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES; 481 dev->features |= GRE_FEATURES;
482 dev->hw_features |= GRE_FEATURES; 482 dev->hw_features |= GRE_FEATURES;
483 483
484 if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { 484 if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
@@ -649,6 +649,7 @@ static void ipgre_tap_setup(struct net_device *dev)
649{ 649{
650 ether_setup(dev); 650 ether_setup(dev);
651 dev->netdev_ops = &gre_tap_netdev_ops; 651 dev->netdev_ops = &gre_tap_netdev_ops;
652 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
652 ip_tunnel_setup(dev, gre_tap_net_id); 653 ip_tunnel_setup(dev, gre_tap_net_id);
653} 654}
654 655
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index f4ab72e19af9..5e7aecea05cd 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -364,7 +364,7 @@ int ip_options_compile(struct net *net,
364 } 364 }
365 if (optptr[2] <= optlen) { 365 if (optptr[2] <= optlen) {
366 unsigned char *timeptr = NULL; 366 unsigned char *timeptr = NULL;
367 if (optptr[2]+3 > optptr[1]) { 367 if (optptr[2]+3 > optlen) {
368 pp_ptr = optptr + 2; 368 pp_ptr = optptr + 2;
369 goto error; 369 goto error;
370 } 370 }
@@ -376,7 +376,7 @@ int ip_options_compile(struct net *net,
376 optptr[2] += 4; 376 optptr[2] += 4;
377 break; 377 break;
378 case IPOPT_TS_TSANDADDR: 378 case IPOPT_TS_TSANDADDR:
379 if (optptr[2]+7 > optptr[1]) { 379 if (optptr[2]+7 > optlen) {
380 pp_ptr = optptr + 2; 380 pp_ptr = optptr + 2;
381 goto error; 381 goto error;
382 } 382 }
@@ -390,7 +390,7 @@ int ip_options_compile(struct net *net,
390 optptr[2] += 8; 390 optptr[2] += 8;
391 break; 391 break;
392 case IPOPT_TS_PRESPEC: 392 case IPOPT_TS_PRESPEC:
393 if (optptr[2]+7 > optptr[1]) { 393 if (optptr[2]+7 > optlen) {
394 pp_ptr = optptr + 2; 394 pp_ptr = optptr + 2;
395 goto error; 395 goto error;
396 } 396 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a52f50187b54..8d3b6b0e9857 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); 148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149 iph->saddr = saddr; 149 iph->saddr = saddr;
150 iph->protocol = sk->sk_protocol; 150 iph->protocol = sk->sk_protocol;
151 ip_select_ident(skb, &rt->dst, sk); 151 ip_select_ident(skb, sk);
152 152
153 if (opt && opt->opt.optlen) { 153 if (opt && opt->opt.optlen) {
154 iph->ihl += opt->opt.optlen>>2; 154 iph->ihl += opt->opt.optlen>>2;
@@ -415,7 +415,7 @@ packet_routed:
415 skb_reset_network_header(skb); 415 skb_reset_network_header(skb);
416 iph = ip_hdr(skb); 416 iph = ip_hdr(skb);
417 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 417 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
418 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) 418 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
419 iph->frag_off = htons(IP_DF); 419 iph->frag_off = htons(IP_DF);
420 else 420 else
421 iph->frag_off = 0; 421 iph->frag_off = 0;
@@ -430,8 +430,7 @@ packet_routed:
430 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); 430 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
431 } 431 }
432 432
433 ip_select_ident_more(skb, &rt->dst, sk, 433 ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
434 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
435 434
436 /* TODO : should we use skb->sk here instead of sk ? */ 435 /* TODO : should we use skb->sk here instead of sk ? */
437 skb->priority = sk->sk_priority; 436 skb->priority = sk->sk_priority;
@@ -501,7 +500,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
501 iph = ip_hdr(skb); 500 iph = ip_hdr(skb);
502 501
503 mtu = ip_skb_dst_mtu(skb); 502 mtu = ip_skb_dst_mtu(skb);
504 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || 503 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
505 (IPCB(skb)->frag_max_size && 504 (IPCB(skb)->frag_max_size &&
506 IPCB(skb)->frag_max_size > mtu))) { 505 IPCB(skb)->frag_max_size > mtu))) {
507 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 506 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -866,7 +865,7 @@ static int __ip_append_data(struct sock *sk,
866 865
867 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 866 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
868 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 867 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
869 maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu; 868 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
870 869
871 if (cork->length + length > maxnonfragsize - fragheaderlen) { 870 if (cork->length + length > maxnonfragsize - fragheaderlen) {
872 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, 871 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1189,7 +1188,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1189 1188
1190 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1189 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1191 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1190 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1192 maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu; 1191 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
1193 1192
1194 if (cork->length + size > maxnonfragsize - fragheaderlen) { 1193 if (cork->length + size > maxnonfragsize - fragheaderlen) {
1195 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, 1194 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1350,10 +1349,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1350 * to fragment the frame generated here. No matter, what transforms 1349 * to fragment the frame generated here. No matter, what transforms
1351 * how transforms change size of the packet, it will come out. 1350 * how transforms change size of the packet, it will come out.
1352 */ 1351 */
1353 skb->local_df = ip_sk_local_df(sk); 1352 skb->ignore_df = ip_sk_ignore_df(sk);
1354 1353
1355 /* DF bit is set when we want to see DF on outgoing frames. 1354 /* DF bit is set when we want to see DF on outgoing frames.
1356 * If local_df is set too, we still allow to fragment this frame 1355 * If ignore_df is set too, we still allow to fragment this frame
1357 * locally. */ 1356 * locally. */
1358 if (inet->pmtudisc == IP_PMTUDISC_DO || 1357 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1359 inet->pmtudisc == IP_PMTUDISC_PROBE || 1358 inet->pmtudisc == IP_PMTUDISC_PROBE ||
@@ -1379,7 +1378,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1379 iph->ttl = ttl; 1378 iph->ttl = ttl;
1380 iph->protocol = sk->sk_protocol; 1379 iph->protocol = sk->sk_protocol;
1381 ip_copy_addrs(iph, fl4); 1380 ip_copy_addrs(iph, fl4);
1382 ip_select_ident(skb, &rt->dst, sk); 1381 ip_select_ident(skb, sk);
1383 1382
1384 if (opt) { 1383 if (opt) {
1385 iph->ihl += opt->optlen>>2; 1384 iph->ihl += opt->optlen>>2;
@@ -1546,7 +1545,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1546 daddr = replyopts.opt.opt.faddr; 1545 daddr = replyopts.opt.opt.faddr;
1547 } 1546 }
1548 1547
1549 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1548 flowi4_init_output(&fl4, arg->bound_dev_if,
1549 IP4_REPLY_MARK(net, skb->mark),
1550 RT_TOS(arg->tos), 1550 RT_TOS(arg->tos),
1551 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, 1551 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1552 ip_reply_arg_flowi_flags(arg), 1552 ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 2acc2337d38b..097b3e7c1e8f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -268,6 +268,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
268 __be32 remote = parms->iph.daddr; 268 __be32 remote = parms->iph.daddr;
269 __be32 local = parms->iph.saddr; 269 __be32 local = parms->iph.saddr;
270 __be32 key = parms->i_key; 270 __be32 key = parms->i_key;
271 __be16 flags = parms->i_flags;
271 int link = parms->link; 272 int link = parms->link;
272 struct ip_tunnel *t = NULL; 273 struct ip_tunnel *t = NULL;
273 struct hlist_head *head = ip_bucket(itn, parms); 274 struct hlist_head *head = ip_bucket(itn, parms);
@@ -275,9 +276,9 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
275 hlist_for_each_entry_rcu(t, head, hash_node) { 276 hlist_for_each_entry_rcu(t, head, hash_node) {
276 if (local == t->parms.iph.saddr && 277 if (local == t->parms.iph.saddr &&
277 remote == t->parms.iph.daddr && 278 remote == t->parms.iph.daddr &&
278 key == t->parms.i_key &&
279 link == t->parms.link && 279 link == t->parms.link &&
280 type == t->dev->type) 280 type == t->dev->type &&
281 ip_tunnel_key_match(&t->parms, flags, key))
281 break; 282 break;
282 } 283 }
283 return t; 284 return t;
@@ -395,11 +396,10 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
395 struct ip_tunnel_net *itn, 396 struct ip_tunnel_net *itn,
396 struct ip_tunnel_parm *parms) 397 struct ip_tunnel_parm *parms)
397{ 398{
398 struct ip_tunnel *nt, *fbt; 399 struct ip_tunnel *nt;
399 struct net_device *dev; 400 struct net_device *dev;
400 401
401 BUG_ON(!itn->fb_tunnel_dev); 402 BUG_ON(!itn->fb_tunnel_dev);
402 fbt = netdev_priv(itn->fb_tunnel_dev);
403 dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); 403 dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
404 if (IS_ERR(dev)) 404 if (IS_ERR(dev))
405 return ERR_CAST(dev); 405 return ERR_CAST(dev);
@@ -668,6 +668,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
668 dev->needed_headroom = max_headroom; 668 dev->needed_headroom = max_headroom;
669 669
670 if (skb_cow_head(skb, dev->needed_headroom)) { 670 if (skb_cow_head(skb, dev->needed_headroom)) {
671 ip_rt_put(rt);
671 dev->stats.tx_dropped++; 672 dev->stats.tx_dropped++;
672 kfree_skb(skb); 673 kfree_skb(skb);
673 return; 674 return;
@@ -747,19 +748,19 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
747 goto done; 748 goto done;
748 if (p->iph.ttl) 749 if (p->iph.ttl)
749 p->iph.frag_off |= htons(IP_DF); 750 p->iph.frag_off |= htons(IP_DF);
750 if (!(p->i_flags&TUNNEL_KEY)) 751 if (!(p->i_flags & VTI_ISVTI)) {
751 p->i_key = 0; 752 if (!(p->i_flags & TUNNEL_KEY))
752 if (!(p->o_flags&TUNNEL_KEY)) 753 p->i_key = 0;
753 p->o_key = 0; 754 if (!(p->o_flags & TUNNEL_KEY))
755 p->o_key = 0;
756 }
754 757
755 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); 758 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
756 759
757 if (!t && (cmd == SIOCADDTUNNEL)) { 760 if (!t && (cmd == SIOCADDTUNNEL)) {
758 t = ip_tunnel_create(net, itn, p); 761 t = ip_tunnel_create(net, itn, p);
759 if (IS_ERR(t)) { 762 err = PTR_ERR_OR_ZERO(t);
760 err = PTR_ERR(t); 763 break;
761 break;
762 }
763 } 764 }
764 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { 765 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
765 if (t != NULL) { 766 if (t != NULL) {
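
The SIOCADDTUNNEL branch above folds the IS_ERR()/PTR_ERR() dance into PTR_ERR_OR_ZERO(t) and now always breaks out of the switch. For readers unfamiliar with that helper, the include/linux/err.h convention is paraphrased below with userspace stand-ins (not the kernel headers): error pointers encode a small negative errno in the last page of the pointer range, and PTR_ERR_OR_ZERO() returns 0 for a valid pointer or the encoded errno otherwise.

#include <stdio.h>

/* Userspace paraphrase of include/linux/err.h: errnos up to MAX_ERRNO are
 * encoded at the very top of the pointer range. */
#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int  IS_ERR(const void *ptr)  { return IS_ERR_VALUE(ptr); }

/* 0 for a valid pointer, the encoded -errno otherwise. */
static int PTR_ERR_OR_ZERO(const void *ptr)
{
	return IS_ERR(ptr) ? (int)PTR_ERR(ptr) : 0;
}

int main(void)
{
	char ok;
	void *bad = (void *)(long)-12;	/* like ERR_PTR(-ENOMEM) */

	printf("%d %d\n", PTR_ERR_OR_ZERO(&ok), PTR_ERR_OR_ZERO(bad));
	return 0;
}
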
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index bcf206c79005..f4c987bb7e94 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -74,7 +74,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
74 iph->daddr = dst; 74 iph->daddr = dst;
75 iph->saddr = src; 75 iph->saddr = src;
76 iph->ttl = ttl; 76 iph->ttl = ttl;
77 __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); 77 __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
78 78
79 err = ip_local_out_sk(sk, skb); 79 err = ip_local_out_sk(sk, skb);
80 if (unlikely(net_xmit_eval(err))) 80 if (unlikely(net_xmit_eval(err)))
@@ -135,6 +135,14 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
135 return skb; 135 return skb;
136 } 136 }
137 137
138 /* If packet is not gso and we are resolving any partial checksum,
139 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
140 * on the outer header without confusing devices that implement
141 * NETIF_F_IP_CSUM with encapsulation.
142 */
143 if (csum_help)
144 skb->encapsulation = 0;
145
138 if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { 146 if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
139 err = skb_checksum_help(skb); 147 err = skb_checksum_help(skb);
140 if (unlikely(err)) 148 if (unlikely(err))
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 13ef00f1e17b..b8960f3527f3 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -313,7 +313,13 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
313 return -EINVAL; 313 return -EINVAL;
314 } 314 }
315 315
316 p.i_flags |= VTI_ISVTI; 316 if (!(p.i_flags & GRE_KEY))
317 p.i_key = 0;
318 if (!(p.o_flags & GRE_KEY))
319 p.o_key = 0;
320
321 p.i_flags = VTI_ISVTI;
322
317 err = ip_tunnel_ioctl(dev, &p, cmd); 323 err = ip_tunnel_ioctl(dev, &p, cmd);
318 if (err) 324 if (err)
319 return err; 325 return err;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 812b18351462..62eaa005e146 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -149,13 +149,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
149 149
150 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 150 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
151 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 151 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
152 t->dev->ifindex, 0, IPPROTO_IPIP, 0); 152 t->parms.link, 0, IPPROTO_IPIP, 0);
153 err = 0; 153 err = 0;
154 goto out; 154 goto out;
155 } 155 }
156 156
157 if (type == ICMP_REDIRECT) { 157 if (type == ICMP_REDIRECT) {
158 ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, 158 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
159 IPPROTO_IPIP, 0); 159 IPPROTO_IPIP, 0);
160 err = 0; 160 err = 0;
161 goto out; 161 goto out;
@@ -486,4 +486,5 @@ static void __exit ipip_fini(void)
486module_init(ipip_init); 486module_init(ipip_init);
487module_exit(ipip_fini); 487module_exit(ipip_fini);
488MODULE_LICENSE("GPL"); 488MODULE_LICENSE("GPL");
489MODULE_ALIAS_RTNL_LINK("ipip");
489MODULE_ALIAS_NETDEV("tunl0"); 490MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d84dc8d4c916..65bcaa789043 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -484,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev)
484 dev->type = ARPHRD_PIMREG; 484 dev->type = ARPHRD_PIMREG;
485 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; 485 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
486 dev->flags = IFF_NOARP; 486 dev->flags = IFF_NOARP;
487 dev->netdev_ops = &reg_vif_netdev_ops, 487 dev->netdev_ops = &reg_vif_netdev_ops;
488 dev->destructor = free_netdev; 488 dev->destructor = free_netdev;
489 dev->features |= NETIF_F_NETNS_LOCAL; 489 dev->features |= NETIF_F_NETNS_LOCAL;
490} 490}
@@ -1663,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1663 iph->protocol = IPPROTO_IPIP; 1663 iph->protocol = IPPROTO_IPIP;
1664 iph->ihl = 5; 1664 iph->ihl = 5;
1665 iph->tot_len = htons(skb->len); 1665 iph->tot_len = htons(skb->len);
1666 ip_select_ident(skb, skb_dst(skb), NULL); 1666 ip_select_ident(skb, NULL);
1667 ip_send_check(iph); 1667 ip_send_check(iph);
1668 1668
1669 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1669 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ee2886126e3d..f1787c04a4dd 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -91,17 +91,9 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
91 if (nf_ct_is_untracked(ct)) 91 if (nf_ct_is_untracked(ct))
92 return NF_ACCEPT; 92 return NF_ACCEPT;
93 93
94 nat = nfct_nat(ct); 94 nat = nf_ct_nat_ext_add(ct);
95 if (!nat) { 95 if (nat == NULL)
96 /* NAT module was loaded late. */ 96 return NF_ACCEPT;
97 if (nf_ct_is_confirmed(ct))
98 return NF_ACCEPT;
99 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
100 if (nat == NULL) {
101 pr_debug("failed to add NAT extension\n");
102 return NF_ACCEPT;
103 }
104 }
105 97
106 switch (ctinfo) { 98 switch (ctinfo) {
107 case IP_CT_RELATED: 99 case IP_CT_RELATED:
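Both NAT hooks (here and in nft_chain_nat_ipv4.c just below) now call nf_ct_nat_ext_add() instead of open-coding "look up the NAT extension, add it only if the conntrack entry is not yet confirmed". The helper itself is not part of this hunk; the sketch below is a hypothetical user-space rendering of that get-or-create pattern, with every name invented for illustration — it is not the netfilter API.

/* Hypothetical get-or-create pattern mirroring the removed open-coded block. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct conn {
    bool confirmed;    /* once confirmed, no new extensions may be added */
    void *nat_ext;     /* NULL until the NAT extension is attached */
};

static void *nat_ext_get_or_add(struct conn *ct)
{
    if (ct->nat_ext)          /* already present */
        return ct->nat_ext;
    if (ct->confirmed)        /* too late to grow the entry */
        return NULL;
    ct->nat_ext = calloc(1, 16);   /* attach a fresh extension */
    return ct->nat_ext;
}

int main(void)
{
    struct conn early = { .confirmed = false };
    struct conn late  = { .confirmed = true };

    printf("early: %s\n", nat_ext_get_or_add(&early) ? "added" : "skipped");
    printf("late:  %s\n", nat_ext_get_or_add(&late)  ? "added" : "skipped");
    free(early.nat_ext);
    return 0;
}
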
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index f40f321b41fc..b8f6381c7d0b 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -34,7 +34,7 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
34 34
35 if (!err) { 35 if (!err) {
36 ip_send_check(ip_hdr(skb)); 36 ip_send_check(ip_hdr(skb));
37 skb->local_df = 1; 37 skb->ignore_df = 1;
38 } 38 }
39 39
40 return err; 40 return err;
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index b5b256d45e67..3964157d826c 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -48,15 +48,9 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
48 48
49 NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); 49 NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
50 50
51 nat = nfct_nat(ct); 51 nat = nf_ct_nat_ext_add(ct);
52 if (nat == NULL) { 52 if (nat == NULL)
53 /* Conntrack module was loaded late, can't add extension. */ 53 return NF_ACCEPT;
54 if (nf_ct_is_confirmed(ct))
55 return NF_ACCEPT;
56 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
57 if (nat == NULL)
58 return NF_ACCEPT;
59 }
60 54
61 switch (ctinfo) { 55 switch (ctinfo) {
62 case IP_CT_RELATED: 56 case IP_CT_RELATED:
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ad737fad6d8b..ae0af9386f7c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -345,15 +345,15 @@ static void icmp_put(struct seq_file *seq)
345 for (i = 0; icmpmibmap[i].name != NULL; i++) 345 for (i = 0; icmpmibmap[i].name != NULL; i++)
346 seq_printf(seq, " Out%s", icmpmibmap[i].name); 346 seq_printf(seq, " Out%s", icmpmibmap[i].name);
347 seq_printf(seq, "\nIcmp: %lu %lu %lu", 347 seq_printf(seq, "\nIcmp: %lu %lu %lu",
348 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), 348 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
349 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS), 349 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
350 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); 350 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
351 for (i = 0; icmpmibmap[i].name != NULL; i++) 351 for (i = 0; icmpmibmap[i].name != NULL; i++)
352 seq_printf(seq, " %lu", 352 seq_printf(seq, " %lu",
353 atomic_long_read(ptr + icmpmibmap[i].index)); 353 atomic_long_read(ptr + icmpmibmap[i].index));
354 seq_printf(seq, " %lu %lu", 354 seq_printf(seq, " %lu %lu",
355 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), 355 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
356 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); 356 snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
357 for (i = 0; icmpmibmap[i].name != NULL; i++) 357 for (i = 0; icmpmibmap[i].name != NULL; i++)
358 seq_printf(seq, " %lu", 358 seq_printf(seq, " %lu",
359 atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); 359 atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
@@ -379,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
379 BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); 379 BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
380 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) 380 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
381 seq_printf(seq, " %llu", 381 seq_printf(seq, " %llu",
382 snmp_fold_field64((void __percpu **)net->mib.ip_statistics, 382 snmp_fold_field64(net->mib.ip_statistics,
383 snmp4_ipstats_list[i].entry, 383 snmp4_ipstats_list[i].entry,
384 offsetof(struct ipstats_mib, syncp))); 384 offsetof(struct ipstats_mib, syncp)));
385 385
@@ -395,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
395 /* MaxConn field is signed, RFC 2012 */ 395 /* MaxConn field is signed, RFC 2012 */
396 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) 396 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
397 seq_printf(seq, " %ld", 397 seq_printf(seq, " %ld",
398 snmp_fold_field((void __percpu **)net->mib.tcp_statistics, 398 snmp_fold_field(net->mib.tcp_statistics,
399 snmp4_tcp_list[i].entry)); 399 snmp4_tcp_list[i].entry));
400 else 400 else
401 seq_printf(seq, " %lu", 401 seq_printf(seq, " %lu",
402 snmp_fold_field((void __percpu **)net->mib.tcp_statistics, 402 snmp_fold_field(net->mib.tcp_statistics,
403 snmp4_tcp_list[i].entry)); 403 snmp4_tcp_list[i].entry));
404 } 404 }
405 405
@@ -410,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
410 seq_puts(seq, "\nUdp:"); 410 seq_puts(seq, "\nUdp:");
411 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 411 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
412 seq_printf(seq, " %lu", 412 seq_printf(seq, " %lu",
413 snmp_fold_field((void __percpu **)net->mib.udp_statistics, 413 snmp_fold_field(net->mib.udp_statistics,
414 snmp4_udp_list[i].entry)); 414 snmp4_udp_list[i].entry));
415 415
416 /* the UDP and UDP-Lite MIBs are the same */ 416 /* the UDP and UDP-Lite MIBs are the same */
@@ -421,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
421 seq_puts(seq, "\nUdpLite:"); 421 seq_puts(seq, "\nUdpLite:");
422 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 422 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
423 seq_printf(seq, " %lu", 423 seq_printf(seq, " %lu",
424 snmp_fold_field((void __percpu **)net->mib.udplite_statistics, 424 snmp_fold_field(net->mib.udplite_statistics,
425 snmp4_udp_list[i].entry)); 425 snmp4_udp_list[i].entry));
426 426
427 seq_putc(seq, '\n'); 427 seq_putc(seq, '\n');
@@ -458,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
458 seq_puts(seq, "\nTcpExt:"); 458 seq_puts(seq, "\nTcpExt:");
459 for (i = 0; snmp4_net_list[i].name != NULL; i++) 459 for (i = 0; snmp4_net_list[i].name != NULL; i++)
460 seq_printf(seq, " %lu", 460 seq_printf(seq, " %lu",
461 snmp_fold_field((void __percpu **)net->mib.net_statistics, 461 snmp_fold_field(net->mib.net_statistics,
462 snmp4_net_list[i].entry)); 462 snmp4_net_list[i].entry));
463 463
464 seq_puts(seq, "\nIpExt:"); 464 seq_puts(seq, "\nIpExt:");
@@ -468,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
468 seq_puts(seq, "\nIpExt:"); 468 seq_puts(seq, "\nIpExt:");
469 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) 469 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
470 seq_printf(seq, " %llu", 470 seq_printf(seq, " %llu",
471 snmp_fold_field64((void __percpu **)net->mib.ip_statistics, 471 snmp_fold_field64(net->mib.ip_statistics,
472 snmp4_ipextstats_list[i].entry, 472 snmp4_ipextstats_list[i].entry,
473 offsetof(struct ipstats_mib, syncp))); 473 offsetof(struct ipstats_mib, syncp)));
474 474
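The snmp_fold_field()/snmp_fold_field64() call sites drop the (void __percpu **) casts; the helpers now take the MIB pointers as-is (their new prototypes are outside this hunk). Either way, conceptually they just sum one counter slot across all CPUs. A rough user-space model of that folding, with plain arrays standing in for per-cpu storage and illustrative sizes:

/* Rough model: each "CPU" keeps its own counter array, a reader sums one slot. */
#include <stdio.h>

#define NR_CPUS   4
#define MIB_SLOTS 8

static unsigned long mib[NR_CPUS][MIB_SLOTS];   /* stand-in for per-cpu data */

static unsigned long fold_field(int slot)
{
    unsigned long sum = 0;
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        sum += mib[cpu][slot];
    return sum;
}

int main(void)
{
    mib[0][2] = 10;     /* pretend two CPUs incremented slot 2 */
    mib[3][2] = 5;
    printf("folded slot 2: %lu\n", fold_field(2));  /* prints 15 */
    return 0;
}
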
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a9dbe58bdfe7..2c65160565e1 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
389 iph->check = 0; 389 iph->check = 0;
390 iph->tot_len = htons(length); 390 iph->tot_len = htons(length);
391 if (!iph->id) 391 if (!iph->id)
392 ip_select_ident(skb, &rt->dst, NULL); 392 ip_select_ident(skb, NULL);
393 393
394 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 394 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
395 } 395 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5e676be3daeb..082239ffe34a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -89,6 +89,7 @@
89#include <linux/rcupdate.h> 89#include <linux/rcupdate.h>
90#include <linux/times.h> 90#include <linux/times.h>
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/jhash.h>
92#include <net/dst.h> 93#include <net/dst.h>
93#include <net/net_namespace.h> 94#include <net/net_namespace.h>
94#include <net/protocol.h> 95#include <net/protocol.h>
@@ -456,39 +457,19 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
456 return neigh_create(&arp_tbl, pkey, dev); 457 return neigh_create(&arp_tbl, pkey, dev);
457} 458}
458 459
459 /* 460 atomic_t *ip_idents __read_mostly;
460 * Peer allocation may fail only in serious out-of-memory conditions. However 461 EXPORT_SYMBOL(ip_idents);
461 * we still can generate some output.
462 * Random ID selection looks a bit dangerous because we have no chances to
463 * select ID being unique in a reasonable period of time.
464 * But broken packet identifier may be better than no packet at all.
465 */
466 static void ip_select_fb_ident(struct iphdr *iph)
467{
468 static DEFINE_SPINLOCK(ip_fb_id_lock);
469 static u32 ip_fallback_id;
470 u32 salt;
471
472 spin_lock_bh(&ip_fb_id_lock);
473 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
474 iph->id = htons(salt & 0xFFFF);
475 ip_fallback_id = salt;
476 spin_unlock_bh(&ip_fb_id_lock);
477}
478 462
479 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 463 void __ip_select_ident(struct iphdr *iph, int segs)
480{ 464{
481 struct net *net = dev_net(dst->dev); 465 static u32 ip_idents_hashrnd __read_mostly;
482 struct inet_peer *peer; 466 u32 hash, id;
483 467
484 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); 468 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
485 if (peer) {
486 iph->id = htons(inet_getid(peer, more));
487 inet_putpeer(peer);
488 return;
489 }
490 469
491 ip_select_fb_ident(iph); 470 hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
471 id = ip_idents_reserve(hash, segs);
472 iph->id = htons(id);
492} 473}
493 EXPORT_SYMBOL(__ip_select_ident); 474 EXPORT_SYMBOL(__ip_select_ident);
494 475
@@ -993,6 +974,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
993 struct flowi4 fl4; 974 struct flowi4 fl4;
994 struct rtable *rt; 975 struct rtable *rt;
995 976
977 if (!mark)
978 mark = IP4_REPLY_MARK(net, skb->mark);
979
996 __build_flow_key(&fl4, NULL, iph, oif, 980 __build_flow_key(&fl4, NULL, iph, oif,
997 RT_TOS(iph->tos), protocol, mark, flow_flags); 981 RT_TOS(iph->tos), protocol, mark, flow_flags);
998 rt = __ip_route_output_key(net, &fl4); 982 rt = __ip_route_output_key(net, &fl4);
@@ -1010,6 +994,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1010 struct rtable *rt; 994 struct rtable *rt;
1011 995
1012 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 996 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
997
998 if (!fl4.flowi4_mark)
999 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1000
1013 rt = __ip_route_output_key(sock_net(sk), &fl4); 1001 rt = __ip_route_output_key(sock_net(sk), &fl4);
1014 if (!IS_ERR(rt)) { 1002 if (!IS_ERR(rt)) {
1015 __ip_rt_update_pmtu(rt, &fl4, mtu); 1003 __ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -2704,6 +2692,12 @@ int __init ip_rt_init(void)
2704{ 2692{
2705 int rc = 0; 2693 int rc = 0;
2706 2694
2695 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2696 if (!ip_idents)
2697 panic("IP: failed to allocate ip_idents\n");
2698
2699 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2700
2707#ifdef CONFIG_IP_ROUTE_CLASSID 2701#ifdef CONFIG_IP_ROUTE_CLASSID
2708 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 2702 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2709 if (!ip_rt_acct) 2703 if (!ip_rt_acct)
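This is the inetpeer-free IP ID scheme from the merge message: a boot-time array of atomic counters (ip_idents), indexed by a jhash of the destination address, replaces the per-destination inetpeer lookup and the spinlocked fallback generator. ip_idents_reserve() and IP_IDENTS_SZ are referenced here but defined outside this hunk. A self-contained user-space approximation of the idea, where a toy multiplicative mix and C11 fetch-and-add stand in for jhash_1word() and the kernel's reservation helper:

/* Approximation only: bucket count, hash and reservation are simplified. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ID_BUCKETS 2048                 /* power of two, illustrative size */

static atomic_uint ids[ID_BUCKETS];

static uint16_t select_ident(uint32_t daddr, uint32_t hashrnd, unsigned int segs)
{
    uint32_t hash = (daddr ^ hashrnd) * 2654435761u;   /* toy mix, not jhash */
    atomic_uint *bucket = &ids[hash & (ID_BUCKETS - 1)];

    /* reserve 'segs' consecutive IDs and hand back the first one */
    return (uint16_t)atomic_fetch_add(bucket, segs);
}

int main(void)
{
    uint32_t daddr = 0x0a000001, rnd = 0xdeadbeef;

    printf("id=%u\n", select_ident(daddr, rnd, 1));
    printf("id=%u\n", select_ident(daddr, rnd, 3));   /* reserve 3 for a GSO burst */
    printf("id=%u\n", select_ident(daddr, rnd, 1));
    return 0;
}
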
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index f2ed13c2125f..c86624b36a62 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -303,6 +303,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
303 ireq->ir_rmt_port = th->source; 303 ireq->ir_rmt_port = th->source;
304 ireq->ir_loc_addr = ip_hdr(skb)->daddr; 304 ireq->ir_loc_addr = ip_hdr(skb)->daddr;
305 ireq->ir_rmt_addr = ip_hdr(skb)->saddr; 305 ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
306 ireq->ir_mark = inet_request_mark(sk, skb);
306 ireq->ecn_ok = ecn_ok; 307 ireq->ecn_ok = ecn_ok;
307 ireq->snd_wscale = tcp_opt.snd_wscale; 308 ireq->snd_wscale = tcp_opt.snd_wscale;
308 ireq->sack_ok = tcp_opt.sack_ok; 309 ireq->sack_ok = tcp_opt.sack_ok;
@@ -339,7 +340,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
339 * hasn't changed since we received the original syn, but I see 340 * hasn't changed since we received the original syn, but I see
340 * no easy way to do this. 341 * no easy way to do this.
341 */ 342 */
342 flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, 343 flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
343 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, 344 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
344 inet_sk_flowi_flags(sk), 345 inet_sk_flowi_flags(sk),
345 (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, 346 (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
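ireq->ir_mark is filled from inet_request_mark() and then used in place of sk->sk_mark when routing the syncookie SYN-ACK, so the reply can carry the mark derived from the packet that triggered it (the fwmark work in the merge summary; the matching fwmark_reflect/tcp_fwmark_accept knobs appear in the sysctl section below). For reference, this is the mark an application would normally set from user space; a small hedged example, noting that SO_MARK requires CAP_NET_ADMIN:

/* Setting SO_MARK on a socket; the kernel changes above are about carrying
 * such marks into replies that have no full socket yet. */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_MARK
#define SO_MARK 36   /* asm-generic/socket.h value; differs on a few arches */
#endif

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    unsigned int mark = 0x2a;                 /* arbitrary example mark */

    if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
        perror("SO_MARK");                    /* EPERM without CAP_NET_ADMIN */
    else
        printf("mark 0x%x set\n", mark);
    if (fd >= 0)
        close(fd);
    return 0;
}
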
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5cde8f263d40..79a007c52558 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = {
437 .proc_handler = proc_dointvec 437 .proc_handler = proc_dointvec
438 }, 438 },
439 { 439 {
440 .procname = "ip_local_reserved_ports",
441 .data = NULL, /* initialized in sysctl_ipv4_init */
442 .maxlen = 65536,
443 .mode = 0644,
444 .proc_handler = proc_do_large_bitmap,
445 },
446 {
447 .procname = "igmp_max_memberships", 440 .procname = "igmp_max_memberships",
448 .data = &sysctl_igmp_max_memberships, 441 .data = &sysctl_igmp_max_memberships,
449 .maxlen = sizeof(int), 442 .maxlen = sizeof(int),
@@ -825,6 +818,13 @@ static struct ctl_table ipv4_net_table[] = {
825 .proc_handler = ipv4_local_port_range, 818 .proc_handler = ipv4_local_port_range,
826 }, 819 },
827 { 820 {
821 .procname = "ip_local_reserved_ports",
822 .data = &init_net.ipv4.sysctl_local_reserved_ports,
823 .maxlen = 65536,
824 .mode = 0644,
825 .proc_handler = proc_do_large_bitmap,
826 },
827 {
828 .procname = "ip_no_pmtu_disc", 828 .procname = "ip_no_pmtu_disc",
829 .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, 829 .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
830 .maxlen = sizeof(int), 830 .maxlen = sizeof(int),
@@ -838,6 +838,20 @@ static struct ctl_table ipv4_net_table[] = {
838 .mode = 0644, 838 .mode = 0644,
839 .proc_handler = proc_dointvec, 839 .proc_handler = proc_dointvec,
840 }, 840 },
841 {
842 .procname = "fwmark_reflect",
843 .data = &init_net.ipv4.sysctl_fwmark_reflect,
844 .maxlen = sizeof(int),
845 .mode = 0644,
846 .proc_handler = proc_dointvec,
847 },
848 {
849 .procname = "tcp_fwmark_accept",
850 .data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
851 .maxlen = sizeof(int),
852 .mode = 0644,
853 .proc_handler = proc_dointvec,
854 },
841 { } 855 { }
842}; 856};
843 857
@@ -862,8 +876,14 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
862 if (net->ipv4.ipv4_hdr == NULL) 876 if (net->ipv4.ipv4_hdr == NULL)
863 goto err_reg; 877 goto err_reg;
864 878
879 net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
880 if (!net->ipv4.sysctl_local_reserved_ports)
881 goto err_ports;
882
865 return 0; 883 return 0;
866 884
885err_ports:
886 unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
867err_reg: 887err_reg:
868 if (!net_eq(net, &init_net)) 888 if (!net_eq(net, &init_net))
869 kfree(table); 889 kfree(table);
@@ -875,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)
875{ 895{
876 struct ctl_table *table; 896 struct ctl_table *table;
877 897
898 kfree(net->ipv4.sysctl_local_reserved_ports);
878 table = net->ipv4.ipv4_hdr->ctl_table_arg; 899 table = net->ipv4.ipv4_hdr->ctl_table_arg;
879 unregister_net_sysctl_table(net->ipv4.ipv4_hdr); 900 unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
880 kfree(table); 901 kfree(table);
@@ -888,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
888static __init int sysctl_ipv4_init(void) 909static __init int sysctl_ipv4_init(void)
889{ 910{
890 struct ctl_table_header *hdr; 911 struct ctl_table_header *hdr;
891 struct ctl_table *i;
892
893 for (i = ipv4_table; i->procname; i++) {
894 if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
895 i->data = sysctl_local_reserved_ports;
896 break;
897 }
898 }
899 if (!i->procname)
900 return -EINVAL;
901 912
902 hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); 913 hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
903 if (hdr == NULL) 914 if (hdr == NULL)
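Three things happen in sysctl_net_ipv4.c: ip_local_reserved_ports moves from the global table to the per-netns table, so each network namespace now allocates its own 65536-bit reservation bitmap (the 65536/8 = 8 KiB kzalloc above, freed on namespace exit); the old init-time fixup loop in sysctl_ipv4_init() goes away; and two new per-netns knobs, fwmark_reflect and tcp_fwmark_accept, are registered. A tiny user-space sketch of what such an 8 KiB one-bit-per-port bitmap looks like:

/* Sketch of a 65536-bit reserved-port bitmap: one bit per port, 8 KiB total. */
#include <stdint.h>
#include <stdio.h>

static uint8_t reserved[65536 / 8];          /* all ports unreserved initially */

static void reserve_port(unsigned int port)
{
    reserved[port >> 3] |= 1u << (port & 7);
}

static int port_is_reserved(unsigned int port)
{
    return reserved[port >> 3] & (1u << (port & 7));
}

int main(void)
{
    reserve_port(8080);          /* e.g. echo 8080 > .../ip_local_reserved_ports */
    printf("8080 reserved: %d\n", port_is_reserved(8080));
    printf("8081 reserved: %d\n", port_is_reserved(8081));
    return 0;
}
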
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4bd6d52eeffb..eb1dde37e678 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2916,6 +2916,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2916 case TCP_USER_TIMEOUT: 2916 case TCP_USER_TIMEOUT:
2917 val = jiffies_to_msecs(icsk->icsk_user_timeout); 2917 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2918 break; 2918 break;
2919
2920 case TCP_FASTOPEN:
2921 if (icsk->icsk_accept_queue.fastopenq != NULL)
2922 val = icsk->icsk_accept_queue.fastopenq->max_qlen;
2923 else
2924 val = 0;
2925 break;
2926
2919 case TCP_TIMESTAMP: 2927 case TCP_TIMESTAMP:
2920 val = tcp_time_stamp + tp->tsoffset; 2928 val = tcp_time_stamp + tp->tsoffset;
2921 break; 2929 break;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 821846fb0a7e..d5de69bc04f5 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -140,13 +140,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
140 ca->cnt = 1; 140 ca->cnt = 1;
141} 141}
142 142
143 static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, 143 static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
144 u32 in_flight)
145{ 144{
146 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
147 struct bictcp *ca = inet_csk_ca(sk); 146 struct bictcp *ca = inet_csk_ca(sk);
148 147
149 if (!tcp_is_cwnd_limited(sk, in_flight)) 148 if (!tcp_is_cwnd_limited(sk))
150 return; 149 return;
151 150
152 if (tp->snd_cwnd <= tp->snd_ssthresh) 151 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2b9464c93b88..7b09d8b49fa5 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
276 return err; 276 return err;
277} 277}
278 278
279 /* RFC2861 Check whether we are limited by application or congestion window
280 * This is the inverse of cwnd check in tcp_tso_should_defer
281 */
282 bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
283{
284 const struct tcp_sock *tp = tcp_sk(sk);
285 u32 left;
286
287 if (in_flight >= tp->snd_cwnd)
288 return true;
289
290 left = tp->snd_cwnd - in_flight;
291 if (sk_can_gso(sk) &&
292 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
293 left < tp->xmit_size_goal_segs)
294 return true;
295 return left <= tcp_max_tso_deferred_mss(tp);
296}
297 EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
298
299 /* Slow start is used when congestion window is no greater than the slow start 279 /* Slow start is used when congestion window is no greater than the slow start
300 * threshold. We base on RFC2581 and also handle stretch ACKs properly. 280 * threshold. We base on RFC2581 and also handle stretch ACKs properly.
301 * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but 281 * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
@@ -337,11 +317,11 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
337/* This is Jacobson's slow start and congestion avoidance. 317/* This is Jacobson's slow start and congestion avoidance.
338 * SIGCOMM '88, p. 328. 318 * SIGCOMM '88, p. 328.
339 */ 319 */
340 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 320 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
341{ 321{
342 struct tcp_sock *tp = tcp_sk(sk); 322 struct tcp_sock *tp = tcp_sk(sk);
343 323
344 if (!tcp_is_cwnd_limited(sk, in_flight)) 324 if (!tcp_is_cwnd_limited(sk))
345 return; 325 return;
346 326
347 /* In "safe" area, increase. */ 327 /* In "safe" area, increase. */
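With in_flight dropped from the cong_avoid() hook, the RFC2861 version of tcp_is_cwnd_limited() is removed from tcp_cong.c here (its measurement-based replacement lives outside this hunk, per the merge summary), and every congestion module below is updated the same way. The Reno rule that remains is unchanged: exponential growth below ssthresh, then roughly one MSS per round trip. A toy simulation of that growth; the real code caps slow start at ssthresh and handles stretch ACKs, which is omitted here:

/* Toy Reno-style window growth: doubling in slow start, ~+1 segment per RTT after. */
#include <stdio.h>

int main(void)
{
    unsigned int cwnd = 10, ssthresh = 64, cnt = 0;
    int rtt;

    for (rtt = 1; rtt <= 12; rtt++) {
        unsigned int acked = cwnd;      /* assume a full window is ACKed */

        if (cwnd <= ssthresh) {
            cwnd += acked;              /* slow start: doubles per RTT */
        } else {
            cnt += acked;               /* congestion avoidance counter */
            if (cnt >= cwnd) {
                cnt -= cwnd;
                cwnd++;                 /* about one segment per RTT */
            }
        }
        printf("rtt %2d: cwnd %u\n", rtt, cwnd);
    }
    return 0;
}
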
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index b4f1b29b08bd..a9bd8a4828a9 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -304,13 +304,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
304 ca->cnt = 1; 304 ca->cnt = 1;
305} 305}
306 306
307 static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, 307 static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
308 u32 in_flight)
309{ 308{
310 struct tcp_sock *tp = tcp_sk(sk); 309 struct tcp_sock *tp = tcp_sk(sk);
311 struct bictcp *ca = inet_csk_ca(sk); 310 struct bictcp *ca = inet_csk_ca(sk);
312 311
313 if (!tcp_is_cwnd_limited(sk, in_flight)) 312 if (!tcp_is_cwnd_limited(sk))
314 return; 313 return;
315 314
316 if (tp->snd_cwnd <= tp->snd_ssthresh) { 315 if (tp->snd_cwnd <= tp->snd_ssthresh) {
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f195d9316e55..62e48cf84e60 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -72,25 +72,224 @@ error: kfree(ctx);
72 return err; 72 return err;
73} 73}
74 74
75 /* Computes the fastopen cookie for the IP path. 75 static bool __tcp_fastopen_cookie_gen(const void *path,
76 * The path is a 128 bits long (pad with zeros for IPv4). 76 struct tcp_fastopen_cookie *foc)
77 *
78 * The caller must check foc->len to determine if a valid cookie
79 * has been generated successfully.
80 */
81 void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
82 struct tcp_fastopen_cookie *foc)
83{ 77{
84 __be32 path[4] = { src, dst, 0, 0 };
85 struct tcp_fastopen_context *ctx; 78 struct tcp_fastopen_context *ctx;
79 bool ok = false;
86 80
87 tcp_fastopen_init_key_once(true); 81 tcp_fastopen_init_key_once(true);
88 82
89 rcu_read_lock(); 83 rcu_read_lock();
90 ctx = rcu_dereference(tcp_fastopen_ctx); 84 ctx = rcu_dereference(tcp_fastopen_ctx);
91 if (ctx) { 85 if (ctx) {
92 crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); 86 crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
93 foc->len = TCP_FASTOPEN_COOKIE_SIZE; 87 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
88 ok = true;
94 } 89 }
95 rcu_read_unlock(); 90 rcu_read_unlock();
91 return ok;
92}
93
94 /* Generate the fastopen cookie by doing aes128 encryption on both
95 * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
96 * addresses. For the longer IPv6 addresses use CBC-MAC.
97 *
98 * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
99 */
100 static bool tcp_fastopen_cookie_gen(struct request_sock *req,
101 struct sk_buff *syn,
102 struct tcp_fastopen_cookie *foc)
103{
104 if (req->rsk_ops->family == AF_INET) {
105 const struct iphdr *iph = ip_hdr(syn);
106
107 __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
108 return __tcp_fastopen_cookie_gen(path, foc);
109 }
110
111#if IS_ENABLED(CONFIG_IPV6)
112 if (req->rsk_ops->family == AF_INET6) {
113 const struct ipv6hdr *ip6h = ipv6_hdr(syn);
114 struct tcp_fastopen_cookie tmp;
115
116 if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
117 struct in6_addr *buf = (struct in6_addr *) tmp.val;
118 int i = 4;
119
120 for (i = 0; i < 4; i++)
121 buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
122 return __tcp_fastopen_cookie_gen(buf, foc);
123 }
124 }
125#endif
126 return false;
127}
128
129 static bool tcp_fastopen_create_child(struct sock *sk,
130 struct sk_buff *skb,
131 struct dst_entry *dst,
132 struct request_sock *req)
133{
134 struct tcp_sock *tp = tcp_sk(sk);
135 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
136 struct sock *child;
137
138 req->num_retrans = 0;
139 req->num_timeout = 0;
140 req->sk = NULL;
141
142 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
143 if (child == NULL)
144 return false;
145
146 spin_lock(&queue->fastopenq->lock);
147 queue->fastopenq->qlen++;
148 spin_unlock(&queue->fastopenq->lock);
149
150 /* Initialize the child socket. Have to fix some values to take
151 * into account the child is a Fast Open socket and is created
152 * only out of the bits carried in the SYN packet.
153 */
154 tp = tcp_sk(child);
155
156 tp->fastopen_rsk = req;
157 /* Do a hold on the listner sk so that if the listener is being
158 * closed, the child that has been accepted can live on and still
159 * access listen_lock.
160 */
161 sock_hold(sk);
162 tcp_rsk(req)->listener = sk;
163
164 /* RFC1323: The window in SYN & SYN/ACK segments is never
165 * scaled. So correct it appropriately.
166 */
167 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
168
169 /* Activate the retrans timer so that SYNACK can be retransmitted.
170 * The request socket is not added to the SYN table of the parent
171 * because it's been added to the accept queue directly.
172 */
173 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
174 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
175
176 /* Add the child socket directly into the accept queue */
177 inet_csk_reqsk_queue_add(sk, req, child);
178
179 /* Now finish processing the fastopen child socket. */
180 inet_csk(child)->icsk_af_ops->rebuild_header(child);
181 tcp_init_congestion_control(child);
182 tcp_mtup_init(child);
183 tcp_init_metrics(child);
184 tcp_init_buffer_space(child);
185
186 /* Queue the data carried in the SYN packet. We need to first
187 * bump skb's refcnt because the caller will attempt to free it.
188 *
189 * XXX (TFO) - we honor a zero-payload TFO request for now,
190 * (any reason not to?) but no need to queue the skb since
191 * there is no data. How about SYN+FIN?
192 */
193 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) {
194 skb = skb_get(skb);
195 skb_dst_drop(skb);
196 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
197 skb_set_owner_r(skb, child);
198 __skb_queue_tail(&child->sk_receive_queue, skb);
199 tp->syn_data_acked = 1;
200 }
201 tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
202 sk->sk_data_ready(sk);
203 bh_unlock_sock(child);
204 sock_put(child);
205 WARN_ON(req->sk == NULL);
206 return true;
207}
208 EXPORT_SYMBOL(tcp_fastopen_create_child);
209
210 static bool tcp_fastopen_queue_check(struct sock *sk)
211{
212 struct fastopen_queue *fastopenq;
213
214 /* Make sure the listener has enabled fastopen, and we don't
215 * exceed the max # of pending TFO requests allowed before trying
216 * to validating the cookie in order to avoid burning CPU cycles
217 * unnecessarily.
218 *
219 * XXX (TFO) - The implication of checking the max_qlen before
220 * processing a cookie request is that clients can't differentiate
221 * between qlen overflow causing Fast Open to be disabled
222 * temporarily vs a server not supporting Fast Open at all.
223 */
224 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
225 if (fastopenq == NULL || fastopenq->max_qlen == 0)
226 return false;
227
228 if (fastopenq->qlen >= fastopenq->max_qlen) {
229 struct request_sock *req1;
230 spin_lock(&fastopenq->lock);
231 req1 = fastopenq->rskq_rst_head;
232 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
233 spin_unlock(&fastopenq->lock);
234 NET_INC_STATS_BH(sock_net(sk),
235 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
236 return false;
237 }
238 fastopenq->rskq_rst_head = req1->dl_next;
239 fastopenq->qlen--;
240 spin_unlock(&fastopenq->lock);
241 reqsk_free(req1);
242 }
243 return true;
244}
245
246 /* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
247 * may be updated and return the client in the SYN-ACK later. E.g., Fast Open
248 * cookie request (foc->len == 0).
249 */
250 bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
251 struct request_sock *req,
252 struct tcp_fastopen_cookie *foc,
253 struct dst_entry *dst)
254{
255 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
256 bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
257
258 if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
259 (syn_data || foc->len >= 0) &&
260 tcp_fastopen_queue_check(sk))) {
261 foc->len = -1;
262 return false;
263 }
264
265 if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
266 goto fastopen;
267
268 if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
269 foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
270 foc->len == valid_foc.len &&
271 !memcmp(foc->val, valid_foc.val, foc->len)) {
272 /* Cookie is valid. Create a (full) child socket to accept
273 * the data in SYN before returning a SYN-ACK to ack the
274 * data. If we fail to create the socket, fall back and
275 * ack the ISN only but includes the same cookie.
276 *
277 * Note: Data-less SYN with valid cookie is allowed to send
278 * data in SYN_RECV state.
279 */
280 fastopen:
281 if (tcp_fastopen_create_child(sk, skb, dst, req)) {
282 foc->len = -1;
283 NET_INC_STATS_BH(sock_net(sk),
284 LINUX_MIB_TCPFASTOPENPASSIVE);
285 return true;
286 }
287 }
288
289 NET_INC_STATS_BH(sock_net(sk), foc->len ?
290 LINUX_MIB_TCPFASTOPENPASSIVEFAIL :
291 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
292 *foc = valid_foc;
293 return false;
96} 294}
295 EXPORT_SYMBOL(tcp_try_fastopen);
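The server-side Fast Open path is consolidated into tcp_fastopen.c here: tcp_fastopen_cookie_gen() derives the cookie from the request's address family (AES over saddr/daddr for IPv4, a second pass over the XORed destination for IPv6, per the CBC-MAC comment), and tcp_try_fastopen() decides whether to create the child socket straight from the SYN. For context, the client side of the handshake is plain socket code; a hedged example using MSG_FASTOPEN, where data rides on the SYN if a cookie for the server is already cached and the kernel otherwise falls back to a normal handshake (address and port are placeholders):

/* Client-side TCP Fast Open via sendto() with MSG_FASTOPEN. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000     /* value from the kernel uapi */
#endif

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in srv = { .sin_family = AF_INET,
                               .sin_port = htons(8080) };
    const char req[] = "hello";
    ssize_t n;

    inet_pton(AF_INET, "192.0.2.1", &srv.sin_addr);   /* example address */
    n = sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
               (struct sockaddr *)&srv, sizeof(srv));
    if (n < 0)
        perror("sendto(MSG_FASTOPEN)");   /* e.g. EOPNOTSUPP on old kernels */
    close(fd);
    return 0;
}
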
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b9e7bad77c0..1c4908280d92 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,12 +109,12 @@ static void hstcp_init(struct sock *sk)
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); 109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110} 110}
111 111
112 static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 112 static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
113{ 113{
114 struct tcp_sock *tp = tcp_sk(sk); 114 struct tcp_sock *tp = tcp_sk(sk);
115 struct hstcp *ca = inet_csk_ca(sk); 115 struct hstcp *ca = inet_csk_ca(sk);
116 116
117 if (!tcp_is_cwnd_limited(sk, in_flight)) 117 if (!tcp_is_cwnd_limited(sk))
118 return; 118 return;
119 119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) 120 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 4a194acfd923..031361311a8b 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,12 +227,12 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
227 return max((tp->snd_cwnd * ca->beta) >> 7, 2U); 227 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
228} 228}
229 229
230 static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 230 static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
231{ 231{
232 struct tcp_sock *tp = tcp_sk(sk); 232 struct tcp_sock *tp = tcp_sk(sk);
233 struct htcp *ca = inet_csk_ca(sk); 233 struct htcp *ca = inet_csk_ca(sk);
234 234
235 if (!tcp_is_cwnd_limited(sk, in_flight)) 235 if (!tcp_is_cwnd_limited(sk))
236 return; 236 return;
237 237
238 if (tp->snd_cwnd <= tp->snd_ssthresh) 238 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index a15a799bf768..d8f8f05a4951 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -87,8 +87,7 @@ static inline u32 hybla_fraction(u32 odds)
87 * o Give cwnd a new value based on the model proposed 87 * o Give cwnd a new value based on the model proposed
88 * o remember increments <1 88 * o remember increments <1
89 */ 89 */
90 static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, 90 static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
91 u32 in_flight)
92{ 91{
93 struct tcp_sock *tp = tcp_sk(sk); 92 struct tcp_sock *tp = tcp_sk(sk);
94 struct hybla *ca = inet_csk_ca(sk); 93 struct hybla *ca = inet_csk_ca(sk);
@@ -101,11 +100,11 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
101 ca->minrtt_us = tp->srtt_us; 100 ca->minrtt_us = tp->srtt_us;
102 } 101 }
103 102
104 if (!tcp_is_cwnd_limited(sk, in_flight)) 103 if (!tcp_is_cwnd_limited(sk))
105 return; 104 return;
106 105
107 if (!ca->hybla_en) { 106 if (!ca->hybla_en) {
108 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 107 tcp_reno_cong_avoid(sk, ack, acked);
109 return; 108 return;
110 } 109 }
111 110
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 863d105e3015..5999b3972e64 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -255,8 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
255/* 255/*
256 * Increase window in response to successful acknowledgment. 256 * Increase window in response to successful acknowledgment.
257 */ 257 */
258 static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked, 258 static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
259 u32 in_flight)
260{ 259{
261 struct tcp_sock *tp = tcp_sk(sk); 260 struct tcp_sock *tp = tcp_sk(sk);
262 struct illinois *ca = inet_csk_ca(sk); 261 struct illinois *ca = inet_csk_ca(sk);
@@ -265,7 +264,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
265 update_params(sk); 264 update_params(sk);
266 265
267 /* RFC2861 only increase cwnd if fully utilized */ 266 /* RFC2861 only increase cwnd if fully utilized */
268 if (!tcp_is_cwnd_limited(sk, in_flight)) 267 if (!tcp_is_cwnd_limited(sk))
269 return; 268 return;
270 269
271 /* In slow start */ 270 /* In slow start */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3a26b3b23f16..40661fc1e233 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1167,7 +1167,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1167 } 1167 }
1168 pkt_len = new_len; 1168 pkt_len = new_len;
1169 } 1169 }
1170 err = tcp_fragment(sk, skb, pkt_len, mss); 1170 err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
1171 if (err < 0) 1171 if (err < 0)
1172 return err; 1172 return err;
1173 } 1173 }
@@ -2241,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2241 break; 2241 break;
2242 2242
2243 mss = skb_shinfo(skb)->gso_size; 2243 mss = skb_shinfo(skb)->gso_size;
2244 err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); 2244 err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
2245 mss, GFP_ATOMIC);
2245 if (err < 0) 2246 if (err < 0)
2246 break; 2247 break;
2247 cnt = packets; 2248 cnt = packets;
@@ -2937,10 +2938,11 @@ static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
2937 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); 2938 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
2938} 2939}
2939 2940
2940 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 2941 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
2941{ 2942{
2942 const struct inet_connection_sock *icsk = inet_csk(sk); 2943 const struct inet_connection_sock *icsk = inet_csk(sk);
2943 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight); 2944
2945 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
2944 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; 2946 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
2945} 2947}
2946 2948
@@ -3363,7 +3365,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3363 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3365 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3364 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3366 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3365 bool is_dupack = false; 3367 bool is_dupack = false;
3366 u32 prior_in_flight;
3367 u32 prior_fackets; 3368 u32 prior_fackets;
3368 int prior_packets = tp->packets_out; 3369 int prior_packets = tp->packets_out;
3369 const int prior_unsacked = tp->packets_out - tp->sacked_out; 3370 const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3396,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3396 flag |= FLAG_SND_UNA_ADVANCED; 3397 flag |= FLAG_SND_UNA_ADVANCED;
3397 3398
3398 prior_fackets = tp->fackets_out; 3399 prior_fackets = tp->fackets_out;
3399 prior_in_flight = tcp_packets_in_flight(tp);
3400 3400
3401 /* ts_recent update must be made after we are sure that the packet 3401 /* ts_recent update must be made after we are sure that the packet
3402 * is in window. 3402 * is in window.
@@ -3451,7 +3451,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3451 3451
3452 /* Advance cwnd if state allows */ 3452 /* Advance cwnd if state allows */
3453 if (tcp_may_raise_cwnd(sk, flag)) 3453 if (tcp_may_raise_cwnd(sk, flag))
3454 tcp_cong_avoid(sk, ack, acked, prior_in_flight); 3454 tcp_cong_avoid(sk, ack, acked);
3455 3455
3456 if (tcp_ack_is_dubious(sk, flag)) { 3456 if (tcp_ack_is_dubious(sk, flag)) {
3457 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3457 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -4702,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk)
4702 return -1; 4702 return -1;
4703} 4703}
4704 4704
4705 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
4706 * As additional protections, we do not touch cwnd in retransmission phases,
4707 * and if application hit its sndbuf limit recently.
4708 */
4709 void tcp_cwnd_application_limited(struct sock *sk)
4710{
4711 struct tcp_sock *tp = tcp_sk(sk);
4712
4713 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
4714 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
4715 /* Limited by application or receiver window. */
4716 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
4717 u32 win_used = max(tp->snd_cwnd_used, init_win);
4718 if (win_used < tp->snd_cwnd) {
4719 tp->snd_ssthresh = tcp_current_ssthresh(sk);
4720 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
4721 }
4722 tp->snd_cwnd_used = 0;
4723 }
4724 tp->snd_cwnd_stamp = tcp_time_stamp;
4725}
4726
4727 static bool tcp_should_expand_sndbuf(const struct sock *sk) 4705 static bool tcp_should_expand_sndbuf(const struct sock *sk)
4728{ 4706{
4729 const struct tcp_sock *tp = tcp_sk(sk); 4707 const struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 438f3b95143d..77cccda1ad0c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -336,8 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
336 const int code = icmp_hdr(icmp_skb)->code; 336 const int code = icmp_hdr(icmp_skb)->code;
337 struct sock *sk; 337 struct sock *sk;
338 struct sk_buff *skb; 338 struct sk_buff *skb;
339 struct request_sock *req; 339 struct request_sock *fastopen;
340 __u32 seq; 340 __u32 seq, snd_una;
341 __u32 remaining; 341 __u32 remaining;
342 int err; 342 int err;
343 struct net *net = dev_net(icmp_skb->dev); 343 struct net *net = dev_net(icmp_skb->dev);
@@ -378,12 +378,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
378 378
379 icsk = inet_csk(sk); 379 icsk = inet_csk(sk);
380 tp = tcp_sk(sk); 380 tp = tcp_sk(sk);
381 req = tp->fastopen_rsk;
382 seq = ntohl(th->seq); 381 seq = ntohl(th->seq);
382 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
383 fastopen = tp->fastopen_rsk;
384 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
383 if (sk->sk_state != TCP_LISTEN && 385 if (sk->sk_state != TCP_LISTEN &&
384 !between(seq, tp->snd_una, tp->snd_nxt) && 386 !between(seq, snd_una, tp->snd_nxt)) {
385 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
386 /* For a Fast Open socket, allow seq to be snt_isn. */
387 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 387 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
388 goto out; 388 goto out;
389 } 389 }
@@ -426,11 +426,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
426 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) 426 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
427 break; 427 break;
428 if (seq != tp->snd_una || !icsk->icsk_retransmits || 428 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
429 !icsk->icsk_backoff) 429 !icsk->icsk_backoff || fastopen)
430 break; 430 break;
431 431
432 /* XXX (TFO) - revisit the following logic for TFO */
433
434 if (sock_owned_by_user(sk)) 432 if (sock_owned_by_user(sk))
435 break; 433 break;
436 434
@@ -462,14 +460,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
462 goto out; 460 goto out;
463 } 461 }
464 462
465 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
466 * than following the TCP_SYN_RECV case and closing the socket,
467 * we ignore the ICMP error and keep trying like a fully established
468 * socket. Is this the right thing to do?
469 */
470 if (req && req->sk == NULL)
471 goto out;
472
473 switch (sk->sk_state) { 463 switch (sk->sk_state) {
474 struct request_sock *req, **prev; 464 struct request_sock *req, **prev;
475 case TCP_LISTEN: 465 case TCP_LISTEN:
@@ -502,10 +492,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
502 goto out; 492 goto out;
503 493
504 case TCP_SYN_SENT: 494 case TCP_SYN_SENT:
505 case TCP_SYN_RECV: /* Cannot happen. 495 case TCP_SYN_RECV:
506 It can f.e. if SYNs crossed, 496 /* Only in fast or simultaneous open. If a fast open socket is
507 or Fast Open. 497 * is already accepted it is treated as a connected one below.
508 */ 498 */
499 if (fastopen && fastopen->sk == NULL)
500 break;
501
509 if (!sock_owned_by_user(sk)) { 502 if (!sock_owned_by_user(sk)) {
510 sk->sk_err = err; 503 sk->sk_err = err;
511 504
@@ -822,7 +815,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
822 */ 815 */
823static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 816static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
824 struct request_sock *req, 817 struct request_sock *req,
825 u16 queue_mapping) 818 u16 queue_mapping,
819 struct tcp_fastopen_cookie *foc)
826{ 820{
827 const struct inet_request_sock *ireq = inet_rsk(req); 821 const struct inet_request_sock *ireq = inet_rsk(req);
828 struct flowi4 fl4; 822 struct flowi4 fl4;
@@ -833,7 +827,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
833 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 827 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
834 return -1; 828 return -1;
835 829
836 skb = tcp_make_synack(sk, dst, req, NULL); 830 skb = tcp_make_synack(sk, dst, req, foc);
837 831
838 if (skb) { 832 if (skb) {
839 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 833 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
@@ -852,7 +846,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
852 846
853static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) 847static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
854{ 848{
855 int res = tcp_v4_send_synack(sk, NULL, req, 0); 849 int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
856 850
857 if (!res) { 851 if (!res) {
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 852 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -1260,187 +1254,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1260}; 1254};
1261#endif 1255#endif
1262 1256
1263 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1264 struct request_sock *req,
1265 struct tcp_fastopen_cookie *foc,
1266 struct tcp_fastopen_cookie *valid_foc)
1267{
1268 bool skip_cookie = false;
1269 struct fastopen_queue *fastopenq;
1270
1271 if (likely(!fastopen_cookie_present(foc))) {
1272 /* See include/net/tcp.h for the meaning of these knobs */
1273 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1274 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1275 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1276 skip_cookie = true; /* no cookie to validate */
1277 else
1278 return false;
1279 }
1280 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1281 /* A FO option is present; bump the counter. */
1282 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1283
1284 /* Make sure the listener has enabled fastopen, and we don't
1285 * exceed the max # of pending TFO requests allowed before trying
1286 * to validating the cookie in order to avoid burning CPU cycles
1287 * unnecessarily.
1288 *
1289 * XXX (TFO) - The implication of checking the max_qlen before
1290 * processing a cookie request is that clients can't differentiate
1291 * between qlen overflow causing Fast Open to be disabled
1292 * temporarily vs a server not supporting Fast Open at all.
1293 */
1294 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1295 fastopenq == NULL || fastopenq->max_qlen == 0)
1296 return false;
1297
1298 if (fastopenq->qlen >= fastopenq->max_qlen) {
1299 struct request_sock *req1;
1300 spin_lock(&fastopenq->lock);
1301 req1 = fastopenq->rskq_rst_head;
1302 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1303 spin_unlock(&fastopenq->lock);
1304 NET_INC_STATS_BH(sock_net(sk),
1305 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1306 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1307 foc->len = -1;
1308 return false;
1309 }
1310 fastopenq->rskq_rst_head = req1->dl_next;
1311 fastopenq->qlen--;
1312 spin_unlock(&fastopenq->lock);
1313 reqsk_free(req1);
1314 }
1315 if (skip_cookie) {
1316 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1317 return true;
1318 }
1319
1320 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1321 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1322 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1323 ip_hdr(skb)->daddr, valid_foc);
1324 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1325 memcmp(&foc->val[0], &valid_foc->val[0],
1326 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1327 return false;
1328 valid_foc->len = -1;
1329 }
1330 /* Acknowledge the data received from the peer. */
1331 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1332 return true;
1333 } else if (foc->len == 0) { /* Client requesting a cookie */
1334 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1335 ip_hdr(skb)->daddr, valid_foc);
1336 NET_INC_STATS_BH(sock_net(sk),
1337 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1338 } else {
1339 /* Client sent a cookie with wrong size. Treat it
1340 * the same as invalid and return a valid one.
1341 */
1342 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
1343 ip_hdr(skb)->daddr, valid_foc);
1344 }
1345 return false;
1346}
1347
1348 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1349 struct sk_buff *skb,
1350 struct sk_buff *skb_synack,
1351 struct request_sock *req)
1352{
1353 struct tcp_sock *tp = tcp_sk(sk);
1354 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1355 const struct inet_request_sock *ireq = inet_rsk(req);
1356 struct sock *child;
1357 int err;
1358
1359 req->num_retrans = 0;
1360 req->num_timeout = 0;
1361 req->sk = NULL;
1362
1363 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1364 if (child == NULL) {
1365 NET_INC_STATS_BH(sock_net(sk),
1366 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1367 kfree_skb(skb_synack);
1368 return -1;
1369 }
1370 err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
1371 ireq->ir_rmt_addr, ireq->opt);
1372 err = net_xmit_eval(err);
1373 if (!err)
1374 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1375 /* XXX (TFO) - is it ok to ignore error and continue? */
1376
1377 spin_lock(&queue->fastopenq->lock);
1378 queue->fastopenq->qlen++;
1379 spin_unlock(&queue->fastopenq->lock);
1380
1381 /* Initialize the child socket. Have to fix some values to take
1382 * into account the child is a Fast Open socket and is created
1383 * only out of the bits carried in the SYN packet.
1384 */
1385 tp = tcp_sk(child);
1386
1387 tp->fastopen_rsk = req;
1388 /* Do a hold on the listner sk so that if the listener is being
1389 * closed, the child that has been accepted can live on and still
1390 * access listen_lock.
1391 */
1392 sock_hold(sk);
1393 tcp_rsk(req)->listener = sk;
1394
1395 /* RFC1323: The window in SYN & SYN/ACK segments is never
1396 * scaled. So correct it appropriately.
1397 */
1398 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1399
1400 /* Activate the retrans timer so that SYNACK can be retransmitted.
1401 * The request socket is not added to the SYN table of the parent
1402 * because it's been added to the accept queue directly.
1403 */
1404 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1405 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1406
1407 /* Add the child socket directly into the accept queue */
1408 inet_csk_reqsk_queue_add(sk, req, child);
1409
1410 /* Now finish processing the fastopen child socket. */
1411 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1412 tcp_init_congestion_control(child);
1413 tcp_mtup_init(child);
1414 tcp_init_metrics(child);
1415 tcp_init_buffer_space(child);
1416
1417 /* Queue the data carried in the SYN packet. We need to first
1418 * bump skb's refcnt because the caller will attempt to free it.
1419 *
1420 * XXX (TFO) - we honor a zero-payload TFO request for now.
1421 * (Any reason not to?)
1422 */
1423 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1424 /* Don't queue the skb if there is no payload in SYN.
1425 * XXX (TFO) - How about SYN+FIN?
1426 */
1427 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1428 } else {
1429 skb = skb_get(skb);
1430 skb_dst_drop(skb);
1431 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1432 skb_set_owner_r(skb, child);
1433 __skb_queue_tail(&child->sk_receive_queue, skb);
1434 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1435 tp->syn_data_acked = 1;
1436 }
1437 sk->sk_data_ready(sk);
1438 bh_unlock_sock(child);
1439 sock_put(child);
1440 WARN_ON(req->sk == NULL);
1441 return 0;
1442}
1443
1444 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1257 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1445{ 1258{
1446 struct tcp_options_received tmp_opt; 1259 struct tcp_options_received tmp_opt;
@@ -1451,12 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1451 __be32 saddr = ip_hdr(skb)->saddr; 1264 __be32 saddr = ip_hdr(skb)->saddr;
1452 __be32 daddr = ip_hdr(skb)->daddr; 1265 __be32 daddr = ip_hdr(skb)->daddr;
1453 __u32 isn = TCP_SKB_CB(skb)->when; 1266 __u32 isn = TCP_SKB_CB(skb)->when;
1454 bool want_cookie = false; 1267 bool want_cookie = false, fastopen;
1455 struct flowi4 fl4; 1268 struct flowi4 fl4;
1456 struct tcp_fastopen_cookie foc = { .len = -1 }; 1269 struct tcp_fastopen_cookie foc = { .len = -1 };
1457 struct tcp_fastopen_cookie valid_foc = { .len = -1 }; 1270 int err;
1458 struct sk_buff *skb_synack;
1459 int do_fastopen;
1460 1271
1461 /* Never answer to SYNs send to broadcast or multicast */ 1272 /* Never answer to SYNs send to broadcast or multicast */
1462 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1273 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1507,6 +1318,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1507 ireq->ir_rmt_addr = saddr; 1318 ireq->ir_rmt_addr = saddr;
1508 ireq->no_srccheck = inet_sk(sk)->transparent; 1319 ireq->no_srccheck = inet_sk(sk)->transparent;
1509 ireq->opt = tcp_v4_save_options(skb); 1320 ireq->opt = tcp_v4_save_options(skb);
1321 ireq->ir_mark = inet_request_mark(sk, skb);
1510 1322
1511 if (security_inet_conn_request(sk, skb, req)) 1323 if (security_inet_conn_request(sk, skb, req))
1512 goto drop_and_free; 1324 goto drop_and_free;
@@ -1555,52 +1367,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1555 1367
1556 isn = tcp_v4_init_sequence(skb); 1368 isn = tcp_v4_init_sequence(skb);
1557 } 1369 }
1558 tcp_rsk(req)->snt_isn = isn; 1370 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1559
1560 if (dst == NULL) {
1561 dst = inet_csk_route_req(sk, &fl4, req);
1562 if (dst == NULL)
1563 goto drop_and_free;
1564 }
1565 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1566
1567 /* We don't call tcp_v4_send_synack() directly because we need
1568 * to make sure a child socket can be created successfully before
1569 * sending back synack!
1570 *
1571 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1572 * (or better yet, call tcp_send_synack() in the child context
1573 * directly, but will have to fix bunch of other code first)
1574 * after syn_recv_sock() except one will need to first fix the
1575 * latter to remove its dependency on the current implementation
1576 * of tcp_v4_send_synack()->tcp_select_initial_window().
1577 */
1578 skb_synack = tcp_make_synack(sk, dst, req,
1579 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1580
1581 if (skb_synack) {
1582 __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1583 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1584 } else
1585 goto drop_and_free; 1371 goto drop_and_free;
1586 1372
1587 if (likely(!do_fastopen)) { 1373 tcp_rsk(req)->snt_isn = isn;
1588 int err; 1374 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1589 err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr, 1375 tcp_openreq_init_rwin(req, sk, dst);
1590 ireq->ir_rmt_addr, ireq->opt); 1376 fastopen = !want_cookie &&
1591 err = net_xmit_eval(err); 1377 tcp_try_fastopen(sk, skb, req, &foc, dst);
1378 err = tcp_v4_send_synack(sk, dst, req,
1379 skb_get_queue_mapping(skb), &foc);
1380 if (!fastopen) {
1592 if (err || want_cookie) 1381 if (err || want_cookie)
1593 goto drop_and_free; 1382 goto drop_and_free;
1594 1383
1595 tcp_rsk(req)->snt_synack = tcp_time_stamp; 1384 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1596 tcp_rsk(req)->listener = NULL; 1385 tcp_rsk(req)->listener = NULL;
1597 /* Add the request_sock to the SYN table */
1598 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); 1386 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1599 if (fastopen_cookie_present(&foc) && foc.len != 0) 1387 }
1600 NET_INC_STATS_BH(sock_net(sk),
1601 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1602 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1603 goto drop_and_free;
1604 1388
1605 return 0; 1389 return 0;
1606 1390
@@ -1744,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1744 return sk; 1528 return sk;
1745} 1529}
1746 1530
1747 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1748{
1749 const struct iphdr *iph = ip_hdr(skb);
1750
1751 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1752 if (!tcp_v4_check(skb->len, iph->saddr,
1753 iph->daddr, skb->csum)) {
1754 skb->ip_summed = CHECKSUM_UNNECESSARY;
1755 return 0;
1756 }
1757 }
1758
1759 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1760 skb->len, IPPROTO_TCP, 0);
1761
1762 if (skb->len <= 76) {
1763 return __skb_checksum_complete(skb);
1764 }
1765 return 0;
1766}
1767
1768
1769/* The socket must have it's spinlock held when we get 1531/* The socket must have it's spinlock held when we get
1770 * here. 1532 * here.
1771 * 1533 *
@@ -1960,7 +1722,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
1960 * Packet length and doff are validated by header prediction, 1722 * Packet length and doff are validated by header prediction,
1961 * provided case of th->doff==0 is eliminated. 1723 * provided case of th->doff==0 is eliminated.
1962 * So, we defer the checks. */ 1724 * So, we defer the checks. */
1963 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) 1725
1726 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1964 goto csum_error; 1727 goto csum_error;
1965 1728
1966 th = tcp_hdr(skb); 1729 th = tcp_hdr(skb);
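Besides folding the Fast Open request handling into tcp_try_fastopen()/tcp_v4_send_synack() and teaching tcp_v4_err() about unaccepted fastopen children, this drops the private tcp_v4_checksum_init() in favour of the generic skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo). What gets verified either way is the standard one's-complement sum over the IPv4 pseudo-header plus the TCP segment; a self-contained user-space rendition of that check (the segment bytes are a made-up example):

/* One's-complement TCP/IPv4 checksum over pseudo-header + segment. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t sum16(const uint8_t *p, size_t len, uint32_t sum)
{
    while (len > 1) {
        sum += (uint32_t)p[0] << 8 | p[1];
        p += 2;
        len -= 2;
    }
    if (len)
        sum += (uint32_t)p[0] << 8;     /* pad the odd trailing byte */
    return sum;
}

static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
                             const uint8_t *seg, size_t len)
{
    uint8_t pseudo[12];
    uint32_t sum;

    memcpy(pseudo, &saddr, 4);          /* addresses already in network order */
    memcpy(pseudo + 4, &daddr, 4);
    pseudo[8] = 0;
    pseudo[9] = 6;                      /* IPPROTO_TCP */
    pseudo[10] = len >> 8;              /* TCP length: header + payload */
    pseudo[11] = len & 0xff;

    sum = sum16(seg, len, sum16(pseudo, sizeof(pseudo), 0));
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);   /* fold carries */
    return (uint16_t)~sum;
}

int main(void)
{
    /* sport 1234, dport 80, doff 5, ACK flag, everything else zero */
    uint8_t seg[20] = { [0] = 0x04, [1] = 0xd2, [3] = 0x50,
                        [12] = 0x50, [13] = 0x10 };
    uint32_t s = inet_addr("192.0.2.1"), d = inet_addr("192.0.2.2");
    uint16_t csum = tcp_checksum(s, d, seg, sizeof(seg));

    seg[16] = csum >> 8;                /* checksum field is at offset 16 */
    seg[17] = csum & 0xff;
    printf("computed 0x%04x, reverify -> 0x%04x (0 means ok)\n",
           csum, tcp_checksum(s, d, seg, sizeof(seg)));
    return 0;
}
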
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index c9aecae31327..1e70fa8fa793 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk)
115 * Will only call newReno CA when away from inference. 115 * Will only call newReno CA when away from inference.
116 * From TCP-LP's paper, this will be handled in additive increasement. 116 * From TCP-LP's paper, this will be handled in additive increasement.
117 */ 117 */
118 static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked, 118 static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
119 u32 in_flight)
120{ 119{
121 struct lp *lp = inet_csk_ca(sk); 120 struct lp *lp = inet_csk_ca(sk);
122 121
123 if (!(lp->flag & LP_WITHIN_INF)) 122 if (!(lp->flag & LP_WITHIN_INF))
124 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 123 tcp_reno_cong_avoid(sk, ack, acked);
125} 124}
126 125
127/** 126/**
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index dcaf72f10216..4fe041805989 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1159,10 +1159,7 @@ static void __net_exit tcp_net_metrics_exit(struct net *net)
1159 tm = next; 1159 tm = next;
1160 } 1160 }
1161 } 1161 }
1162 if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) 1162 kvfree(net->ipv4.tcp_metrics_hash);
1163 vfree(net->ipv4.tcp_metrics_hash);
1164 else
1165 kfree(net->ipv4.tcp_metrics_hash);
1166} 1163}
1167 1164
1168static __net_initdata struct pernet_operations tcp_net_metrics_ops = { 1165static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
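tcp_net_metrics_exit() is a textbook use of the new kvfree() helper: the is_vmalloc_addr()/vfree()/kfree() dance collapses into a single call. A sketch of the general pattern (demo_* names are illustrative), matching the usual kmalloc-with-vmalloc-fallback allocation such hash tables use:

#include <linux/mm.h>		/* kvfree() */
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *demo_table_alloc(size_t size)
{
	/* Try contiguous memory first, quietly fall back to vmalloc. */
	void *p = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

	if (!p)
		p = vmalloc(size);
	return p;
}

static void demo_table_free(void *p)
{
	kvfree(p);	/* correct for both kmalloc()ed and vmalloc()ed memory */
}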
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 05c1b155251d..e68e0d4af6c9 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -362,6 +362,37 @@ void tcp_twsk_destructor(struct sock *sk)
362} 362}
363EXPORT_SYMBOL_GPL(tcp_twsk_destructor); 363EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
364 364
365void tcp_openreq_init_rwin(struct request_sock *req,
366 struct sock *sk, struct dst_entry *dst)
367{
368 struct inet_request_sock *ireq = inet_rsk(req);
369 struct tcp_sock *tp = tcp_sk(sk);
370 __u8 rcv_wscale;
371 int mss = dst_metric_advmss(dst);
372
373 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
374 mss = tp->rx_opt.user_mss;
375
376 /* Set this up on the first call only */
377 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
378
379 /* limit the window selection if the user enforce a smaller rx buffer */
380 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
381 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
382 req->window_clamp = tcp_full_space(sk);
383
384 /* tcp_full_space because it is guaranteed to be the first packet */
385 tcp_select_initial_window(tcp_full_space(sk),
386 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
387 &req->rcv_wnd,
388 &req->window_clamp,
389 ireq->wscale_ok,
390 &rcv_wscale,
391 dst_metric(dst, RTAX_INITRWND));
392 ireq->rcv_wscale = rcv_wscale;
393}
394EXPORT_SYMBOL(tcp_openreq_init_rwin);
395
365static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, 396static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
366 struct request_sock *req) 397 struct request_sock *req)
367{ 398{
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index b92b81718ca4..4e86c59ec7f7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -57,10 +57,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
57 SKB_GSO_TCP_ECN | 57 SKB_GSO_TCP_ECN |
58 SKB_GSO_TCPV6 | 58 SKB_GSO_TCPV6 |
59 SKB_GSO_GRE | 59 SKB_GSO_GRE |
60 SKB_GSO_GRE_CSUM |
60 SKB_GSO_IPIP | 61 SKB_GSO_IPIP |
61 SKB_GSO_SIT | 62 SKB_GSO_SIT |
62 SKB_GSO_MPLS | 63 SKB_GSO_MPLS |
63 SKB_GSO_UDP_TUNNEL | 64 SKB_GSO_UDP_TUNNEL |
65 SKB_GSO_UDP_TUNNEL_CSUM |
64 0) || 66 0) ||
65 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) 67 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
66 goto out; 68 goto out;
@@ -97,9 +99,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
97 th->check = newcheck; 99 th->check = newcheck;
98 100
99 if (skb->ip_summed != CHECKSUM_PARTIAL) 101 if (skb->ip_summed != CHECKSUM_PARTIAL)
100 th->check = 102 th->check = gso_make_checksum(skb, ~th->check);
101 csum_fold(csum_partial(skb_transport_header(skb),
102 thlen, skb->csum));
103 103
104 seq += mss; 104 seq += mss;
105 if (copy_destructor) { 105 if (copy_destructor) {
@@ -133,8 +133,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
133 th->check = ~csum_fold((__force __wsum)((__force u32)th->check + 133 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
134 (__force u32)delta)); 134 (__force u32)delta));
135 if (skb->ip_summed != CHECKSUM_PARTIAL) 135 if (skb->ip_summed != CHECKSUM_PARTIAL)
136 th->check = csum_fold(csum_partial(skb_transport_header(skb), 136 th->check = gso_make_checksum(skb, ~th->check);
137 thlen, skb->csum));
138out: 137out:
139 return segs; 138 return segs;
140} 139}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2d340bd2cd3d..d92bce0ea24e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
627 if (unlikely(!ireq->tstamp_ok)) 627 if (unlikely(!ireq->tstamp_ok))
628 remaining -= TCPOLEN_SACKPERM_ALIGNED; 628 remaining -= TCPOLEN_SACKPERM_ALIGNED;
629 } 629 }
630 if (foc != NULL) { 630 if (foc != NULL && foc->len >= 0) {
631 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; 631 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
632 need = (need + 3) & ~3U; /* Align to 32 bits */ 632 need = (need + 3) & ~3U; /* Align to 32 bits */
633 if (remaining >= need) { 633 if (remaining >= need) {
@@ -878,15 +878,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
878 BUG_ON(!skb || !tcp_skb_pcount(skb)); 878 BUG_ON(!skb || !tcp_skb_pcount(skb));
879 879
880 if (clone_it) { 880 if (clone_it) {
881 const struct sk_buff *fclone = skb + 1;
882
883 skb_mstamp_get(&skb->skb_mstamp); 881 skb_mstamp_get(&skb->skb_mstamp);
884 882
885 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
886 fclone->fclone == SKB_FCLONE_CLONE))
887 NET_INC_STATS(sock_net(sk),
888 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
889
890 if (unlikely(skb_cloned(skb))) 883 if (unlikely(skb_cloned(skb)))
891 skb = pskb_copy(skb, gfp_mask); 884 skb = pskb_copy(skb, gfp_mask);
892 else 885 else
@@ -1081,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
1081 * Remember, these are still headerless SKBs at this point. 1074 * Remember, these are still headerless SKBs at this point.
1082 */ 1075 */
1083int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, 1076int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1084 unsigned int mss_now) 1077 unsigned int mss_now, gfp_t gfp)
1085{ 1078{
1086 struct tcp_sock *tp = tcp_sk(sk); 1079 struct tcp_sock *tp = tcp_sk(sk);
1087 struct sk_buff *buff; 1080 struct sk_buff *buff;
@@ -1096,11 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1096 if (nsize < 0) 1089 if (nsize < 0)
1097 nsize = 0; 1090 nsize = 0;
1098 1091
1099 if (skb_unclone(skb, GFP_ATOMIC)) 1092 if (skb_unclone(skb, gfp))
1100 return -ENOMEM; 1093 return -ENOMEM;
1101 1094
1102 /* Get a new skb... force flag on. */ 1095 /* Get a new skb... force flag on. */
1103 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); 1096 buff = sk_stream_alloc_skb(sk, nsize, gfp);
1104 if (buff == NULL) 1097 if (buff == NULL)
1105 return -ENOMEM; /* We'll just try again later. */ 1098 return -ENOMEM; /* We'll just try again later. */
1106 1099
@@ -1387,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)
1387 return mss_now; 1380 return mss_now;
1388} 1381}
1389 1382
1390/* Congestion window validation. (RFC2861) */ 1383/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
1391static void tcp_cwnd_validate(struct sock *sk) 1384 * As additional protections, we do not touch cwnd in retransmission phases,
1385 * and if application hit its sndbuf limit recently.
1386 */
1387static void tcp_cwnd_application_limited(struct sock *sk)
1388{
1389 struct tcp_sock *tp = tcp_sk(sk);
1390
1391 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1392 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1393 /* Limited by application or receiver window. */
1394 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
1395 u32 win_used = max(tp->snd_cwnd_used, init_win);
1396 if (win_used < tp->snd_cwnd) {
1397 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1398 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
1399 }
1400 tp->snd_cwnd_used = 0;
1401 }
1402 tp->snd_cwnd_stamp = tcp_time_stamp;
1403}
1404
1405static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1392{ 1406{
1393 struct tcp_sock *tp = tcp_sk(sk); 1407 struct tcp_sock *tp = tcp_sk(sk);
1394 1408
1395 if (tp->packets_out >= tp->snd_cwnd) { 1409 /* Track the maximum number of outstanding packets in each
1410 * window, and remember whether we were cwnd-limited then.
1411 */
1412 if (!before(tp->snd_una, tp->max_packets_seq) ||
1413 tp->packets_out > tp->max_packets_out) {
1414 tp->max_packets_out = tp->packets_out;
1415 tp->max_packets_seq = tp->snd_nxt;
1416 tp->is_cwnd_limited = is_cwnd_limited;
1417 }
1418
1419 if (tcp_is_cwnd_limited(sk)) {
1396 /* Network is feed fully. */ 1420 /* Network is feed fully. */
1397 tp->snd_cwnd_used = 0; 1421 tp->snd_cwnd_used = 0;
1398 tp->snd_cwnd_stamp = tcp_time_stamp; 1422 tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1601,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1601 1625
1602 /* All of a TSO frame must be composed of paged data. */ 1626 /* All of a TSO frame must be composed of paged data. */
1603 if (skb->len != skb->data_len) 1627 if (skb->len != skb->data_len)
1604 return tcp_fragment(sk, skb, len, mss_now); 1628 return tcp_fragment(sk, skb, len, mss_now, gfp);
1605 1629
1606 buff = sk_stream_alloc_skb(sk, 0, gfp); 1630 buff = sk_stream_alloc_skb(sk, 0, gfp);
1607 if (unlikely(buff == NULL)) 1631 if (unlikely(buff == NULL))
@@ -1644,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1644 * 1668 *
1645 * This algorithm is from John Heffner. 1669 * This algorithm is from John Heffner.
1646 */ 1670 */
1647static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) 1671static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1672 bool *is_cwnd_limited)
1648{ 1673{
1649 struct tcp_sock *tp = tcp_sk(sk); 1674 struct tcp_sock *tp = tcp_sk(sk);
1650 const struct inet_connection_sock *icsk = inet_csk(sk); 1675 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1708,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1708 if (!tp->tso_deferred) 1733 if (!tp->tso_deferred)
1709 tp->tso_deferred = 1 | (jiffies << 1); 1734 tp->tso_deferred = 1 | (jiffies << 1);
1710 1735
1736 if (cong_win < send_win && cong_win < skb->len)
1737 *is_cwnd_limited = true;
1738
1711 return true; 1739 return true;
1712 1740
1713send_now: 1741send_now:
@@ -1868,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1868 unsigned int tso_segs, sent_pkts; 1896 unsigned int tso_segs, sent_pkts;
1869 int cwnd_quota; 1897 int cwnd_quota;
1870 int result; 1898 int result;
1899 bool is_cwnd_limited = false;
1871 1900
1872 sent_pkts = 0; 1901 sent_pkts = 0;
1873 1902
@@ -1892,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1892 1921
1893 cwnd_quota = tcp_cwnd_test(tp, skb); 1922 cwnd_quota = tcp_cwnd_test(tp, skb);
1894 if (!cwnd_quota) { 1923 if (!cwnd_quota) {
1924 is_cwnd_limited = true;
1895 if (push_one == 2) 1925 if (push_one == 2)
1896 /* Force out a loss probe pkt. */ 1926 /* Force out a loss probe pkt. */
1897 cwnd_quota = 1; 1927 cwnd_quota = 1;
@@ -1908,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1908 nonagle : TCP_NAGLE_PUSH)))) 1938 nonagle : TCP_NAGLE_PUSH))))
1909 break; 1939 break;
1910 } else { 1940 } else {
1911 if (!push_one && tcp_tso_should_defer(sk, skb)) 1941 if (!push_one &&
1942 tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
1912 break; 1943 break;
1913 } 1944 }
1914 1945
@@ -1973,7 +2004,7 @@ repair:
1973 /* Send one loss probe per tail loss episode. */ 2004 /* Send one loss probe per tail loss episode. */
1974 if (push_one != 2) 2005 if (push_one != 2)
1975 tcp_schedule_loss_probe(sk); 2006 tcp_schedule_loss_probe(sk);
1976 tcp_cwnd_validate(sk); 2007 tcp_cwnd_validate(sk, is_cwnd_limited);
1977 return false; 2008 return false;
1978 } 2009 }
1979 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); 2010 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -2037,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2037 return true; 2068 return true;
2038} 2069}
2039 2070
2071/* Thanks to skb fast clones, we can detect if a prior transmit of
2072 * a packet is still in a qdisc or driver queue.
2073 * In this case, there is very little point doing a retransmit !
2074 * Note: This is called from BH context only.
2075 */
2076static bool skb_still_in_host_queue(const struct sock *sk,
2077 const struct sk_buff *skb)
2078{
2079 const struct sk_buff *fclone = skb + 1;
2080
2081 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
2082 fclone->fclone == SKB_FCLONE_CLONE)) {
2083 NET_INC_STATS_BH(sock_net(sk),
2084 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2085 return true;
2086 }
2087 return false;
2088}
2089
2040/* When probe timeout (PTO) fires, send a new segment if one exists, else 2090/* When probe timeout (PTO) fires, send a new segment if one exists, else
2041 * retransmit the last segment. 2091 * retransmit the last segment.
2042 */ 2092 */
@@ -2062,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk)
2062 if (WARN_ON(!skb)) 2112 if (WARN_ON(!skb))
2063 goto rearm_timer; 2113 goto rearm_timer;
2064 2114
2115 if (skb_still_in_host_queue(sk, skb))
2116 goto rearm_timer;
2117
2065 pcount = tcp_skb_pcount(skb); 2118 pcount = tcp_skb_pcount(skb);
2066 if (WARN_ON(!pcount)) 2119 if (WARN_ON(!pcount))
2067 goto rearm_timer; 2120 goto rearm_timer;
2068 2121
2069 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { 2122 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2070 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) 2123 if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
2124 GFP_ATOMIC)))
2071 goto rearm_timer; 2125 goto rearm_timer;
2072 skb = tcp_write_queue_tail(sk); 2126 skb = tcp_write_queue_tail(sk);
2073 } 2127 }
@@ -2075,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk)
2075 if (WARN_ON(!skb || !tcp_skb_pcount(skb))) 2129 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2076 goto rearm_timer; 2130 goto rearm_timer;
2077 2131
2078 /* Probe with zero data doesn't trigger fast recovery. */ 2132 err = __tcp_retransmit_skb(sk, skb);
2079 if (skb->len > 0)
2080 err = __tcp_retransmit_skb(sk, skb);
2081 2133
2082 /* Record snd_nxt for loss detection. */ 2134 /* Record snd_nxt for loss detection. */
2083 if (likely(!err)) 2135 if (likely(!err))
@@ -2383,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2383 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) 2435 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
2384 return -EAGAIN; 2436 return -EAGAIN;
2385 2437
2438 if (skb_still_in_host_queue(sk, skb))
2439 return -EBUSY;
2440
2386 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { 2441 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
2387 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) 2442 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
2388 BUG(); 2443 BUG();
@@ -2405,7 +2460,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2405 return -EAGAIN; 2460 return -EAGAIN;
2406 2461
2407 if (skb->len > cur_mss) { 2462 if (skb->len > cur_mss) {
2408 if (tcp_fragment(sk, skb, cur_mss, cur_mss)) 2463 if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
2409 return -ENOMEM; /* We'll try again later. */ 2464 return -ENOMEM; /* We'll try again later. */
2410 } else { 2465 } else {
2411 int oldpcount = tcp_skb_pcount(skb); 2466 int oldpcount = tcp_skb_pcount(skb);
@@ -2476,7 +2531,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2476 * see tcp_input.c tcp_sacktag_write_queue(). 2531 * see tcp_input.c tcp_sacktag_write_queue().
2477 */ 2532 */
2478 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; 2533 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
2479 } else { 2534 } else if (err != -EBUSY) {
2480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); 2535 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2481 } 2536 }
2482 return err; 2537 return err;
@@ -2754,27 +2809,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2754 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2809 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2755 mss = tp->rx_opt.user_mss; 2810 mss = tp->rx_opt.user_mss;
2756 2811
2757 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2758 __u8 rcv_wscale;
2759 /* Set this up on the first call only */
2760 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2761
2762 /* limit the window selection if the user enforce a smaller rx buffer */
2763 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2764 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2765 req->window_clamp = tcp_full_space(sk);
2766
2767 /* tcp_full_space because it is guaranteed to be the first packet */
2768 tcp_select_initial_window(tcp_full_space(sk),
2769 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2770 &req->rcv_wnd,
2771 &req->window_clamp,
2772 ireq->wscale_ok,
2773 &rcv_wscale,
2774 dst_metric(dst, RTAX_INITRWND));
2775 ireq->rcv_wscale = rcv_wscale;
2776 }
2777
2778 memset(&opts, 0, sizeof(opts)); 2812 memset(&opts, 0, sizeof(opts));
2779#ifdef CONFIG_SYN_COOKIES 2813#ifdef CONFIG_SYN_COOKIES
2780 if (unlikely(req->cookie_ts)) 2814 if (unlikely(req->cookie_ts))
@@ -3207,7 +3241,7 @@ int tcp_write_wakeup(struct sock *sk)
3207 skb->len > mss) { 3241 skb->len > mss) {
3208 seg_size = min(seg_size, mss); 3242 seg_size = min(seg_size, mss);
3209 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 3243 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3210 if (tcp_fragment(sk, skb, seg_size, mss)) 3244 if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
3211 return -1; 3245 return -1;
3212 } else if (!tcp_skb_pcount(skb)) 3246 } else if (!tcp_skb_pcount(skb))
3213 tcp_set_skb_tso_segs(sk, skb, mss); 3247 tcp_set_skb_tso_segs(sk, skb, mss);
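The tcp_output.c changes hang together: tcp_fragment()/tso_fragment() gain a gfp argument, loss probes reuse __tcp_retransmit_skb() and skip skbs still sitting in a qdisc or driver queue (skb_still_in_host_queue(), moved here from tcp_transmit_skb()), and cwnd validation becomes measurement-based via max_packets_out/max_packets_seq/is_cwnd_limited. The companion net/tcp.h helper that the congestion modules now call is not shown in this hunk; reconstructed from the surrounding logic it is roughly the following (treat it as a sketch, not the verbatim header change):

/* Sketch of the sk-only tcp_is_cwnd_limited(), as implied by the
 * tcp_cwnd_validate() rewrite above; demo_ prefix marks it as illustrative.
 */
static inline bool demo_tcp_is_cwnd_limited(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* In slow start, keep growing until cwnd is clearly ahead of the
	 * largest flight we have actually put on the wire.
	 */
	if (tp->snd_cwnd <= tp->snd_ssthresh)
		return tp->snd_cwnd < 2 * tp->max_packets_out;

	/* Otherwise trust the per-window measurement recorded in
	 * tcp_cwnd_validate()/tcp_tso_should_defer().
	 */
	return tp->is_cwnd_limited;
}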
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 0ac50836da4d..8250949b8853 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,12 +15,11 @@
15#define TCP_SCALABLE_AI_CNT 50U 15#define TCP_SCALABLE_AI_CNT 50U
16#define TCP_SCALABLE_MD_SCALE 3 16#define TCP_SCALABLE_MD_SCALE 3
17 17
18static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked, 18static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
19 u32 in_flight)
20{ 19{
21 struct tcp_sock *tp = tcp_sk(sk); 20 struct tcp_sock *tp = tcp_sk(sk);
22 21
23 if (!tcp_is_cwnd_limited(sk, in_flight)) 22 if (!tcp_is_cwnd_limited(sk))
24 return; 23 return;
25 24
26 if (tp->snd_cwnd <= tp->snd_ssthresh) 25 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 48539fff6357..9a5e05f27f4f 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,14 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
163 return min(tp->snd_ssthresh, tp->snd_cwnd-1); 163 return min(tp->snd_ssthresh, tp->snd_cwnd-1);
164} 164}
165 165
166static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, 166static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
167 u32 in_flight)
168{ 167{
169 struct tcp_sock *tp = tcp_sk(sk); 168 struct tcp_sock *tp = tcp_sk(sk);
170 struct vegas *vegas = inet_csk_ca(sk); 169 struct vegas *vegas = inet_csk_ca(sk);
171 170
172 if (!vegas->doing_vegas_now) { 171 if (!vegas->doing_vegas_now) {
173 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 172 tcp_reno_cong_avoid(sk, ack, acked);
174 return; 173 return;
175 } 174 }
176 175
@@ -195,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
195 /* We don't have enough RTT samples to do the Vegas 194 /* We don't have enough RTT samples to do the Vegas
196 * calculation, so we'll behave like Reno. 195 * calculation, so we'll behave like Reno.
197 */ 196 */
198 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 197 tcp_reno_cong_avoid(sk, ack, acked);
199 } else { 198 } else {
200 u32 rtt, diff; 199 u32 rtt, diff;
201 u64 target_cwnd; 200 u64 target_cwnd;
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 1b8e28fcd7e1..27b9825753d1 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,19 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
114 tcp_veno_init(sk); 114 tcp_veno_init(sk);
115} 115}
116 116
117static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked, 117static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
118 u32 in_flight)
119{ 118{
120 struct tcp_sock *tp = tcp_sk(sk); 119 struct tcp_sock *tp = tcp_sk(sk);
121 struct veno *veno = inet_csk_ca(sk); 120 struct veno *veno = inet_csk_ca(sk);
122 121
123 if (!veno->doing_veno_now) { 122 if (!veno->doing_veno_now) {
124 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 123 tcp_reno_cong_avoid(sk, ack, acked);
125 return; 124 return;
126 } 125 }
127 126
128 /* limited by applications */ 127 /* limited by applications */
129 if (!tcp_is_cwnd_limited(sk, in_flight)) 128 if (!tcp_is_cwnd_limited(sk))
130 return; 129 return;
131 130
132 /* We do the Veno calculations only if we got enough rtt samples */ 131 /* We do the Veno calculations only if we got enough rtt samples */
@@ -134,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
134 /* We don't have enough rtt samples to do the Veno 133 /* We don't have enough rtt samples to do the Veno
135 * calculation, so we'll behave like Reno. 134 * calculation, so we'll behave like Reno.
136 */ 135 */
137 tcp_reno_cong_avoid(sk, ack, acked, in_flight); 136 tcp_reno_cong_avoid(sk, ack, acked);
138 } else { 137 } else {
139 u64 target_cwnd; 138 u64 target_cwnd;
140 u32 rtt; 139 u32 rtt;
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 5ede0e727945..599b79b8eac0 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -69,13 +69,12 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
69 tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); 69 tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
70} 70}
71 71
72static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked, 72static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
73 u32 in_flight)
74{ 73{
75 struct tcp_sock *tp = tcp_sk(sk); 74 struct tcp_sock *tp = tcp_sk(sk);
76 struct yeah *yeah = inet_csk_ca(sk); 75 struct yeah *yeah = inet_csk_ca(sk);
77 76
78 if (!tcp_is_cwnd_limited(sk, in_flight)) 77 if (!tcp_is_cwnd_limited(sk))
79 return; 78 return;
80 79
81 if (tp->snd_cwnd <= tp->snd_ssthresh) 80 if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4468e1adc094..185ed3e59802 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
246 do { 246 do {
247 if (low <= snum && snum <= high && 247 if (low <= snum && snum <= high &&
248 !test_bit(snum >> udptable->log, bitmap) && 248 !test_bit(snum >> udptable->log, bitmap) &&
249 !inet_is_reserved_local_port(snum)) 249 !inet_is_local_reserved_port(net, snum))
250 goto found; 250 goto found;
251 snum += rand; 251 snum += rand;
252 } while (snum != first); 252 } while (snum != first);
@@ -727,13 +727,12 @@ EXPORT_SYMBOL(udp_flush_pending_frames);
727void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) 727void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
728{ 728{
729 struct udphdr *uh = udp_hdr(skb); 729 struct udphdr *uh = udp_hdr(skb);
730 struct sk_buff *frags = skb_shinfo(skb)->frag_list;
731 int offset = skb_transport_offset(skb); 730 int offset = skb_transport_offset(skb);
732 int len = skb->len - offset; 731 int len = skb->len - offset;
733 int hlen = len; 732 int hlen = len;
734 __wsum csum = 0; 733 __wsum csum = 0;
735 734
736 if (!frags) { 735 if (!skb_has_frag_list(skb)) {
737 /* 736 /*
738 * Only one fragment on the socket. 737 * Only one fragment on the socket.
739 */ 738 */
@@ -742,15 +741,17 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
742 uh->check = ~csum_tcpudp_magic(src, dst, len, 741 uh->check = ~csum_tcpudp_magic(src, dst, len,
743 IPPROTO_UDP, 0); 742 IPPROTO_UDP, 0);
744 } else { 743 } else {
744 struct sk_buff *frags;
745
745 /* 746 /*
746 * HW-checksum won't work as there are two or more 747 * HW-checksum won't work as there are two or more
747 * fragments on the socket so that all csums of sk_buffs 748 * fragments on the socket so that all csums of sk_buffs
748 * should be together 749 * should be together
749 */ 750 */
750 do { 751 skb_walk_frags(skb, frags) {
751 csum = csum_add(csum, frags->csum); 752 csum = csum_add(csum, frags->csum);
752 hlen -= frags->len; 753 hlen -= frags->len;
753 } while ((frags = frags->next)); 754 }
754 755
755 csum = skb_checksum(skb, offset, hlen, csum); 756 csum = skb_checksum(skb, offset, hlen, csum);
756 skb->ip_summed = CHECKSUM_NONE; 757 skb->ip_summed = CHECKSUM_NONE;
@@ -762,6 +763,43 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
762} 763}
763EXPORT_SYMBOL_GPL(udp4_hwcsum); 764EXPORT_SYMBOL_GPL(udp4_hwcsum);
764 765
766/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
767 * for the simple case like when setting the checksum for a UDP tunnel.
768 */
769void udp_set_csum(bool nocheck, struct sk_buff *skb,
770 __be32 saddr, __be32 daddr, int len)
771{
772 struct udphdr *uh = udp_hdr(skb);
773
774 if (nocheck)
775 uh->check = 0;
776 else if (skb_is_gso(skb))
777 uh->check = ~udp_v4_check(len, saddr, daddr, 0);
778 else if (skb_dst(skb) && skb_dst(skb)->dev &&
779 (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
780
781 BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
782
783 skb->ip_summed = CHECKSUM_PARTIAL;
784 skb->csum_start = skb_transport_header(skb) - skb->head;
785 skb->csum_offset = offsetof(struct udphdr, check);
786 uh->check = ~udp_v4_check(len, saddr, daddr, 0);
787 } else {
788 __wsum csum;
789
790 BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
791
792 uh->check = 0;
793 csum = skb_checksum(skb, 0, len, 0);
794 uh->check = udp_v4_check(len, saddr, daddr, csum);
795 if (uh->check == 0)
796 uh->check = CSUM_MANGLED_0;
797
798 skb->ip_summed = CHECKSUM_UNNECESSARY;
799 }
800}
801EXPORT_SYMBOL(udp_set_csum);
802
765static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) 803static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
766{ 804{
767 struct sock *sk = skb->sk; 805 struct sock *sk = skb->sk;
@@ -785,7 +823,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
785 if (is_udplite) /* UDP-Lite */ 823 if (is_udplite) /* UDP-Lite */
786 csum = udplite_csum(skb); 824 csum = udplite_csum(skb);
787 825
788 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ 826 else if (sk->sk_no_check_tx) { /* UDP csum disabled */
789 827
790 skb->ip_summed = CHECKSUM_NONE; 828 skb->ip_summed = CHECKSUM_NONE;
791 goto send; 829 goto send;
@@ -1495,6 +1533,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1495 if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { 1533 if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) {
1496 int ret; 1534 int ret;
1497 1535
1536 /* Verify checksum before giving to encap */
1537 if (udp_lib_checksum_complete(skb))
1538 goto csum_error;
1539
1498 ret = encap_rcv(sk, skb); 1540 ret = encap_rcv(sk, skb);
1499 if (ret <= 0) { 1541 if (ret <= 0) {
1500 UDP_INC_STATS_BH(sock_net(sk), 1542 UDP_INC_STATS_BH(sock_net(sk),
@@ -1672,7 +1714,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1672static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, 1714static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1673 int proto) 1715 int proto)
1674{ 1716{
1675 const struct iphdr *iph;
1676 int err; 1717 int err;
1677 1718
1678 UDP_SKB_CB(skb)->partial_cov = 0; 1719 UDP_SKB_CB(skb)->partial_cov = 0;
@@ -1684,22 +1725,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1684 return err; 1725 return err;
1685 } 1726 }
1686 1727
1687 iph = ip_hdr(skb); 1728 return skb_checksum_init_zero_check(skb, proto, uh->check,
1688 if (uh->check == 0) { 1729 inet_compute_pseudo);
1689 skb->ip_summed = CHECKSUM_UNNECESSARY;
1690 } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
1691 if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
1692 proto, skb->csum))
1693 skb->ip_summed = CHECKSUM_UNNECESSARY;
1694 }
1695 if (!skb_csum_unnecessary(skb))
1696 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1697 skb->len, proto, 0);
1698 /* Probably, we should checksum udp header (it should be in cache
1699 * in any case) and data in tiny packets (< rx copybreak).
1700 */
1701
1702 return 0;
1703} 1730}
1704 1731
1705/* 1732/*
@@ -1886,7 +1913,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
1886 unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); 1913 unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
1887 unsigned int slot2 = hash2 & udp_table.mask; 1914 unsigned int slot2 = hash2 & udp_table.mask;
1888 struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; 1915 struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
1889 INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr) 1916 INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
1890 const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); 1917 const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
1891 1918
1892 rcu_read_lock(); 1919 rcu_read_lock();
@@ -1979,7 +2006,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1979 int (*push_pending_frames)(struct sock *)) 2006 int (*push_pending_frames)(struct sock *))
1980{ 2007{
1981 struct udp_sock *up = udp_sk(sk); 2008 struct udp_sock *up = udp_sk(sk);
1982 int val; 2009 int val, valbool;
1983 int err = 0; 2010 int err = 0;
1984 int is_udplite = IS_UDPLITE(sk); 2011 int is_udplite = IS_UDPLITE(sk);
1985 2012
@@ -1989,6 +2016,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1989 if (get_user(val, (int __user *)optval)) 2016 if (get_user(val, (int __user *)optval))
1990 return -EFAULT; 2017 return -EFAULT;
1991 2018
2019 valbool = val ? 1 : 0;
2020
1992 switch (optname) { 2021 switch (optname) {
1993 case UDP_CORK: 2022 case UDP_CORK:
1994 if (val != 0) { 2023 if (val != 0) {
@@ -2018,6 +2047,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
2018 } 2047 }
2019 break; 2048 break;
2020 2049
2050 case UDP_NO_CHECK6_TX:
2051 up->no_check6_tx = valbool;
2052 break;
2053
2054 case UDP_NO_CHECK6_RX:
2055 up->no_check6_rx = valbool;
2056 break;
2057
2021 /* 2058 /*
2022 * UDP-Lite's partial checksum coverage (RFC 3828). 2059 * UDP-Lite's partial checksum coverage (RFC 3828).
2023 */ 2060 */
@@ -2100,6 +2137,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
2100 val = up->encap_type; 2137 val = up->encap_type;
2101 break; 2138 break;
2102 2139
2140 case UDP_NO_CHECK6_TX:
2141 val = up->no_check6_tx;
2142 break;
2143
2144 case UDP_NO_CHECK6_RX:
2145 val = up->no_check6_rx;
2146 break;
2147
2103 /* The following two cannot be changed on UDP sockets, the return is 2148 /* The following two cannot be changed on UDP sockets, the return is
2104 * always 0 (which corresponds to the full checksum coverage of UDP). */ 2149 * always 0 (which corresponds to the full checksum coverage of UDP). */
2105 case UDPLITE_SEND_CSCOV: 2150 case UDPLITE_SEND_CSCOV:
@@ -2484,7 +2529,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2484 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); 2529 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
2485 __be16 protocol = skb->protocol; 2530 __be16 protocol = skb->protocol;
2486 netdev_features_t enc_features; 2531 netdev_features_t enc_features;
2487 int outer_hlen; 2532 int udp_offset, outer_hlen;
2533 unsigned int oldlen;
2534 bool need_csum;
2535
2536 oldlen = (u16)~skb->len;
2488 2537
2489 if (unlikely(!pskb_may_pull(skb, tnl_hlen))) 2538 if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
2490 goto out; 2539 goto out;
@@ -2496,6 +2545,10 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2496 skb->mac_len = skb_inner_network_offset(skb); 2545 skb->mac_len = skb_inner_network_offset(skb);
2497 skb->protocol = htons(ETH_P_TEB); 2546 skb->protocol = htons(ETH_P_TEB);
2498 2547
2548 need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
2549 if (need_csum)
2550 skb->encap_hdr_csum = 1;
2551
2499 /* segment inner packet. */ 2552 /* segment inner packet. */
2500 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); 2553 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
2501 segs = skb_mac_gso_segment(skb, enc_features); 2554 segs = skb_mac_gso_segment(skb, enc_features);
@@ -2506,10 +2559,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2506 } 2559 }
2507 2560
2508 outer_hlen = skb_tnl_header_len(skb); 2561 outer_hlen = skb_tnl_header_len(skb);
2562 udp_offset = outer_hlen - tnl_hlen;
2509 skb = segs; 2563 skb = segs;
2510 do { 2564 do {
2511 struct udphdr *uh; 2565 struct udphdr *uh;
2512 int udp_offset = outer_hlen - tnl_hlen; 2566 int len;
2513 2567
2514 skb_reset_inner_headers(skb); 2568 skb_reset_inner_headers(skb);
2515 skb->encapsulation = 1; 2569 skb->encapsulation = 1;
@@ -2520,31 +2574,20 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2520 skb_reset_mac_header(skb); 2574 skb_reset_mac_header(skb);
2521 skb_set_network_header(skb, mac_len); 2575 skb_set_network_header(skb, mac_len);
2522 skb_set_transport_header(skb, udp_offset); 2576 skb_set_transport_header(skb, udp_offset);
2577 len = skb->len - udp_offset;
2523 uh = udp_hdr(skb); 2578 uh = udp_hdr(skb);
2524 uh->len = htons(skb->len - udp_offset); 2579 uh->len = htons(len);
2525
2526 /* csum segment if tunnel sets skb with csum. */
2527 if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) {
2528 struct iphdr *iph = ip_hdr(skb);
2529 2580
2530 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, 2581 if (need_csum) {
2531 skb->len - udp_offset, 2582 __be32 delta = htonl(oldlen + len);
2532 IPPROTO_UDP, 0);
2533 uh->check = csum_fold(skb_checksum(skb, udp_offset,
2534 skb->len - udp_offset, 0));
2535 if (uh->check == 0)
2536 uh->check = CSUM_MANGLED_0;
2537 2583
2538 } else if (protocol == htons(ETH_P_IPV6)) { 2584 uh->check = ~csum_fold((__force __wsum)
2539 struct ipv6hdr *ipv6h = ipv6_hdr(skb); 2585 ((__force u32)uh->check +
2540 u32 len = skb->len - udp_offset; 2586 (__force u32)delta));
2587 uh->check = gso_make_checksum(skb, ~uh->check);
2541 2588
2542 uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
2543 len, IPPROTO_UDP, 0);
2544 uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));
2545 if (uh->check == 0) 2589 if (uh->check == 0)
2546 uh->check = CSUM_MANGLED_0; 2590 uh->check = CSUM_MANGLED_0;
2547 skb->ip_summed = CHECKSUM_NONE;
2548 } 2591 }
2549 2592
2550 skb->protocol = protocol; 2593 skb->protocol = protocol;
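Besides the checksum-helper consolidation (udp_set_csum(), skb_checksum_init_zero_check(), skb_walk_frags()), the udp.c hunks expose two new socket options, UDP_NO_CHECK6_TX and UDP_NO_CHECK6_RX, for tunnel endpoints that want RFC 6935/6936-style zero IPv6 UDP checksums. A userspace sketch follows; the option values are taken from include/uapi/linux/udp.h as of this merge, so define them locally if your libc headers predate it:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>

#ifndef UDP_NO_CHECK6_TX
#define UDP_NO_CHECK6_TX 101	/* transmit with a zero UDP6 checksum */
#define UDP_NO_CHECK6_RX 102	/* accept zero UDP6 checksums on receive */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Only appropriate for tunnel encapsulations, not general traffic. */
	if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_TX, &one, sizeof(one)) ||
	    setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_RX, &one, sizeof(one)))
		perror("setsockopt");
	return 0;
}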
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 88b4023ecfcf..546d2d439dda 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -56,7 +56,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
56 __wsum csum; 56 __wsum csum;
57 57
58 if (skb->encapsulation && 58 if (skb->encapsulation &&
59 skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) { 59 (skb_shinfo(skb)->gso_type &
60 (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
60 segs = skb_udp_tunnel_segment(skb, features); 61 segs = skb_udp_tunnel_segment(skb, features);
61 goto out; 62 goto out;
62 } 63 }
@@ -71,8 +72,10 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
71 72
72 if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | 73 if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
73 SKB_GSO_UDP_TUNNEL | 74 SKB_GSO_UDP_TUNNEL |
75 SKB_GSO_UDP_TUNNEL_CSUM |
74 SKB_GSO_IPIP | 76 SKB_GSO_IPIP |
75 SKB_GSO_GRE | SKB_GSO_MPLS) || 77 SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
78 SKB_GSO_MPLS) ||
76 !(type & (SKB_GSO_UDP)))) 79 !(type & (SKB_GSO_UDP))))
77 goto out; 80 goto out;
78 81
@@ -197,6 +200,7 @@ unflush:
197 } 200 }
198 201
199 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ 202 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
203 skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
200 pp = uo_priv->offload->callbacks.gro_receive(head, skb); 204 pp = uo_priv->offload->callbacks.gro_receive(head, skb);
201 205
202out_unlock: 206out_unlock:
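The one-line skb_gro_postpull_rcsum() addition is easy to miss but matters for CHECKSUM_COMPLETE skbs: whenever a GRO handler pulls header bytes, it must also subtract them from skb->csum or the inner layers see a stale sum. An illustrative pairing for any encapsulation gro_receive callback (the demo_ helper is hypothetical; hdr/hdrlen stand for the encapsulation header already located in the GRO area):

#include <linux/netdevice.h>

/* Hypothetical helper: consume an hdrlen-byte encapsulation header during
 * GRO while keeping a CHECKSUM_COMPLETE skb->csum consistent.
 */
static void demo_gro_consume_header(struct sk_buff *skb, const void *hdr,
				    unsigned int hdrlen)
{
	skb_gro_pull(skb, hdrlen);			/* consume the header */
	skb_gro_postpull_rcsum(skb, hdr, hdrlen);	/* keep skb->csum honest */
}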
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 2c46acd4cc36..3b3efbda48e1 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = {
70 .protocol = IPPROTO_UDPLITE, 70 .protocol = IPPROTO_UDPLITE,
71 .prot = &udplite_prot, 71 .prot = &udplite_prot,
72 .ops = &inet_dgram_ops, 72 .ops = &inet_dgram_ops,
73 .no_check = 0, /* must checksum (RFC 3828) */
74 .flags = INET_PROTOSW_PERMANENT, 73 .flags = INET_PROTOSW_PERMANENT,
75}; 74};
76 75
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 05f2b484954f..91771a7c802f 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -58,12 +58,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
58 58
59 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 59 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
60 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); 60 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
61 ip_select_ident(skb, dst->child, NULL);
62 61
63 top_iph->ttl = ip4_dst_hoplimit(dst->child); 62 top_iph->ttl = ip4_dst_hoplimit(dst->child);
64 63
65 top_iph->saddr = x->props.saddr.a4; 64 top_iph->saddr = x->props.saddr.a4;
66 top_iph->daddr = x->id.daddr.a4; 65 top_iph->daddr = x->id.daddr.a4;
66 ip_select_ident(skb, NULL);
67 67
68 return 0; 68 return 0;
69} 69}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 186a8ecf92fa..d5f6bd9a210a 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
25 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) 25 if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
26 goto out; 26 goto out;
27 27
28 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) 28 if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)
29 goto out; 29 goto out;
30 30
31 mtu = dst_mtu(skb_dst(skb)); 31 mtu = dst_mtu(skb_dst(skb));