path: root/net/ipv4
author      Linus Torvalds <torvalds@linux-foundation.org>   2014-08-06 12:38:14 -0400
committer   Linus Torvalds <torvalds@linux-foundation.org>   2014-08-06 12:38:14 -0400
commit      ae045e2455429c418a418a3376301a9e5753a0a8 (patch)
tree        b445bdeecd3f38aa0d0a29c9585cee49e4ccb0f1 /net/ipv4
parent      f4f142ed4ef835709c7e6d12eaca10d190bcebed (diff)
parent      d247b6ab3ce6dd43665780865ec5fa145d9ab6bd (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
 "Highlights:

  1) Steady transitioning of the BPF infrastructure to a generic spot so all kernel subsystems can make use of it, from Alexei Starovoitov.

  2) SFC driver supports busy polling, from Alexandre Rames.

  3) Take advantage of hash table in UDP multicast delivery, from David Held.

  4) Lighten locking, in particular by getting rid of the LRU lists, in inet frag handling. From Florian Westphal.

  5) Add support for various RFC6458 control messages in SCTP, from Geir Ola Vaagland.

  6) Allow to filter bridge forwarding database dumps by device, from Jamal Hadi Salim.

  7) virtio-net also now supports busy polling, from Jason Wang.

  8) Some low level optimization tweaks in pktgen from Jesper Dangaard Brouer.

  9) Add support for ipv6 address generation modes, so that userland can have some input into the process. From Jiri Pirko.

  10) Consolidate common TCP connection request code in ipv4 and ipv6, from Octavian Purdila.

  11) New ARP packet logger in netfilter, from Pablo Neira Ayuso.

  12) Generic resizable RCU hash table, with initial users in netlink and nftables. From Thomas Graf.

  13) Maintain a name assignment type so that userspace can see where a network device name came from (enumerated by kernel, assigned explicitly by userspace, etc.) From Tom Gundersen.

  14) Automatic flow label generation on transmit in ipv6, from Tom Herbert.

  15) New packet timestamping facilities from Willem de Bruijn, meant to assist in measuring latencies going into/out-of the packet scheduler, latency from TCP data transmission to ACK, etc"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1536 commits)
  cxgb4 : Disable recursive mailbox commands when enabling vi
  net: reduce USB network driver config options.
  tg3: Modify tg3_tso_bug() to handle multiple TX rings
  amd-xgbe: Perform phy connect/disconnect at dev open/stop
  amd-xgbe: Use dma_set_mask_and_coherent to set DMA mask
  net: sun4i-emac: fix memory leak on bad packet
  sctp: fix possible seqlock seadlock in sctp_packet_transmit()
  Revert "net: phy: Set the driver when registering an MDIO bus device"
  cxgb4vf: Turn off SGE RX/TX Callback Timers and interrupts in PCI shutdown routine
  team: Simplify return path of team_newlink
  bridge: Update outdated comment on promiscuous mode
  net-timestamp: ACK timestamp for bytestreams
  net-timestamp: TCP timestamping
  net-timestamp: SCHED timestamp on entering packet scheduler
  net-timestamp: add key to disambiguate concurrent datagrams
  net-timestamp: move timestamp flags out of sk_flags
  net-timestamp: extend SCM_TIMESTAMPING ancillary data struct
  cxgb4i : Move stray CPL definitions to cxgb4 driver
  tcp: reduce spurious retransmits due to transient SACK reneging
  qlcnic: Initialize dcbnl_ops before register_netdev
  ...
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 4
-rw-r--r--  net/ipv4/Makefile | 1
-rw-r--r--  net/ipv4/datagram.c | 1
-rw-r--r--  net/ipv4/devinet.c | 36
-rw-r--r--  net/ipv4/gre_offload.c | 2
-rw-r--r--  net/ipv4/icmp.c | 6
-rw-r--r--  net/ipv4/igmp.c | 2
-rw-r--r--  net/ipv4/inet_fragment.c | 318
-rw-r--r--  net/ipv4/ip_fragment.c | 87
-rw-r--r--  net/ipv4/ip_output.c | 13
-rw-r--r--  net/ipv4/ip_sockglue.c | 2
-rw-r--r--  net/ipv4/ip_tunnel.c | 31
-rw-r--r--  net/ipv4/ip_vti.c | 54
-rw-r--r--  net/ipv4/ipconfig.c | 5
-rw-r--r--  net/ipv4/ipmr.c | 2
-rw-r--r--  net/ipv4/netfilter/Kconfig | 29
-rw-r--r--  net/ipv4/netfilter/Makefile | 5
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 498
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_log_arp.c | 149
-rw-r--r--  net/ipv4/netfilter/nf_log_ipv4.c | 385
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c | 2
-rw-r--r--  net/ipv4/ping.c | 2
-rw-r--r--  net/ipv4/proc.c | 5
-rw-r--r--  net/ipv4/raw.c | 9
-rw-r--r--  net/ipv4/syncookies.c | 3
-rw-r--r--  net/ipv4/tcp.c | 22
-rw-r--r--  net/ipv4/tcp_input.c | 200
-rw-r--r--  net/ipv4/tcp_ipv4.c | 201
-rw-r--r--  net/ipv4/tcp_metrics.c | 1
-rw-r--r--  net/ipv4/tcp_minisocks.c | 2
-rw-r--r--  net/ipv4/tcp_offload.c | 18
-rw-r--r--  net/ipv4/tcp_output.c | 18
-rw-r--r--  net/ipv4/tcp_timer.c | 4
-rw-r--r--  net/ipv4/tcp_vegas.c | 3
-rw-r--r--  net/ipv4/tcp_veno.c | 2
-rw-r--r--  net/ipv4/udp.c | 156
-rw-r--r--  net/ipv4/udp_offload.c | 76
-rw-r--r--  net/ipv4/udp_tunnel.c | 100
-rw-r--r--  net/ipv4/xfrm4_protocol.c | 2
44 files changed, 1416 insertions(+), 1064 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 05c57f0fcabe..dbc10d84161f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -307,6 +307,10 @@ config NET_IPVTI
307 the notion of a secure tunnel for IPSEC and then use routing protocol 307 the notion of a secure tunnel for IPSEC and then use routing protocol
308 on top. 308 on top.
309 309
310config NET_UDP_TUNNEL
311 tristate
312 default n
313
310config INET_AH 314config INET_AH
311 tristate "IP: AH transformation" 315 tristate "IP: AH transformation"
312 select XFRM_ALGO 316 select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f032688d20d3..8ee1cd4053ee 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_NET_IPIP) += ipip.o
22gre-y := gre_demux.o 22gre-y := gre_demux.o
23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o 23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
24obj-$(CONFIG_NET_IPGRE) += ip_gre.o 24obj-$(CONFIG_NET_IPGRE) += ip_gre.o
25obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
25obj-$(CONFIG_NET_IPVTI) += ip_vti.o 26obj-$(CONFIG_NET_IPVTI) += ip_vti.o
26obj-$(CONFIG_SYN_COOKIES) += syncookies.o 27obj-$(CONFIG_SYN_COOKIES) += syncookies.o
27obj-$(CONFIG_INET_AH) += ah4.o 28obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index a3095fdefbed..90c0e8386116 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
76 inet->inet_daddr = fl4->daddr; 76 inet->inet_daddr = fl4->daddr;
77 inet->inet_dport = usin->sin_port; 77 inet->inet_dport = usin->sin_port;
78 sk->sk_state = TCP_ESTABLISHED; 78 sk->sk_state = TCP_ESTABLISHED;
79 inet_set_txhash(sk);
79 inet->inet_id = jiffies; 80 inet->inet_id = jiffies;
80 81
81 sk_dst_set(sk, &rt->dst); 82 sk_dst_set(sk, &rt->dst);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e9449376b58e..214882e7d6de 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -180,11 +180,12 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
180static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, 180static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
181 int destroy); 181 int destroy);
182#ifdef CONFIG_SYSCTL 182#ifdef CONFIG_SYSCTL
183static void devinet_sysctl_register(struct in_device *idev); 183static int devinet_sysctl_register(struct in_device *idev);
184static void devinet_sysctl_unregister(struct in_device *idev); 184static void devinet_sysctl_unregister(struct in_device *idev);
185#else 185#else
186static void devinet_sysctl_register(struct in_device *idev) 186static int devinet_sysctl_register(struct in_device *idev)
187{ 187{
188 return 0;
188} 189}
189static void devinet_sysctl_unregister(struct in_device *idev) 190static void devinet_sysctl_unregister(struct in_device *idev)
190{ 191{
@@ -232,6 +233,7 @@ EXPORT_SYMBOL(in_dev_finish_destroy);
232static struct in_device *inetdev_init(struct net_device *dev) 233static struct in_device *inetdev_init(struct net_device *dev)
233{ 234{
234 struct in_device *in_dev; 235 struct in_device *in_dev;
236 int err = -ENOMEM;
235 237
236 ASSERT_RTNL(); 238 ASSERT_RTNL();
237 239
@@ -252,7 +254,13 @@ static struct in_device *inetdev_init(struct net_device *dev)
252 /* Account for reference dev->ip_ptr (below) */ 254 /* Account for reference dev->ip_ptr (below) */
253 in_dev_hold(in_dev); 255 in_dev_hold(in_dev);
254 256
255 devinet_sysctl_register(in_dev); 257 err = devinet_sysctl_register(in_dev);
258 if (err) {
259 in_dev->dead = 1;
260 in_dev_put(in_dev);
261 in_dev = NULL;
262 goto out;
263 }
256 ip_mc_init_dev(in_dev); 264 ip_mc_init_dev(in_dev);
257 if (dev->flags & IFF_UP) 265 if (dev->flags & IFF_UP)
258 ip_mc_up(in_dev); 266 ip_mc_up(in_dev);
@@ -260,7 +268,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
260 /* we can receive as soon as ip_ptr is set -- do this last */ 268 /* we can receive as soon as ip_ptr is set -- do this last */
261 rcu_assign_pointer(dev->ip_ptr, in_dev); 269 rcu_assign_pointer(dev->ip_ptr, in_dev);
262out: 270out:
263 return in_dev; 271 return in_dev ?: ERR_PTR(err);
264out_kfree: 272out_kfree:
265 kfree(in_dev); 273 kfree(in_dev);
266 in_dev = NULL; 274 in_dev = NULL;
@@ -1347,8 +1355,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1347 if (!in_dev) { 1355 if (!in_dev) {
1348 if (event == NETDEV_REGISTER) { 1356 if (event == NETDEV_REGISTER) {
1349 in_dev = inetdev_init(dev); 1357 in_dev = inetdev_init(dev);
1350 if (!in_dev) 1358 if (IS_ERR(in_dev))
1351 return notifier_from_errno(-ENOMEM); 1359 return notifier_from_errno(PTR_ERR(in_dev));
1352 if (dev->flags & IFF_LOOPBACK) { 1360 if (dev->flags & IFF_LOOPBACK) {
1353 IN_DEV_CONF_SET(in_dev, NOXFRM, 1); 1361 IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
1354 IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); 1362 IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
@@ -2182,11 +2190,21 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
2182 kfree(t); 2190 kfree(t);
2183} 2191}
2184 2192
2185static void devinet_sysctl_register(struct in_device *idev) 2193static int devinet_sysctl_register(struct in_device *idev)
2186{ 2194{
2187 neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); 2195 int err;
2188 __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, 2196
2197 if (!sysctl_dev_name_is_allowed(idev->dev->name))
2198 return -EINVAL;
2199
2200 err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
2201 if (err)
2202 return err;
2203 err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
2189 &idev->cnf); 2204 &idev->cnf);
2205 if (err)
2206 neigh_sysctl_unregister(idev->arp_parms);
2207 return err;
2190} 2208}
2191 2209
2192static void devinet_sysctl_unregister(struct in_device *idev) 2210static void devinet_sysctl_unregister(struct in_device *idev)
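The devinet.c hunks above change inetdev_init() to report failures with the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention instead of returning NULL, so inetdev_event() can pass the real errno to notifier_from_errno() rather than assuming -ENOMEM. A minimal sketch of that idiom follows; the my_obj names are made up for illustration and are not part of this diff.

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct my_obj { int id; };

/* Return a valid pointer on success, or an ERR_PTR() encoding the errno. */
static struct my_obj *my_obj_create(int id)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return ERR_PTR(-ENOMEM);
	if (id < 0) {
		kfree(obj);
		return ERR_PTR(-EINVAL);
	}
	obj->id = id;
	return obj;
}

static int my_obj_user(int id)
{
	struct my_obj *obj = my_obj_create(id);

	if (IS_ERR(obj))
		return PTR_ERR(obj);	/* propagate the encoded errno */
	/* ... use obj ... */
	kfree(obj);
	return 0;
}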
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index f0bdd47bbbcb..6556263c8fa5 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -74,7 +74,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
74 /* segment inner packet. */ 74 /* segment inner packet. */
75 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); 75 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
76 segs = skb_mac_gso_segment(skb, enc_features); 76 segs = skb_mac_gso_segment(skb, enc_features);
77 if (!segs || IS_ERR(segs)) { 77 if (IS_ERR_OR_NULL(segs)) {
78 skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); 78 skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
79 goto out; 79 goto out;
80 } 80 }
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 42b7bcf8045b..ea7d4afe8205 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -663,16 +663,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
663 /* Checkin full IP header plus 8 bytes of protocol to 663 /* Checkin full IP header plus 8 bytes of protocol to
664 * avoid additional coding at protocol handlers. 664 * avoid additional coding at protocol handlers.
665 */ 665 */
666 if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) 666 if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
667 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
667 return; 668 return;
669 }
668 670
669 raw_icmp_error(skb, protocol, info); 671 raw_icmp_error(skb, protocol, info);
670 672
671 rcu_read_lock();
672 ipprot = rcu_dereference(inet_protos[protocol]); 673 ipprot = rcu_dereference(inet_protos[protocol]);
673 if (ipprot && ipprot->err_handler) 674 if (ipprot && ipprot->err_handler)
674 ipprot->err_handler(skb, info); 675 ipprot->err_handler(skb, info);
675 rcu_read_unlock();
676} 676}
677 677
678static bool icmp_tag_validation(int proto) 678static bool icmp_tag_validation(int proto)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index db710b059bab..f10eab462282 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1321,7 +1321,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1321 atomic_set(&im->refcnt, 1); 1321 atomic_set(&im->refcnt, 1);
1322 spin_lock_init(&im->lock); 1322 spin_lock_init(&im->lock);
1323#ifdef CONFIG_IP_MULTICAST 1323#ifdef CONFIG_IP_MULTICAST
1324 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); 1324 setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
1325 im->unsolicit_count = IGMP_Unsolicited_Report_Count; 1325 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1326#endif 1326#endif
1327 1327
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 3b01959bf4bb..9eb89f3f0ee4 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,6 +25,12 @@
25#include <net/inet_frag.h> 25#include <net/inet_frag.h>
26#include <net/inet_ecn.h> 26#include <net/inet_ecn.h>
27 27
28#define INETFRAGS_EVICT_BUCKETS 128
29#define INETFRAGS_EVICT_MAX 512
30
31/* don't rebuild inetfrag table with new secret more often than this */
32#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
33
28/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements 34/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
29 * Value : 0xff if frame should be dropped. 35 * Value : 0xff if frame should be dropped.
30 * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field 36 * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -46,24 +52,39 @@ const u8 ip_frag_ecn_table[16] = {
46}; 52};
47EXPORT_SYMBOL(ip_frag_ecn_table); 53EXPORT_SYMBOL(ip_frag_ecn_table);
48 54
49static void inet_frag_secret_rebuild(unsigned long dummy) 55static unsigned int
56inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
57{
58 return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
59}
60
61static bool inet_frag_may_rebuild(struct inet_frags *f)
62{
63 return time_after(jiffies,
64 f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
65}
66
67static void inet_frag_secret_rebuild(struct inet_frags *f)
50{ 68{
51 struct inet_frags *f = (struct inet_frags *)dummy;
52 unsigned long now = jiffies;
53 int i; 69 int i;
54 70
55 /* Per bucket lock NOT needed here, due to write lock protection */ 71 write_seqlock_bh(&f->rnd_seqlock);
56 write_lock(&f->lock); 72
73 if (!inet_frag_may_rebuild(f))
74 goto out;
57 75
58 get_random_bytes(&f->rnd, sizeof(u32)); 76 get_random_bytes(&f->rnd, sizeof(u32));
77
59 for (i = 0; i < INETFRAGS_HASHSZ; i++) { 78 for (i = 0; i < INETFRAGS_HASHSZ; i++) {
60 struct inet_frag_bucket *hb; 79 struct inet_frag_bucket *hb;
61 struct inet_frag_queue *q; 80 struct inet_frag_queue *q;
62 struct hlist_node *n; 81 struct hlist_node *n;
63 82
64 hb = &f->hash[i]; 83 hb = &f->hash[i];
84 spin_lock(&hb->chain_lock);
85
65 hlist_for_each_entry_safe(q, n, &hb->chain, list) { 86 hlist_for_each_entry_safe(q, n, &hb->chain, list) {
66 unsigned int hval = f->hashfn(q); 87 unsigned int hval = inet_frag_hashfn(f, q);
67 88
68 if (hval != i) { 89 if (hval != i) {
69 struct inet_frag_bucket *hb_dest; 90 struct inet_frag_bucket *hb_dest;
@@ -72,76 +93,200 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
72 93
73 /* Relink to new hash chain. */ 94 /* Relink to new hash chain. */
74 hb_dest = &f->hash[hval]; 95 hb_dest = &f->hash[hval];
96
97 /* This is the only place where we take
98 * another chain_lock while already holding
99 * one. As this will not run concurrently,
100 * we cannot deadlock on hb_dest lock below, if its
101 * already locked it will be released soon since
102 * other caller cannot be waiting for hb lock
103 * that we've taken above.
104 */
105 spin_lock_nested(&hb_dest->chain_lock,
106 SINGLE_DEPTH_NESTING);
75 hlist_add_head(&q->list, &hb_dest->chain); 107 hlist_add_head(&q->list, &hb_dest->chain);
108 spin_unlock(&hb_dest->chain_lock);
76 } 109 }
77 } 110 }
111 spin_unlock(&hb->chain_lock);
112 }
113
114 f->rebuild = false;
115 f->last_rebuild_jiffies = jiffies;
116out:
117 write_sequnlock_bh(&f->rnd_seqlock);
118}
119
120static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
121{
122 return q->net->low_thresh == 0 ||
123 frag_mem_limit(q->net) >= q->net->low_thresh;
124}
125
126static unsigned int
127inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
128{
129 struct inet_frag_queue *fq;
130 struct hlist_node *n;
131 unsigned int evicted = 0;
132 HLIST_HEAD(expired);
133
134evict_again:
135 spin_lock(&hb->chain_lock);
136
137 hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
138 if (!inet_fragq_should_evict(fq))
139 continue;
140
141 if (!del_timer(&fq->timer)) {
142 /* q expiring right now thus increment its refcount so
143 * it won't be freed under us and wait until the timer
144 * has finished executing then destroy it
145 */
146 atomic_inc(&fq->refcnt);
147 spin_unlock(&hb->chain_lock);
148 del_timer_sync(&fq->timer);
149 WARN_ON(atomic_read(&fq->refcnt) != 1);
150 inet_frag_put(fq, f);
151 goto evict_again;
152 }
153
154 fq->flags |= INET_FRAG_EVICTED;
155 hlist_del(&fq->list);
156 hlist_add_head(&fq->list, &expired);
157 ++evicted;
78 } 158 }
79 write_unlock(&f->lock);
80 159
81 mod_timer(&f->secret_timer, now + f->secret_interval); 160 spin_unlock(&hb->chain_lock);
161
162 hlist_for_each_entry_safe(fq, n, &expired, list)
163 f->frag_expire((unsigned long) fq);
164
165 return evicted;
82} 166}
83 167
84void inet_frags_init(struct inet_frags *f) 168static void inet_frag_worker(struct work_struct *work)
169{
170 unsigned int budget = INETFRAGS_EVICT_BUCKETS;
171 unsigned int i, evicted = 0;
172 struct inet_frags *f;
173
174 f = container_of(work, struct inet_frags, frags_work);
175
176 BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
177
178 local_bh_disable();
179
180 for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
181 evicted += inet_evict_bucket(f, &f->hash[i]);
182 i = (i + 1) & (INETFRAGS_HASHSZ - 1);
183 if (evicted > INETFRAGS_EVICT_MAX)
184 break;
185 }
186
187 f->next_bucket = i;
188
189 local_bh_enable();
190
191 if (f->rebuild && inet_frag_may_rebuild(f))
192 inet_frag_secret_rebuild(f);
193}
194
195static void inet_frag_schedule_worker(struct inet_frags *f)
196{
197 if (unlikely(!work_pending(&f->frags_work)))
198 schedule_work(&f->frags_work);
199}
200
201int inet_frags_init(struct inet_frags *f)
85{ 202{
86 int i; 203 int i;
87 204
205 INIT_WORK(&f->frags_work, inet_frag_worker);
206
88 for (i = 0; i < INETFRAGS_HASHSZ; i++) { 207 for (i = 0; i < INETFRAGS_HASHSZ; i++) {
89 struct inet_frag_bucket *hb = &f->hash[i]; 208 struct inet_frag_bucket *hb = &f->hash[i];
90 209
91 spin_lock_init(&hb->chain_lock); 210 spin_lock_init(&hb->chain_lock);
92 INIT_HLIST_HEAD(&hb->chain); 211 INIT_HLIST_HEAD(&hb->chain);
93 } 212 }
94 rwlock_init(&f->lock);
95 213
96 setup_timer(&f->secret_timer, inet_frag_secret_rebuild, 214 seqlock_init(&f->rnd_seqlock);
97 (unsigned long)f); 215 f->last_rebuild_jiffies = 0;
98 f->secret_timer.expires = jiffies + f->secret_interval; 216 f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
99 add_timer(&f->secret_timer); 217 NULL);
218 if (!f->frags_cachep)
219 return -ENOMEM;
220
221 return 0;
100} 222}
101EXPORT_SYMBOL(inet_frags_init); 223EXPORT_SYMBOL(inet_frags_init);
102 224
103void inet_frags_init_net(struct netns_frags *nf) 225void inet_frags_init_net(struct netns_frags *nf)
104{ 226{
105 nf->nqueues = 0;
106 init_frag_mem_limit(nf); 227 init_frag_mem_limit(nf);
107 INIT_LIST_HEAD(&nf->lru_list);
108 spin_lock_init(&nf->lru_lock);
109} 228}
110EXPORT_SYMBOL(inet_frags_init_net); 229EXPORT_SYMBOL(inet_frags_init_net);
111 230
112void inet_frags_fini(struct inet_frags *f) 231void inet_frags_fini(struct inet_frags *f)
113{ 232{
114 del_timer(&f->secret_timer); 233 cancel_work_sync(&f->frags_work);
234 kmem_cache_destroy(f->frags_cachep);
115} 235}
116EXPORT_SYMBOL(inet_frags_fini); 236EXPORT_SYMBOL(inet_frags_fini);
117 237
118void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) 238void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
119{ 239{
120 nf->low_thresh = 0; 240 unsigned int seq;
241 int i;
121 242
243 nf->low_thresh = 0;
122 local_bh_disable(); 244 local_bh_disable();
123 inet_frag_evictor(nf, f, true); 245
246evict_again:
247 seq = read_seqbegin(&f->rnd_seqlock);
248
249 for (i = 0; i < INETFRAGS_HASHSZ ; i++)
250 inet_evict_bucket(f, &f->hash[i]);
251
252 if (read_seqretry(&f->rnd_seqlock, seq))
253 goto evict_again;
254
124 local_bh_enable(); 255 local_bh_enable();
125 256
126 percpu_counter_destroy(&nf->mem); 257 percpu_counter_destroy(&nf->mem);
127} 258}
128EXPORT_SYMBOL(inet_frags_exit_net); 259EXPORT_SYMBOL(inet_frags_exit_net);
129 260
130static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) 261static struct inet_frag_bucket *
262get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
263__acquires(hb->chain_lock)
131{ 264{
132 struct inet_frag_bucket *hb; 265 struct inet_frag_bucket *hb;
133 unsigned int hash; 266 unsigned int seq, hash;
267
268 restart:
269 seq = read_seqbegin(&f->rnd_seqlock);
134 270
135 read_lock(&f->lock); 271 hash = inet_frag_hashfn(f, fq);
136 hash = f->hashfn(fq);
137 hb = &f->hash[hash]; 272 hb = &f->hash[hash];
138 273
139 spin_lock(&hb->chain_lock); 274 spin_lock(&hb->chain_lock);
275 if (read_seqretry(&f->rnd_seqlock, seq)) {
276 spin_unlock(&hb->chain_lock);
277 goto restart;
278 }
279
280 return hb;
281}
282
283static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
284{
285 struct inet_frag_bucket *hb;
286
287 hb = get_frag_bucket_locked(fq, f);
140 hlist_del(&fq->list); 288 hlist_del(&fq->list);
141 spin_unlock(&hb->chain_lock); 289 spin_unlock(&hb->chain_lock);
142
143 read_unlock(&f->lock);
144 inet_frag_lru_del(fq);
145} 290}
146 291
147void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) 292void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -149,30 +294,29 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
149 if (del_timer(&fq->timer)) 294 if (del_timer(&fq->timer))
150 atomic_dec(&fq->refcnt); 295 atomic_dec(&fq->refcnt);
151 296
152 if (!(fq->last_in & INET_FRAG_COMPLETE)) { 297 if (!(fq->flags & INET_FRAG_COMPLETE)) {
153 fq_unlink(fq, f); 298 fq_unlink(fq, f);
154 atomic_dec(&fq->refcnt); 299 atomic_dec(&fq->refcnt);
155 fq->last_in |= INET_FRAG_COMPLETE; 300 fq->flags |= INET_FRAG_COMPLETE;
156 } 301 }
157} 302}
158EXPORT_SYMBOL(inet_frag_kill); 303EXPORT_SYMBOL(inet_frag_kill);
159 304
160static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, 305static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
161 struct sk_buff *skb) 306 struct sk_buff *skb)
162{ 307{
163 if (f->skb_free) 308 if (f->skb_free)
164 f->skb_free(skb); 309 f->skb_free(skb);
165 kfree_skb(skb); 310 kfree_skb(skb);
166} 311}
167 312
168void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, 313void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
169 int *work)
170{ 314{
171 struct sk_buff *fp; 315 struct sk_buff *fp;
172 struct netns_frags *nf; 316 struct netns_frags *nf;
173 unsigned int sum, sum_truesize = 0; 317 unsigned int sum, sum_truesize = 0;
174 318
175 WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); 319 WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
176 WARN_ON(del_timer(&q->timer) != 0); 320 WARN_ON(del_timer(&q->timer) != 0);
177 321
178 /* Release all fragment data. */ 322 /* Release all fragment data. */
@@ -186,87 +330,32 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
186 fp = xp; 330 fp = xp;
187 } 331 }
188 sum = sum_truesize + f->qsize; 332 sum = sum_truesize + f->qsize;
189 if (work)
190 *work -= sum;
191 sub_frag_mem_limit(q, sum); 333 sub_frag_mem_limit(q, sum);
192 334
193 if (f->destructor) 335 if (f->destructor)
194 f->destructor(q); 336 f->destructor(q);
195 kfree(q); 337 kmem_cache_free(f->frags_cachep, q);
196
197} 338}
198EXPORT_SYMBOL(inet_frag_destroy); 339EXPORT_SYMBOL(inet_frag_destroy);
199 340
200int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
201{
202 struct inet_frag_queue *q;
203 int work, evicted = 0;
204
205 if (!force) {
206 if (frag_mem_limit(nf) <= nf->high_thresh)
207 return 0;
208 }
209
210 work = frag_mem_limit(nf) - nf->low_thresh;
211 while (work > 0 || force) {
212 spin_lock(&nf->lru_lock);
213
214 if (list_empty(&nf->lru_list)) {
215 spin_unlock(&nf->lru_lock);
216 break;
217 }
218
219 q = list_first_entry(&nf->lru_list,
220 struct inet_frag_queue, lru_list);
221 atomic_inc(&q->refcnt);
222 /* Remove q from list to avoid several CPUs grabbing it */
223 list_del_init(&q->lru_list);
224
225 spin_unlock(&nf->lru_lock);
226
227 spin_lock(&q->lock);
228 if (!(q->last_in & INET_FRAG_COMPLETE))
229 inet_frag_kill(q, f);
230 spin_unlock(&q->lock);
231
232 if (atomic_dec_and_test(&q->refcnt))
233 inet_frag_destroy(q, f, &work);
234 evicted++;
235 }
236
237 return evicted;
238}
239EXPORT_SYMBOL(inet_frag_evictor);
240
241static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, 341static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
242 struct inet_frag_queue *qp_in, struct inet_frags *f, 342 struct inet_frag_queue *qp_in,
243 void *arg) 343 struct inet_frags *f,
344 void *arg)
244{ 345{
245 struct inet_frag_bucket *hb; 346 struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
246 struct inet_frag_queue *qp; 347 struct inet_frag_queue *qp;
247 unsigned int hash;
248
249 read_lock(&f->lock); /* Protects against hash rebuild */
250 /*
251 * While we stayed w/o the lock other CPU could update
252 * the rnd seed, so we need to re-calculate the hash
253 * chain. Fortunatelly the qp_in can be used to get one.
254 */
255 hash = f->hashfn(qp_in);
256 hb = &f->hash[hash];
257 spin_lock(&hb->chain_lock);
258 348
259#ifdef CONFIG_SMP 349#ifdef CONFIG_SMP
260 /* With SMP race we have to recheck hash table, because 350 /* With SMP race we have to recheck hash table, because
261 * such entry could be created on other cpu, while we 351 * such entry could have been created on other cpu before
262 * released the hash bucket lock. 352 * we acquired hash bucket lock.
263 */ 353 */
264 hlist_for_each_entry(qp, &hb->chain, list) { 354 hlist_for_each_entry(qp, &hb->chain, list) {
265 if (qp->net == nf && f->match(qp, arg)) { 355 if (qp->net == nf && f->match(qp, arg)) {
266 atomic_inc(&qp->refcnt); 356 atomic_inc(&qp->refcnt);
267 spin_unlock(&hb->chain_lock); 357 spin_unlock(&hb->chain_lock);
268 read_unlock(&f->lock); 358 qp_in->flags |= INET_FRAG_COMPLETE;
269 qp_in->last_in |= INET_FRAG_COMPLETE;
270 inet_frag_put(qp_in, f); 359 inet_frag_put(qp_in, f);
271 return qp; 360 return qp;
272 } 361 }
@@ -278,19 +367,24 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
278 367
279 atomic_inc(&qp->refcnt); 368 atomic_inc(&qp->refcnt);
280 hlist_add_head(&qp->list, &hb->chain); 369 hlist_add_head(&qp->list, &hb->chain);
281 inet_frag_lru_add(nf, qp); 370
282 spin_unlock(&hb->chain_lock); 371 spin_unlock(&hb->chain_lock);
283 read_unlock(&f->lock);
284 372
285 return qp; 373 return qp;
286} 374}
287 375
288static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, 376static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
289 struct inet_frags *f, void *arg) 377 struct inet_frags *f,
378 void *arg)
290{ 379{
291 struct inet_frag_queue *q; 380 struct inet_frag_queue *q;
292 381
293 q = kzalloc(f->qsize, GFP_ATOMIC); 382 if (frag_mem_limit(nf) > nf->high_thresh) {
383 inet_frag_schedule_worker(f);
384 return NULL;
385 }
386
387 q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
294 if (q == NULL) 388 if (q == NULL)
295 return NULL; 389 return NULL;
296 390
@@ -301,13 +395,13 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
301 setup_timer(&q->timer, f->frag_expire, (unsigned long)q); 395 setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
302 spin_lock_init(&q->lock); 396 spin_lock_init(&q->lock);
303 atomic_set(&q->refcnt, 1); 397 atomic_set(&q->refcnt, 1);
304 INIT_LIST_HEAD(&q->lru_list);
305 398
306 return q; 399 return q;
307} 400}
308 401
309static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, 402static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
310 struct inet_frags *f, void *arg) 403 struct inet_frags *f,
404 void *arg)
311{ 405{
312 struct inet_frag_queue *q; 406 struct inet_frag_queue *q;
313 407
@@ -319,13 +413,17 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
319} 413}
320 414
321struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, 415struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
322 struct inet_frags *f, void *key, unsigned int hash) 416 struct inet_frags *f, void *key,
323 __releases(&f->lock) 417 unsigned int hash)
324{ 418{
325 struct inet_frag_bucket *hb; 419 struct inet_frag_bucket *hb;
326 struct inet_frag_queue *q; 420 struct inet_frag_queue *q;
327 int depth = 0; 421 int depth = 0;
328 422
423 if (frag_mem_limit(nf) > nf->low_thresh)
424 inet_frag_schedule_worker(f);
425
426 hash &= (INETFRAGS_HASHSZ - 1);
329 hb = &f->hash[hash]; 427 hb = &f->hash[hash];
330 428
331 spin_lock(&hb->chain_lock); 429 spin_lock(&hb->chain_lock);
@@ -333,18 +431,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
333 if (q->net == nf && f->match(q, key)) { 431 if (q->net == nf && f->match(q, key)) {
334 atomic_inc(&q->refcnt); 432 atomic_inc(&q->refcnt);
335 spin_unlock(&hb->chain_lock); 433 spin_unlock(&hb->chain_lock);
336 read_unlock(&f->lock);
337 return q; 434 return q;
338 } 435 }
339 depth++; 436 depth++;
340 } 437 }
341 spin_unlock(&hb->chain_lock); 438 spin_unlock(&hb->chain_lock);
342 read_unlock(&f->lock);
343 439
344 if (depth <= INETFRAGS_MAXDEPTH) 440 if (depth <= INETFRAGS_MAXDEPTH)
345 return inet_frag_create(nf, f, key); 441 return inet_frag_create(nf, f, key);
346 else 442
347 return ERR_PTR(-ENOBUFS); 443 if (inet_frag_may_rebuild(f)) {
444 if (!f->rebuild)
445 f->rebuild = true;
446 inet_frag_schedule_worker(f);
447 }
448
449 return ERR_PTR(-ENOBUFS);
348} 450}
349EXPORT_SYMBOL(inet_frag_find); 451EXPORT_SYMBOL(inet_frag_find);
350 452
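In the inet_fragment.c rewrite above, the global rwlock and secret timer are replaced by a seqlock (rnd_seqlock) plus per-bucket spinlocks: the worker rehashes under write_seqlock_bh(), while lookups sample the sequence, lock the computed bucket, and retry if a secret rebuild raced in between (see get_frag_bucket_locked()). A simplified, self-contained sketch of that retry pattern, using made-up table/bucket types rather than the real inet_frags structures:

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_bucket {
	spinlock_t lock;
	/* ... chain of entries ... */
};

struct demo_table {
	seqlock_t rnd_seqlock;	/* written when the hash secret changes */
	u32 rnd;
	struct demo_bucket hash[64];
};

static struct demo_bucket *demo_bucket_locked(struct demo_table *t, u32 key)
{
	struct demo_bucket *b;
	unsigned int seq;

restart:
	seq = read_seqbegin(&t->rnd_seqlock);
	b = &t->hash[(key ^ t->rnd) & 63];
	spin_lock(&b->lock);
	/* If a secret rebuild raced with us, the entry may now live in a
	 * different bucket: drop the lock, recompute the hash, try again.
	 */
	if (read_seqretry(&t->rnd_seqlock, seq)) {
		spin_unlock(&b->lock);
		goto restart;
	}
	return b;	/* returned with b->lock held */
}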
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ed32313e307c..15f0e2bad7ad 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -55,6 +55,7 @@
55 */ 55 */
56 56
57static int sysctl_ipfrag_max_dist __read_mostly = 64; 57static int sysctl_ipfrag_max_dist __read_mostly = 64;
58static const char ip_frag_cache_name[] = "ip4-frags";
58 59
59struct ipfrag_skb_cb 60struct ipfrag_skb_cb
60{ 61{
@@ -86,11 +87,6 @@ static inline u8 ip4_frag_ecn(u8 tos)
86 87
87static struct inet_frags ip4_frags; 88static struct inet_frags ip4_frags;
88 89
89int ip_frag_nqueues(struct net *net)
90{
91 return net->ipv4.frags.nqueues;
92}
93
94int ip_frag_mem(struct net *net) 90int ip_frag_mem(struct net *net)
95{ 91{
96 return sum_frag_mem_limit(&net->ipv4.frags); 92 return sum_frag_mem_limit(&net->ipv4.frags);
@@ -109,21 +105,21 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
109 net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); 105 net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
110 return jhash_3words((__force u32)id << 16 | prot, 106 return jhash_3words((__force u32)id << 16 | prot,
111 (__force u32)saddr, (__force u32)daddr, 107 (__force u32)saddr, (__force u32)daddr,
112 ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); 108 ip4_frags.rnd);
113} 109}
114 110
115static unsigned int ip4_hashfn(struct inet_frag_queue *q) 111static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
116{ 112{
117 struct ipq *ipq; 113 const struct ipq *ipq;
118 114
119 ipq = container_of(q, struct ipq, q); 115 ipq = container_of(q, struct ipq, q);
120 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); 116 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
121} 117}
122 118
123static bool ip4_frag_match(struct inet_frag_queue *q, void *a) 119static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
124{ 120{
125 struct ipq *qp; 121 const struct ipq *qp;
126 struct ip4_create_arg *arg = a; 122 const struct ip4_create_arg *arg = a;
127 123
128 qp = container_of(q, struct ipq, q); 124 qp = container_of(q, struct ipq, q);
129 return qp->id == arg->iph->id && 125 return qp->id == arg->iph->id &&
@@ -133,14 +129,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
133 qp->user == arg->user; 129 qp->user == arg->user;
134} 130}
135 131
136static void ip4_frag_init(struct inet_frag_queue *q, void *a) 132static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
137{ 133{
138 struct ipq *qp = container_of(q, struct ipq, q); 134 struct ipq *qp = container_of(q, struct ipq, q);
139 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, 135 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
140 frags); 136 frags);
141 struct net *net = container_of(ipv4, struct net, ipv4); 137 struct net *net = container_of(ipv4, struct net, ipv4);
142 138
143 struct ip4_create_arg *arg = a; 139 const struct ip4_create_arg *arg = a;
144 140
145 qp->protocol = arg->iph->protocol; 141 qp->protocol = arg->iph->protocol;
146 qp->id = arg->iph->id; 142 qp->id = arg->iph->id;
@@ -177,18 +173,6 @@ static void ipq_kill(struct ipq *ipq)
177 inet_frag_kill(&ipq->q, &ip4_frags); 173 inet_frag_kill(&ipq->q, &ip4_frags);
178} 174}
179 175
180/* Memory limiting on fragments. Evictor trashes the oldest
181 * fragment queue until we are back under the threshold.
182 */
183static void ip_evictor(struct net *net)
184{
185 int evicted;
186
187 evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
188 if (evicted)
189 IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
190}
191
192/* 176/*
193 * Oops, a fragment queue timed out. Kill it and send an ICMP reply. 177 * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
194 */ 178 */
@@ -202,19 +186,22 @@ static void ip_expire(unsigned long arg)
202 186
203 spin_lock(&qp->q.lock); 187 spin_lock(&qp->q.lock);
204 188
205 if (qp->q.last_in & INET_FRAG_COMPLETE) 189 if (qp->q.flags & INET_FRAG_COMPLETE)
206 goto out; 190 goto out;
207 191
208 ipq_kill(qp); 192 ipq_kill(qp);
209
210 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
211 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); 193 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
212 194
213 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { 195 if (!(qp->q.flags & INET_FRAG_EVICTED)) {
214 struct sk_buff *head = qp->q.fragments; 196 struct sk_buff *head = qp->q.fragments;
215 const struct iphdr *iph; 197 const struct iphdr *iph;
216 int err; 198 int err;
217 199
200 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
201
202 if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
203 goto out;
204
218 rcu_read_lock(); 205 rcu_read_lock();
219 head->dev = dev_get_by_index_rcu(net, qp->iif); 206 head->dev = dev_get_by_index_rcu(net, qp->iif);
220 if (!head->dev) 207 if (!head->dev)
@@ -227,8 +214,7 @@ static void ip_expire(unsigned long arg)
227 if (err) 214 if (err)
228 goto out_rcu_unlock; 215 goto out_rcu_unlock;
229 216
230 /* 217 /* Only an end host needs to send an ICMP
231 * Only an end host needs to send an ICMP
232 * "Fragment Reassembly Timeout" message, per RFC792. 218 * "Fragment Reassembly Timeout" message, per RFC792.
233 */ 219 */
234 if (qp->user == IP_DEFRAG_AF_PACKET || 220 if (qp->user == IP_DEFRAG_AF_PACKET ||
@@ -237,7 +223,6 @@ static void ip_expire(unsigned long arg)
237 (skb_rtable(head)->rt_type != RTN_LOCAL))) 223 (skb_rtable(head)->rt_type != RTN_LOCAL)))
238 goto out_rcu_unlock; 224 goto out_rcu_unlock;
239 225
240
241 /* Send an ICMP "Fragment Reassembly Timeout" message. */ 226 /* Send an ICMP "Fragment Reassembly Timeout" message. */
242 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 227 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
243out_rcu_unlock: 228out_rcu_unlock:
@@ -260,7 +245,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
260 arg.iph = iph; 245 arg.iph = iph;
261 arg.user = user; 246 arg.user = user;
262 247
263 read_lock(&ip4_frags.lock);
264 hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); 248 hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
265 249
266 q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); 250 q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
@@ -319,7 +303,7 @@ static int ip_frag_reinit(struct ipq *qp)
319 } while (fp); 303 } while (fp);
320 sub_frag_mem_limit(&qp->q, sum_truesize); 304 sub_frag_mem_limit(&qp->q, sum_truesize);
321 305
322 qp->q.last_in = 0; 306 qp->q.flags = 0;
323 qp->q.len = 0; 307 qp->q.len = 0;
324 qp->q.meat = 0; 308 qp->q.meat = 0;
325 qp->q.fragments = NULL; 309 qp->q.fragments = NULL;
@@ -340,7 +324,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
340 int err = -ENOENT; 324 int err = -ENOENT;
341 u8 ecn; 325 u8 ecn;
342 326
343 if (qp->q.last_in & INET_FRAG_COMPLETE) 327 if (qp->q.flags & INET_FRAG_COMPLETE)
344 goto err; 328 goto err;
345 329
346 if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && 330 if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
@@ -367,9 +351,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
367 * or have different end, the segment is corrupted. 351 * or have different end, the segment is corrupted.
368 */ 352 */
369 if (end < qp->q.len || 353 if (end < qp->q.len ||
370 ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) 354 ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
371 goto err; 355 goto err;
372 qp->q.last_in |= INET_FRAG_LAST_IN; 356 qp->q.flags |= INET_FRAG_LAST_IN;
373 qp->q.len = end; 357 qp->q.len = end;
374 } else { 358 } else {
375 if (end&7) { 359 if (end&7) {
@@ -379,7 +363,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
379 } 363 }
380 if (end > qp->q.len) { 364 if (end > qp->q.len) {
381 /* Some bits beyond end -> corruption. */ 365 /* Some bits beyond end -> corruption. */
382 if (qp->q.last_in & INET_FRAG_LAST_IN) 366 if (qp->q.flags & INET_FRAG_LAST_IN)
383 goto err; 367 goto err;
384 qp->q.len = end; 368 qp->q.len = end;
385 } 369 }
@@ -488,13 +472,13 @@ found:
488 qp->ecn |= ecn; 472 qp->ecn |= ecn;
489 add_frag_mem_limit(&qp->q, skb->truesize); 473 add_frag_mem_limit(&qp->q, skb->truesize);
490 if (offset == 0) 474 if (offset == 0)
491 qp->q.last_in |= INET_FRAG_FIRST_IN; 475 qp->q.flags |= INET_FRAG_FIRST_IN;
492 476
493 if (ip_hdr(skb)->frag_off & htons(IP_DF) && 477 if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
494 skb->len + ihl > qp->q.max_size) 478 skb->len + ihl > qp->q.max_size)
495 qp->q.max_size = skb->len + ihl; 479 qp->q.max_size = skb->len + ihl;
496 480
497 if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && 481 if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
498 qp->q.meat == qp->q.len) { 482 qp->q.meat == qp->q.len) {
499 unsigned long orefdst = skb->_skb_refdst; 483 unsigned long orefdst = skb->_skb_refdst;
500 484
@@ -505,7 +489,6 @@ found:
505 } 489 }
506 490
507 skb_dst_drop(skb); 491 skb_dst_drop(skb);
508 inet_frag_lru_move(&qp->q);
509 return -EINPROGRESS; 492 return -EINPROGRESS;
510 493
511err: 494err:
@@ -655,9 +638,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
655 net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); 638 net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
656 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); 639 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
657 640
658 /* Start by cleaning up the memory. */
659 ip_evictor(net);
660
661 /* Lookup (or create) queue header */ 641 /* Lookup (or create) queue header */
662 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { 642 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
663 int ret; 643 int ret;
@@ -721,14 +701,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
721 .data = &init_net.ipv4.frags.high_thresh, 701 .data = &init_net.ipv4.frags.high_thresh,
722 .maxlen = sizeof(int), 702 .maxlen = sizeof(int),
723 .mode = 0644, 703 .mode = 0644,
724 .proc_handler = proc_dointvec 704 .proc_handler = proc_dointvec_minmax,
705 .extra1 = &init_net.ipv4.frags.low_thresh
725 }, 706 },
726 { 707 {
727 .procname = "ipfrag_low_thresh", 708 .procname = "ipfrag_low_thresh",
728 .data = &init_net.ipv4.frags.low_thresh, 709 .data = &init_net.ipv4.frags.low_thresh,
729 .maxlen = sizeof(int), 710 .maxlen = sizeof(int),
730 .mode = 0644, 711 .mode = 0644,
731 .proc_handler = proc_dointvec 712 .proc_handler = proc_dointvec_minmax,
713 .extra1 = &zero,
714 .extra2 = &init_net.ipv4.frags.high_thresh
732 }, 715 },
733 { 716 {
734 .procname = "ipfrag_time", 717 .procname = "ipfrag_time",
@@ -740,10 +723,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
740 { } 723 { }
741}; 724};
742 725
726/* secret interval has been deprecated */
727static int ip4_frags_secret_interval_unused;
743static struct ctl_table ip4_frags_ctl_table[] = { 728static struct ctl_table ip4_frags_ctl_table[] = {
744 { 729 {
745 .procname = "ipfrag_secret_interval", 730 .procname = "ipfrag_secret_interval",
746 .data = &ip4_frags.secret_interval, 731 .data = &ip4_frags_secret_interval_unused,
747 .maxlen = sizeof(int), 732 .maxlen = sizeof(int),
748 .mode = 0644, 733 .mode = 0644,
749 .proc_handler = proc_dointvec_jiffies, 734 .proc_handler = proc_dointvec_jiffies,
@@ -771,7 +756,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
771 goto err_alloc; 756 goto err_alloc;
772 757
773 table[0].data = &net->ipv4.frags.high_thresh; 758 table[0].data = &net->ipv4.frags.high_thresh;
759 table[0].extra1 = &net->ipv4.frags.low_thresh;
760 table[0].extra2 = &init_net.ipv4.frags.high_thresh;
774 table[1].data = &net->ipv4.frags.low_thresh; 761 table[1].data = &net->ipv4.frags.low_thresh;
762 table[1].extra2 = &net->ipv4.frags.high_thresh;
775 table[2].data = &net->ipv4.frags.timeout; 763 table[2].data = &net->ipv4.frags.timeout;
776 764
777 /* Don't export sysctls to unprivileged users */ 765 /* Don't export sysctls to unprivileged users */
@@ -873,6 +861,7 @@ void __init ipfrag_init(void)
873 ip4_frags.qsize = sizeof(struct ipq); 861 ip4_frags.qsize = sizeof(struct ipq);
874 ip4_frags.match = ip4_frag_match; 862 ip4_frags.match = ip4_frag_match;
875 ip4_frags.frag_expire = ip_expire; 863 ip4_frags.frag_expire = ip_expire;
876 ip4_frags.secret_interval = 10 * 60 * HZ; 864 ip4_frags.frags_cache_name = ip_frag_cache_name;
877 inet_frags_init(&ip4_frags); 865 if (inet_frags_init(&ip4_frags))
866 panic("IP: failed to allocate ip4_frags cache\n");
878} 867}
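The ipfrag sysctl changes above switch the high/low thresholds to proc_dointvec_minmax() and use extra1/extra2 so the two values bound each other, while ipfrag_secret_interval is retained only as a deprecated knob wired to an unused variable. A small illustrative table showing the same bounding technique; the demo_* names are hypothetical and such a table would still need to be registered (e.g. via register_net_sysctl()):

#include <linux/sysctl.h>

static int demo_zero;
static int demo_low_thresh = 3 * 1024 * 1024;
static int demo_high_thresh = 4 * 1024 * 1024;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_high_thresh",
		.data		= &demo_high_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &demo_low_thresh,	/* floor: never below low */
	},
	{
		.procname	= "demo_low_thresh",
		.data		= &demo_low_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &demo_zero,		/* floor: 0 */
		.extra2		= &demo_high_thresh,	/* ceiling: never above high */
	},
	{ }
};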
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8d3b6b0e9857..215af2b155cb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk,
855 unsigned int maxfraglen, fragheaderlen, maxnonfragsize; 855 unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
856 int csummode = CHECKSUM_NONE; 856 int csummode = CHECKSUM_NONE;
857 struct rtable *rt = (struct rtable *)cork->dst; 857 struct rtable *rt = (struct rtable *)cork->dst;
858 u32 tskey = 0;
858 859
859 skb = skb_peek_tail(queue); 860 skb = skb_peek_tail(queue);
860 861
861 exthdrlen = !skb ? rt->dst.header_len : 0; 862 exthdrlen = !skb ? rt->dst.header_len : 0;
862 mtu = cork->fragsize; 863 mtu = cork->fragsize;
864 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
865 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
866 tskey = sk->sk_tskey++;
863 867
864 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 868 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
865 869
@@ -962,10 +966,6 @@ alloc_new_skb:
962 sk->sk_allocation); 966 sk->sk_allocation);
963 if (unlikely(skb == NULL)) 967 if (unlikely(skb == NULL))
964 err = -ENOBUFS; 968 err = -ENOBUFS;
965 else
966 /* only the initial fragment is
967 time stamped */
968 cork->tx_flags = 0;
969 } 969 }
970 if (skb == NULL) 970 if (skb == NULL)
971 goto error; 971 goto error;
@@ -976,7 +976,12 @@ alloc_new_skb:
976 skb->ip_summed = csummode; 976 skb->ip_summed = csummode;
977 skb->csum = 0; 977 skb->csum = 0;
978 skb_reserve(skb, hh_len); 978 skb_reserve(skb, hh_len);
979
980 /* only the initial fragment is time stamped */
979 skb_shinfo(skb)->tx_flags = cork->tx_flags; 981 skb_shinfo(skb)->tx_flags = cork->tx_flags;
982 cork->tx_flags = 0;
983 skb_shinfo(skb)->tskey = tskey;
984 tskey = 0;
980 985
981 /* 986 /*
982 * Find where to start putting bytes. 987 * Find where to start putting bytes.
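The __ip_append_data() change above implements part of highlight 15: when SOF_TIMESTAMPING_OPT_ID is enabled, each timestamped send is tagged with a key taken from sk->sk_tskey, so userspace can match the timestamps it later reads from the error queue to individual send calls. A rough userspace sketch of opting in to these transmit timestamps (flag names are from this series; error-queue parsing is omitted):

#include <linux/net_tstamp.h>
#include <sys/socket.h>

/* Sketch only: enable software TX timestamps at scheduling time plus a
 * per-datagram ID on an already-created socket fd.
 */
static int enable_tx_timestamps(int fd)
{
	unsigned int val = SOF_TIMESTAMPING_TX_SCHED |	/* stamp on entering qdisc */
			   SOF_TIMESTAMPING_TX_SOFTWARE |
			   SOF_TIMESTAMPING_SOFTWARE |	/* report software stamps */
			   SOF_TIMESTAMPING_OPT_ID;	/* tag each send with a key */

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}

The timestamps are then delivered as SCM_TIMESTAMPING control messages on recvmsg() with MSG_ERRQUEUE, with the matching key carried in the accompanying sock_extended_err.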
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64741b938632..5cb830c78990 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1319,7 +1319,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1319 if (sk->sk_type != SOCK_STREAM) 1319 if (sk->sk_type != SOCK_STREAM)
1320 return -ENOPROTOOPT; 1320 return -ENOPROTOOPT;
1321 1321
1322 msg.msg_control = optval; 1322 msg.msg_control = (__force void *) optval;
1323 msg.msg_controllen = len; 1323 msg.msg_controllen = len;
1324 msg.msg_flags = flags; 1324 msg.msg_flags = flags;
1325 1325
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 6f9de61dce5f..afed1aac2638 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -69,23 +69,25 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
69} 69}
70 70
71static void __tunnel_dst_set(struct ip_tunnel_dst *idst, 71static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
72 struct dst_entry *dst) 72 struct dst_entry *dst, __be32 saddr)
73{ 73{
74 struct dst_entry *old_dst; 74 struct dst_entry *old_dst;
75 75
76 dst_clone(dst); 76 dst_clone(dst);
77 old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); 77 old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
78 dst_release(old_dst); 78 dst_release(old_dst);
79 idst->saddr = saddr;
79} 80}
80 81
81static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) 82static void tunnel_dst_set(struct ip_tunnel *t,
83 struct dst_entry *dst, __be32 saddr)
82{ 84{
83 __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); 85 __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst, saddr);
84} 86}
85 87
86static void tunnel_dst_reset(struct ip_tunnel *t) 88static void tunnel_dst_reset(struct ip_tunnel *t)
87{ 89{
88 tunnel_dst_set(t, NULL); 90 tunnel_dst_set(t, NULL, 0);
89} 91}
90 92
91void ip_tunnel_dst_reset_all(struct ip_tunnel *t) 93void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
@@ -93,20 +95,25 @@ void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
93 int i; 95 int i;
94 96
95 for_each_possible_cpu(i) 97 for_each_possible_cpu(i)
96 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); 98 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
97} 99}
98EXPORT_SYMBOL(ip_tunnel_dst_reset_all); 100EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
99 101
100static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) 102static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
103 u32 cookie, __be32 *saddr)
101{ 104{
105 struct ip_tunnel_dst *idst;
102 struct dst_entry *dst; 106 struct dst_entry *dst;
103 107
104 rcu_read_lock(); 108 rcu_read_lock();
105 dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); 109 idst = this_cpu_ptr(t->dst_cache);
110 dst = rcu_dereference(idst->dst);
106 if (dst && !atomic_inc_not_zero(&dst->__refcnt)) 111 if (dst && !atomic_inc_not_zero(&dst->__refcnt))
107 dst = NULL; 112 dst = NULL;
108 if (dst) { 113 if (dst) {
109 if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 114 if (!dst->obsolete || dst->ops->check(dst, cookie)) {
115 *saddr = idst->saddr;
116 } else {
110 tunnel_dst_reset(t); 117 tunnel_dst_reset(t);
111 dst_release(dst); 118 dst_release(dst);
112 dst = NULL; 119 dst = NULL;
@@ -305,7 +312,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
305 } 312 }
306 313
307 ASSERT_RTNL(); 314 ASSERT_RTNL();
308 dev = alloc_netdev(ops->priv_size, name, ops->setup); 315 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
309 if (!dev) { 316 if (!dev) {
310 err = -ENOMEM; 317 err = -ENOMEM;
311 goto failed; 318 goto failed;
@@ -367,7 +374,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
367 374
368 if (!IS_ERR(rt)) { 375 if (!IS_ERR(rt)) {
369 tdev = rt->dst.dev; 376 tdev = rt->dst.dev;
370 tunnel_dst_set(tunnel, &rt->dst); 377 tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
371 ip_rt_put(rt); 378 ip_rt_put(rt);
372 } 379 }
373 if (dev->type != ARPHRD_ETHER) 380 if (dev->type != ARPHRD_ETHER)
@@ -610,7 +617,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
610 init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, 617 init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
611 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); 618 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
612 619
613 rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL; 620 rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
614 621
615 if (!rt) { 622 if (!rt) {
616 rt = ip_route_output_key(tunnel->net, &fl4); 623 rt = ip_route_output_key(tunnel->net, &fl4);
@@ -620,7 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
620 goto tx_error; 627 goto tx_error;
621 } 628 }
622 if (connected) 629 if (connected)
623 tunnel_dst_set(tunnel, &rt->dst); 630 tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
624 } 631 }
625 632
626 if (rt->dst.dev == dev) { 633 if (rt->dst.dev == dev) {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index b8960f3527f3..e453cb724a95 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -534,40 +534,28 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
534 534
535static int __init vti_init(void) 535static int __init vti_init(void)
536{ 536{
537 const char *msg;
537 int err; 538 int err;
538 539
539 pr_info("IPv4 over IPSec tunneling driver\n"); 540 pr_info("IPv4 over IPsec tunneling driver\n");
540 541
542 msg = "tunnel device";
541 err = register_pernet_device(&vti_net_ops); 543 err = register_pernet_device(&vti_net_ops);
542 if (err < 0) 544 if (err < 0)
543 return err; 545 goto pernet_dev_failed;
544 err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
545 if (err < 0) {
546 unregister_pernet_device(&vti_net_ops);
547 pr_info("vti init: can't register tunnel\n");
548
549 return err;
550 }
551 546
547 msg = "tunnel protocols";
548 err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
549 if (err < 0)
550 goto xfrm_proto_esp_failed;
552 err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); 551 err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
553 if (err < 0) { 552 if (err < 0)
554 xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); 553 goto xfrm_proto_ah_failed;
555 unregister_pernet_device(&vti_net_ops);
556 pr_info("vti init: can't register tunnel\n");
557
558 return err;
559 }
560
561 err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); 554 err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
562 if (err < 0) { 555 if (err < 0)
563 xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); 556 goto xfrm_proto_comp_failed;
564 xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
565 unregister_pernet_device(&vti_net_ops);
566 pr_info("vti init: can't register tunnel\n");
567
568 return err;
569 }
570 557
558 msg = "netlink interface";
571 err = rtnl_link_register(&vti_link_ops); 559 err = rtnl_link_register(&vti_link_ops);
572 if (err < 0) 560 if (err < 0)
573 goto rtnl_link_failed; 561 goto rtnl_link_failed;
@@ -576,23 +564,23 @@ static int __init vti_init(void)
576 564
577rtnl_link_failed: 565rtnl_link_failed:
578 xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); 566 xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
567xfrm_proto_comp_failed:
579 xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); 568 xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
569xfrm_proto_ah_failed:
580 xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); 570 xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
571xfrm_proto_esp_failed:
581 unregister_pernet_device(&vti_net_ops); 572 unregister_pernet_device(&vti_net_ops);
573pernet_dev_failed:
574 pr_err("vti init: failed to register %s\n", msg);
582 return err; 575 return err;
583} 576}
584 577
585static void __exit vti_fini(void) 578static void __exit vti_fini(void)
586{ 579{
587 rtnl_link_unregister(&vti_link_ops); 580 rtnl_link_unregister(&vti_link_ops);
588 if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) 581 xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
589 pr_info("vti close: can't deregister tunnel\n"); 582 xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
590 if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH)) 583 xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
591 pr_info("vti close: can't deregister tunnel\n");
592 if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP))
593 pr_info("vti close: can't deregister tunnel\n");
594
595
596 unregister_pernet_device(&vti_net_ops); 584 unregister_pernet_device(&vti_net_ops);
597} 585}
598 586
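vti_init() above is restructured into the usual unwind-on-error shape: each registration step records a short description in msg, a failure jumps to a label that undoes only the steps already completed, and a single pr_err() reports which stage failed. A compact sketch of that shape with stub step functions (names invented for illustration):

#include <linux/init.h>
#include <linux/printk.h>

static int register_step_a(void) { return 0; }	/* stand-ins for real steps */
static void unregister_step_a(void) { }
static int register_step_b(void) { return 0; }

static int __init demo_init(void)
{
	const char *msg;
	int err;

	msg = "step A";
	err = register_step_a();
	if (err < 0)
		goto step_a_failed;

	msg = "step B";
	err = register_step_b();
	if (err < 0)
		goto step_b_failed;

	return 0;

step_b_failed:
	unregister_step_a();
step_a_failed:
	pr_err("demo init: failed to register %s\n", msg);
	return err;
}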
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b3e86ea7b71b..5bbef4fdcb43 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -143,8 +143,6 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */
143__be32 root_server_addr = NONE; /* Address of NFS server */ 143__be32 root_server_addr = NONE; /* Address of NFS server */
144u8 root_server_path[256] = { 0, }; /* Path to mount as root */ 144u8 root_server_path[256] = { 0, }; /* Path to mount as root */
145 145
146__be32 ic_dev_xid; /* Device under configuration */
147
148/* vendor class identifier */ 146/* vendor class identifier */
149static char vendor_class_identifier[253] __initdata; 147static char vendor_class_identifier[253] __initdata;
150 148
@@ -654,6 +652,7 @@ static struct packet_type bootp_packet_type __initdata = {
654 .func = ic_bootp_recv, 652 .func = ic_bootp_recv,
655}; 653};
656 654
655static __be32 ic_dev_xid; /* Device under configuration */
657 656
658/* 657/*
659 * Initialize DHCP/BOOTP extension fields in the request. 658 * Initialize DHCP/BOOTP extension fields in the request.
@@ -1218,10 +1217,10 @@ static int __init ic_dynamic(void)
1218 get_random_bytes(&timeout, sizeof(timeout)); 1217 get_random_bytes(&timeout, sizeof(timeout));
1219 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); 1218 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
1220 for (;;) { 1219 for (;;) {
1220#ifdef IPCONFIG_BOOTP
1221 /* Track the device we are configuring */ 1221 /* Track the device we are configuring */
1222 ic_dev_xid = d->xid; 1222 ic_dev_xid = d->xid;
1223 1223
1224#ifdef IPCONFIG_BOOTP
1225 if (do_bootp && (d->able & IC_BOOTP)) 1224 if (do_bootp && (d->able & IC_BOOTP))
1226 ic_bootp_send_if(d, jiffies - start_jiffies); 1225 ic_bootp_send_if(d, jiffies - start_jiffies);
1227#endif 1226#endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 65bcaa789043..c8034587859d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -500,7 +500,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
500 else 500 else
501 sprintf(name, "pimreg%u", mrt->id); 501 sprintf(name, "pimreg%u", mrt->id);
502 502
503 dev = alloc_netdev(0, name, reg_vif_setup); 503 dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
504 504
505 if (dev == NULL) 505 if (dev == NULL)
506 return NULL; 506 return NULL;
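The ipmr.c hunk above (and the ip_tunnel.c one earlier) reflects highlight 13: alloc_netdev() now takes a name-assignment type argument (NET_NAME_UNKNOWN for these kernel-chosen names), which userspace can later query to see how a device got its name. An illustrative call under the new signature; my_setup and the "demo%d" template are placeholders, not from this diff:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static void my_setup(struct net_device *dev)
{
	ether_setup(dev);	/* placeholder setup callback */
}

static struct net_device *demo_create(void)
{
	/* extra argument vs. the old API: the name assignment type */
	return alloc_netdev(0, "demo%d", NET_NAME_UNKNOWN, my_setup);
}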
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a26ce035e3fa..fb173126f03d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,6 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT
36 36
37 If unsure, say Y. 37 If unsure, say Y.
38 38
39config NF_LOG_ARP
40 tristate "ARP packet logging"
41 default m if NETFILTER_ADVANCED=n
42 select NF_LOG_COMMON
43
44config NF_LOG_IPV4
45 tristate "IPv4 packet logging"
46 default m if NETFILTER_ADVANCED=n
47 select NF_LOG_COMMON
48
39config NF_TABLES_IPV4 49config NF_TABLES_IPV4
40 depends on NF_TABLES 50 depends on NF_TABLES
41 tristate "IPv4 nf_tables support" 51 tristate "IPv4 nf_tables support"
@@ -159,25 +169,6 @@ config IP_NF_TARGET_SYNPROXY
159 169
160 To compile it as a module, choose M here. If unsure, say N. 170 To compile it as a module, choose M here. If unsure, say N.
161 171
162config IP_NF_TARGET_ULOG
163 tristate "ULOG target support (obsolete)"
164 default m if NETFILTER_ADVANCED=n
165 ---help---
166
167 This option enables the old IPv4-only "ipt_ULOG" implementation
168 which has been obsoleted by the new "nfnetlink_log" code (see
169 CONFIG_NETFILTER_NETLINK_LOG).
170
171 This option adds a `ULOG' target, which allows you to create rules in
172 any iptables table. The packet is passed to a userspace logging
173 daemon using netlink multicast sockets; unlike the LOG target
174 which can only be viewed through syslog.
175
176 The appropriate userspace logging daemon (ulogd) may be obtained from
177 <http://www.netfilter.org/projects/ulogd/index.html>
178
179 To compile it as a module, choose M here. If unsure, say N.
180
181# NAT + specific targets: nf_conntrack 172# NAT + specific targets: nf_conntrack
182config NF_NAT_IPV4 173config NF_NAT_IPV4
183 tristate "IPv4 NAT" 174 tristate "IPv4 NAT"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 90b82405331e..33001621465b 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -19,6 +19,10 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
19# defrag 19# defrag
20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
21 21
22# logging
23obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
24obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
25
22# NAT helpers (nf_conntrack) 26# NAT helpers (nf_conntrack)
23obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o 27obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
24obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o 28obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
@@ -53,7 +57,6 @@ obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
53obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o 57obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
54obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 58obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
55obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o 59obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
56obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
57 60
58# generic ARP tables 61# generic ARP tables
59obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o 62obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
deleted file mode 100644
index 9cb993cd224b..000000000000
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ /dev/null
@@ -1,498 +0,0 @@
1/*
2 * netfilter module for userspace packet logging daemons
3 *
4 * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
5 * (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 * (C) 2005-2007 Patrick McHardy <kaber@trash.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This module accepts two parameters:
14 *
15 * nlbufsiz:
16 * The parameter specifies how big the buffer for each netlink multicast
17 * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
18 * get accumulated in the kernel until they are sent to userspace. It is
19 * NOT possible to allocate more than 128kB, and it is strongly discouraged,
20 * because atomically allocating 128kB inside the network rx softirq is not
21 * reliable. Please also keep in mind that this buffer size is allocated for
22 * each nlgroup you are using, so the total kernel memory usage increases
23 * by that factor.
24 *
25 * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since
26 * nlbufsiz is used with alloc_skb, which adds another
27 * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead.
28 *
29 * flushtimeout:
30 * Specify, after how many hundredths of a second the queue should be
31 * flushed even if it is not full yet.
32 */
33#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
34#include <linux/module.h>
35#include <linux/spinlock.h>
36#include <linux/socket.h>
37#include <linux/slab.h>
38#include <linux/skbuff.h>
39#include <linux/kernel.h>
40#include <linux/timer.h>
41#include <net/netlink.h>
42#include <linux/netdevice.h>
43#include <linux/mm.h>
44#include <linux/moduleparam.h>
45#include <linux/netfilter.h>
46#include <linux/netfilter/x_tables.h>
47#include <linux/netfilter_ipv4/ipt_ULOG.h>
48#include <net/netfilter/nf_log.h>
49#include <net/netns/generic.h>
50#include <net/sock.h>
51#include <linux/bitops.h>
52#include <asm/unaligned.h>
53
54MODULE_LICENSE("GPL");
55MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
56MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG");
57MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
58
59#define ULOG_NL_EVENT 111 /* Harald's favorite number */
60#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
61
62static unsigned int nlbufsiz = NLMSG_GOODSIZE;
63module_param(nlbufsiz, uint, 0400);
64MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
65
66static unsigned int flushtimeout = 10;
67module_param(flushtimeout, uint, 0600);
68MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
69
70static bool nflog = true;
71module_param(nflog, bool, 0400);
72MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
73
74/* global data structures */
75
76typedef struct {
77 unsigned int qlen; /* number of nlmsgs' in the skb */
78 struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
79 struct sk_buff *skb; /* the pre-allocated skb */
80 struct timer_list timer; /* the timer function */
81} ulog_buff_t;
82
83static int ulog_net_id __read_mostly;
84struct ulog_net {
85 unsigned int nlgroup[ULOG_MAXNLGROUPS];
86 ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
87 struct sock *nflognl;
88 spinlock_t lock;
89};
90
91static struct ulog_net *ulog_pernet(struct net *net)
92{
93 return net_generic(net, ulog_net_id);
94}
95
96/* send one ulog_buff_t to userspace */
97static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
98{
99 ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
100
101 pr_debug("ulog_send: timer is deleting\n");
102 del_timer(&ub->timer);
103
104 if (!ub->skb) {
105 pr_debug("ulog_send: nothing to send\n");
106 return;
107 }
108
109 /* last nlmsg needs NLMSG_DONE */
110 if (ub->qlen > 1)
111 ub->lastnlh->nlmsg_type = NLMSG_DONE;
112
113 NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
114 pr_debug("throwing %d packets to netlink group %u\n",
115 ub->qlen, nlgroupnum + 1);
116 netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
117 GFP_ATOMIC);
118
119 ub->qlen = 0;
120 ub->skb = NULL;
121 ub->lastnlh = NULL;
122}
123
124
125/* timer function to flush queue in flushtimeout time */
126static void ulog_timer(unsigned long data)
127{
128 unsigned int groupnum = *((unsigned int *)data);
129 struct ulog_net *ulog = container_of((void *)data,
130 struct ulog_net,
131 nlgroup[groupnum]);
132 pr_debug("timer function called, calling ulog_send\n");
133
134 /* lock to protect against somebody modifying our structure
135 * from ipt_ulog_target at the same time */
136 spin_lock_bh(&ulog->lock);
137 ulog_send(ulog, groupnum);
138 spin_unlock_bh(&ulog->lock);
139}
140
141static struct sk_buff *ulog_alloc_skb(unsigned int size)
142{
143 struct sk_buff *skb;
144 unsigned int n;
145
146 /* alloc skb which should be big enough for a whole
147 * multipart message. WARNING: has to be <= 131000
148 * due to slab allocator restrictions */
149
150 n = max(size, nlbufsiz);
151 skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
152 if (!skb) {
153 if (n > size) {
154 /* try to allocate only as much as we need for
155 * current packet */
156
157 skb = alloc_skb(size, GFP_ATOMIC);
158 if (!skb)
159 pr_debug("cannot even allocate %ub\n", size);
160 }
161 }
162
163 return skb;
164}
165
166static void ipt_ulog_packet(struct net *net,
167 unsigned int hooknum,
168 const struct sk_buff *skb,
169 const struct net_device *in,
170 const struct net_device *out,
171 const struct ipt_ulog_info *loginfo,
172 const char *prefix)
173{
174 ulog_buff_t *ub;
175 ulog_packet_msg_t *pm;
176 size_t size, copy_len;
177 struct nlmsghdr *nlh;
178 struct timeval tv;
179 struct ulog_net *ulog = ulog_pernet(net);
180
181 /* ffs == find first bit set, necessary because userspace
182 * is already shifting groupnumber, but we need unshifted.
183 * ffs() returns [1..32], we need [0..31] */
184 unsigned int groupnum = ffs(loginfo->nl_group) - 1;
185
186 /* calculate the size of the skb needed */
187 if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len)
188 copy_len = skb->len;
189 else
190 copy_len = loginfo->copy_range;
191
192 size = nlmsg_total_size(sizeof(*pm) + copy_len);
193
194 ub = &ulog->ulog_buffers[groupnum];
195
196 spin_lock_bh(&ulog->lock);
197
198 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size)))
200 goto alloc_failure;
201 } else if (ub->qlen >= loginfo->qthreshold ||
202 size > skb_tailroom(ub->skb)) {
203 /* either the queue len is too high or we don't have
204 * enough room in nlskb left. send it to userspace. */
205
206 ulog_send(ulog, groupnum);
207
208 if (!(ub->skb = ulog_alloc_skb(size)))
209 goto alloc_failure;
210 }
211
212 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
213
214 nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
215 sizeof(*pm)+copy_len, 0);
216 if (!nlh) {
217 pr_debug("error during nlmsg_put\n");
218 goto out_unlock;
219 }
220 ub->qlen++;
221
222 pm = nlmsg_data(nlh);
223 memset(pm, 0, sizeof(*pm));
224
225 /* We might not have a timestamp, get one */
226 if (skb->tstamp.tv64 == 0)
227 __net_timestamp((struct sk_buff *)skb);
228
229 /* copy hook, prefix, timestamp, payload, etc. */
230 pm->data_len = copy_len;
231 tv = ktime_to_timeval(skb->tstamp);
232 put_unaligned(tv.tv_sec, &pm->timestamp_sec);
233 put_unaligned(tv.tv_usec, &pm->timestamp_usec);
234 put_unaligned(skb->mark, &pm->mark);
235 pm->hook = hooknum;
236 if (prefix != NULL) {
237 strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1);
238 pm->prefix[sizeof(pm->prefix) - 1] = '\0';
239 }
240 else if (loginfo->prefix[0] != '\0')
241 strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
242
243 if (in && in->hard_header_len > 0 &&
244 skb->mac_header != skb->network_header &&
245 in->hard_header_len <= ULOG_MAC_LEN) {
246 memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
247 pm->mac_len = in->hard_header_len;
248 } else
249 pm->mac_len = 0;
250
251 if (in)
252 strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
253
254 if (out)
255 strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
256
257 /* copy_len <= skb->len, so can't fail. */
258 if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
259 BUG();
260
261 /* check if we are building multi-part messages */
262 if (ub->qlen > 1)
263 ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
264
265 ub->lastnlh = nlh;
266
267 /* if timer isn't already running, start it */
268 if (!timer_pending(&ub->timer)) {
269 ub->timer.expires = jiffies + flushtimeout * HZ / 100;
270 add_timer(&ub->timer);
271 }
272
273 /* if threshold is reached, send message to userspace */
274 if (ub->qlen >= loginfo->qthreshold) {
275 if (loginfo->qthreshold > 1)
276 nlh->nlmsg_type = NLMSG_DONE;
277 ulog_send(ulog, groupnum);
278 }
279out_unlock:
280 spin_unlock_bh(&ulog->lock);
281
282 return;
283
284alloc_failure:
285 pr_debug("Error building netlink message\n");
286 spin_unlock_bh(&ulog->lock);
287}
288
289static unsigned int
290ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
291{
292 struct net *net = dev_net(par->in ? par->in : par->out);
293
294 ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
295 par->targinfo, NULL);
296 return XT_CONTINUE;
297}
298
299static void ipt_logfn(struct net *net,
300 u_int8_t pf,
301 unsigned int hooknum,
302 const struct sk_buff *skb,
303 const struct net_device *in,
304 const struct net_device *out,
305 const struct nf_loginfo *li,
306 const char *prefix)
307{
308 struct ipt_ulog_info loginfo;
309
310 if (!li || li->type != NF_LOG_TYPE_ULOG) {
311 loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
312 loginfo.copy_range = 0;
313 loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
314 loginfo.prefix[0] = '\0';
315 } else {
316 loginfo.nl_group = li->u.ulog.group;
317 loginfo.copy_range = li->u.ulog.copy_len;
318 loginfo.qthreshold = li->u.ulog.qthreshold;
319 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
320 }
321
322 ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
323}
324
325static int ulog_tg_check(const struct xt_tgchk_param *par)
326{
327 const struct ipt_ulog_info *loginfo = par->targinfo;
328
329 if (!par->net->xt.ulog_warn_deprecated) {
330 pr_info("ULOG is deprecated and it will be removed soon, "
331 "use NFLOG instead\n");
332 par->net->xt.ulog_warn_deprecated = true;
333 }
334
335 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
336 pr_debug("prefix not null-terminated\n");
337 return -EINVAL;
338 }
339 if (loginfo->qthreshold > ULOG_MAX_QLEN) {
340 pr_debug("queue threshold %Zu > MAX_QLEN\n",
341 loginfo->qthreshold);
342 return -EINVAL;
343 }
344 return 0;
345}
346
347#ifdef CONFIG_COMPAT
348struct compat_ipt_ulog_info {
349 compat_uint_t nl_group;
350 compat_size_t copy_range;
351 compat_size_t qthreshold;
352 char prefix[ULOG_PREFIX_LEN];
353};
354
355static void ulog_tg_compat_from_user(void *dst, const void *src)
356{
357 const struct compat_ipt_ulog_info *cl = src;
358 struct ipt_ulog_info l = {
359 .nl_group = cl->nl_group,
360 .copy_range = cl->copy_range,
361 .qthreshold = cl->qthreshold,
362 };
363
364 memcpy(l.prefix, cl->prefix, sizeof(l.prefix));
365 memcpy(dst, &l, sizeof(l));
366}
367
368static int ulog_tg_compat_to_user(void __user *dst, const void *src)
369{
370 const struct ipt_ulog_info *l = src;
371 struct compat_ipt_ulog_info cl = {
372 .nl_group = l->nl_group,
373 .copy_range = l->copy_range,
374 .qthreshold = l->qthreshold,
375 };
376
377 memcpy(cl.prefix, l->prefix, sizeof(cl.prefix));
378 return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0;
379}
380#endif /* CONFIG_COMPAT */
381
382static struct xt_target ulog_tg_reg __read_mostly = {
383 .name = "ULOG",
384 .family = NFPROTO_IPV4,
385 .target = ulog_tg,
386 .targetsize = sizeof(struct ipt_ulog_info),
387 .checkentry = ulog_tg_check,
388#ifdef CONFIG_COMPAT
389 .compatsize = sizeof(struct compat_ipt_ulog_info),
390 .compat_from_user = ulog_tg_compat_from_user,
391 .compat_to_user = ulog_tg_compat_to_user,
392#endif
393 .me = THIS_MODULE,
394};
395
396static struct nf_logger ipt_ulog_logger __read_mostly = {
397 .name = "ipt_ULOG",
398 .logfn = ipt_logfn,
399 .me = THIS_MODULE,
400};
401
402static int __net_init ulog_tg_net_init(struct net *net)
403{
404 int i;
405 struct ulog_net *ulog = ulog_pernet(net);
406 struct netlink_kernel_cfg cfg = {
407 .groups = ULOG_MAXNLGROUPS,
408 };
409
410 spin_lock_init(&ulog->lock);
411 /* initialize ulog_buffers */
412 for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
413 ulog->nlgroup[i] = i;
414 setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer,
415 (unsigned long)&ulog->nlgroup[i]);
416 }
417
418 ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
419 if (!ulog->nflognl)
420 return -ENOMEM;
421
422 if (nflog)
423 nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
424
425 return 0;
426}
427
428static void __net_exit ulog_tg_net_exit(struct net *net)
429{
430 ulog_buff_t *ub;
431 int i;
432 struct ulog_net *ulog = ulog_pernet(net);
433
434 if (nflog)
435 nf_log_unset(net, &ipt_ulog_logger);
436
437 netlink_kernel_release(ulog->nflognl);
438
439 /* remove pending timers and free allocated skb's */
440 for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
441 ub = &ulog->ulog_buffers[i];
442 pr_debug("timer is deleting\n");
443 del_timer(&ub->timer);
444
445 if (ub->skb) {
446 kfree_skb(ub->skb);
447 ub->skb = NULL;
448 }
449 }
450}
451
452static struct pernet_operations ulog_tg_net_ops = {
453 .init = ulog_tg_net_init,
454 .exit = ulog_tg_net_exit,
455 .id = &ulog_net_id,
456 .size = sizeof(struct ulog_net),
457};
458
459static int __init ulog_tg_init(void)
460{
461 int ret;
462 pr_debug("init module\n");
463
464 if (nlbufsiz > 128*1024) {
465 pr_warn("Netlink buffer has to be <= 128kB\n");
466 return -EINVAL;
467 }
468
469 ret = register_pernet_subsys(&ulog_tg_net_ops);
470 if (ret)
471 goto out_pernet;
472
473 ret = xt_register_target(&ulog_tg_reg);
474 if (ret < 0)
475 goto out_target;
476
477 if (nflog)
478 nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
479
480 return 0;
481
482out_target:
483 unregister_pernet_subsys(&ulog_tg_net_ops);
484out_pernet:
485 return ret;
486}
487
488static void __exit ulog_tg_exit(void)
489{
490 pr_debug("cleanup_module\n");
491 if (nflog)
492 nf_log_unregister(&ipt_ulog_logger);
493 xt_unregister_target(&ulog_tg_reg);
494 unregister_pernet_subsys(&ulog_tg_net_ops);
495}
496
497module_init(ulog_tg_init);
498module_exit(ulog_tg_exit);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 8127dc802865..a054fe083431 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -314,7 +314,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
314 return -ENOENT; 314 return -ENOENT;
315} 315}
316 316
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
318 318
319#include <linux/netfilter/nfnetlink.h> 319#include <linux/netfilter/nfnetlink.h>
320#include <linux/netfilter/nfnetlink_conntrack.h> 320#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -358,7 +358,7 @@ static struct nf_sockopt_ops so_getorigdst = {
358 .pf = PF_INET, 358 .pf = PF_INET,
359 .get_optmin = SO_ORIGINAL_DST, 359 .get_optmin = SO_ORIGINAL_DST,
360 .get_optmax = SO_ORIGINAL_DST+1, 360 .get_optmax = SO_ORIGINAL_DST+1,
-	.get		= &getorigdst,
+	.get		= getorigdst,
362 .owner = THIS_MODULE, 362 .owner = THIS_MODULE,
363}; 363};
364 364
@@ -388,7 +388,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
388 .invert_tuple = ipv4_invert_tuple, 388 .invert_tuple = ipv4_invert_tuple,
389 .print_tuple = ipv4_print_tuple, 389 .print_tuple = ipv4_print_tuple,
390 .get_l4proto = ipv4_get_l4proto, 390 .get_l4proto = ipv4_get_l4proto,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
392 .tuple_to_nlattr = ipv4_tuple_to_nlattr, 392 .tuple_to_nlattr = ipv4_tuple_to_nlattr,
393 .nlattr_tuple_size = ipv4_nlattr_tuple_size, 393 .nlattr_tuple_size = ipv4_nlattr_tuple_size,
394 .nlattr_to_tuple = ipv4_nlattr_to_tuple, 394 .nlattr_to_tuple = ipv4_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index a338dad41b7d..b91b2641adda 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -226,7 +226,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
226 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); 226 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
227} 227}
228 228
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
230 230
231#include <linux/netfilter/nfnetlink.h> 231#include <linux/netfilter/nfnetlink.h>
232#include <linux/netfilter/nfnetlink_conntrack.h> 232#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -408,7 +408,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
408 .error = icmp_error, 408 .error = icmp_error,
409 .destroy = NULL, 409 .destroy = NULL,
410 .me = NULL, 410 .me = NULL,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
412 .tuple_to_nlattr = icmp_tuple_to_nlattr, 412 .tuple_to_nlattr = icmp_tuple_to_nlattr,
413 .nlattr_tuple_size = icmp_nlattr_tuple_size, 413 .nlattr_tuple_size = icmp_nlattr_tuple_size,
414 .nlattr_to_tuple = icmp_nlattr_to_tuple, 414 .nlattr_to_tuple = icmp_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index b8f6381c7d0b..76bd1aef257f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -17,7 +17,7 @@
17#include <linux/netfilter_bridge.h> 17#include <linux/netfilter_bridge.h>
18#include <linux/netfilter_ipv4.h> 18#include <linux/netfilter_ipv4.h>
19#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 19#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
21#include <net/netfilter/nf_conntrack.h> 21#include <net/netfilter/nf_conntrack.h>
22#endif 22#endif
23#include <net/netfilter/nf_conntrack_zones.h> 23#include <net/netfilter/nf_conntrack_zones.h>
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
45{ 45{
46 u16 zone = NF_CT_DEFAULT_ZONE; 46 u16 zone = NF_CT_DEFAULT_ZONE;
47 47
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
49 if (skb->nfct) 49 if (skb->nfct)
50 zone = nf_ct_zone((struct nf_conn *)skb->nfct); 50 zone = nf_ct_zone((struct nf_conn *)skb->nfct);
51#endif 51#endif
@@ -74,8 +74,8 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
74 inet->nodefrag) 74 inet->nodefrag)
75 return NF_ACCEPT; 75 return NF_ACCEPT;
76 76
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#if !IS_ENABLED(CONFIG_NF_NAT)
79 /* Previously seen (loopback)? Ignore. Do this before 79 /* Previously seen (loopback)? Ignore. Do this before
80 fragment check. */ 80 fragment check. */
81 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) 81 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
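
The hunks above (and several more below) replace open-coded "#if defined(CONFIG_FOO) || defined(CONFIG_FOO_MODULE)" checks with IS_ENABLED(CONFIG_FOO). For readers unfamiliar with that helper, here is a standalone re-statement of the preprocessor trick behind it; it mirrors include/linux/kconfig.h in simplified form, is illustrative only (not part of the patch), and builds with GCC or Clang:

/* is_enabled_demo.c - illustrative re-statement of the IS_ENABLED() trick.
 * Simulated Kconfig output: FOO=y, BAR=m, BAZ unset.
 */
#include <stdio.h>

#define CONFIG_FOO 1
#define CONFIG_BAR_MODULE 1

/* If the option is defined to 1, the placeholder expands to "0," and the
 * second argument seen by take_second_arg() is 1; otherwise it is 0. */
#define ARG_PLACEHOLDER_1 0,
#define take_second_arg(ignored, val, ...) val
#define is_defined3(arg1_or_junk) take_second_arg(arg1_or_junk 1, 0)
#define is_defined2(val) is_defined3(ARG_PLACEHOLDER_##val)
#define is_defined(x) is_defined2(x)

/* True for =y (option defined) or =m (option##_MODULE defined). */
#define IS_ENABLED(option) (is_defined(option) || is_defined(option##_MODULE))

int main(void)
{
	printf("CONFIG_FOO (=y):    %d\n", IS_ENABLED(CONFIG_FOO));   /* 1 */
	printf("CONFIG_BAR (=m):    %d\n", IS_ENABLED(CONFIG_BAR));   /* 1 */
	printf("CONFIG_BAZ (unset): %d\n", IS_ENABLED(CONFIG_BAZ));   /* 0 */
	return 0;
}

The conversions in these hunks are therefore behavior-preserving: IS_ENABLED(X) is true when X is built in or modular, exactly like the defined()/defined(_MODULE) pair it replaces, just shorter and harder to get wrong.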
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
new file mode 100644
index 000000000000..ccfc78db12ee
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -0,0 +1,149 @@
1/*
2 * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * Based on code from ebt_log from:
5 *
6 * Bart De Schuymer <bdschuym@pandora.be>
7 * Harald Welte <laforge@netfilter.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/skbuff.h>
17#include <linux/if_arp.h>
18#include <linux/ip.h>
19#include <net/route.h>
20
21#include <linux/netfilter.h>
22#include <linux/netfilter/xt_LOG.h>
23#include <net/netfilter/nf_log.h>
24
25static struct nf_loginfo default_loginfo = {
26 .type = NF_LOG_TYPE_LOG,
27 .u = {
28 .log = {
29 .level = 5,
30 .logflags = NF_LOG_MASK,
31 },
32 },
33};
34
35struct arppayload {
36 unsigned char mac_src[ETH_ALEN];
37 unsigned char ip_src[4];
38 unsigned char mac_dst[ETH_ALEN];
39 unsigned char ip_dst[4];
40};
41
42static void dump_arp_packet(struct nf_log_buf *m,
43 const struct nf_loginfo *info,
44 const struct sk_buff *skb, unsigned int nhoff)
45{
46 const struct arphdr *ah;
47 struct arphdr _arph;
48 const struct arppayload *ap;
49 struct arppayload _arpp;
50
51 ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
52 if (ah == NULL) {
53 nf_log_buf_add(m, "TRUNCATED");
54 return;
55 }
56 nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
57 ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
58
59 /* If it's for Ethernet and the lengths are OK, then log the ARP
60 * payload.
61 */
62 if (ah->ar_hrd != htons(1) ||
63 ah->ar_hln != ETH_ALEN ||
64 ah->ar_pln != sizeof(__be32))
65 return;
66
67 ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
68 if (ap == NULL) {
69 nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
70 skb->len - sizeof(_arph));
71 return;
72 }
73 nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
74 ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
75}
76
77void nf_log_arp_packet(struct net *net, u_int8_t pf,
78 unsigned int hooknum, const struct sk_buff *skb,
79 const struct net_device *in,
80 const struct net_device *out,
81 const struct nf_loginfo *loginfo,
82 const char *prefix)
83{
84 struct nf_log_buf *m;
85
86 /* FIXME: Disabled from containers until syslog ns is supported */
87 if (!net_eq(net, &init_net))
88 return;
89
90 m = nf_log_buf_open();
91
92 if (!loginfo)
93 loginfo = &default_loginfo;
94
95 nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
96 prefix);
97 dump_arp_packet(m, loginfo, skb, 0);
98
99 nf_log_buf_close(m);
100}
101
102static struct nf_logger nf_arp_logger __read_mostly = {
103 .name = "nf_log_arp",
104 .type = NF_LOG_TYPE_LOG,
105 .logfn = nf_log_arp_packet,
106 .me = THIS_MODULE,
107};
108
109static int __net_init nf_log_arp_net_init(struct net *net)
110{
111 nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
112 return 0;
113}
114
115static void __net_exit nf_log_arp_net_exit(struct net *net)
116{
117 nf_log_unset(net, &nf_arp_logger);
118}
119
120static struct pernet_operations nf_log_arp_net_ops = {
121 .init = nf_log_arp_net_init,
122 .exit = nf_log_arp_net_exit,
123};
124
125static int __init nf_log_arp_init(void)
126{
127 int ret;
128
129 ret = register_pernet_subsys(&nf_log_arp_net_ops);
130 if (ret < 0)
131 return ret;
132
133 nf_log_register(NFPROTO_ARP, &nf_arp_logger);
134 return 0;
135}
136
137static void __exit nf_log_arp_exit(void)
138{
139 unregister_pernet_subsys(&nf_log_arp_net_ops);
140 nf_log_unregister(&nf_arp_logger);
141}
142
143module_init(nf_log_arp_init);
144module_exit(nf_log_arp_exit);
145
146MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
147MODULE_DESCRIPTION("Netfilter ARP packet logging");
148MODULE_LICENSE("GPL");
149MODULE_ALIAS_NF_LOGGER(3, 0);
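
The new nf_log_arp backend registers itself for NFPROTO_ARP, hence MODULE_ALIAS_NF_LOGGER(3, 0) above (3 is NFPROTO_ARP). Once loaded it can be selected as the active ARP logger through the long-standing nf_log sysctl interface; the snippet below is a hypothetical illustration of that pre-existing interface (path and semantics assumed from net/netfilter/nf_log.c, root required), not something introduced by this patch:

/* bind_arp_logger.c - select nf_log_arp as the ARP-family logger (sketch). */
#include <stdio.h>

int main(void)
{
	/* net.netfilter.nf_log.<pf>, where pf 3 == NFPROTO_ARP */
	FILE *f = fopen("/proc/sys/net/netfilter/nf_log/3", "w");

	if (!f) {
		perror("nf_log/3");
		return 1;
	}
	fputs("nf_log_arp", f);
	fclose(f);
	return 0;
}

Note that the logging function itself still refuses to log from non-init network namespaces, as the FIXME in nf_log_arp_packet() above records.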
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
new file mode 100644
index 000000000000..078bdca1b607
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -0,0 +1,385 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <linux/skbuff.h>
12#include <linux/if_arp.h>
13#include <linux/ip.h>
14#include <net/ipv6.h>
15#include <net/icmp.h>
16#include <net/udp.h>
17#include <net/tcp.h>
18#include <net/route.h>
19
20#include <linux/netfilter.h>
21#include <linux/netfilter/xt_LOG.h>
22#include <net/netfilter/nf_log.h>
23
24static struct nf_loginfo default_loginfo = {
25 .type = NF_LOG_TYPE_LOG,
26 .u = {
27 .log = {
28 .level = 5,
29 .logflags = NF_LOG_MASK,
30 },
31 },
32};
33
34/* One level of recursion won't kill us */
35static void dump_ipv4_packet(struct nf_log_buf *m,
36 const struct nf_loginfo *info,
37 const struct sk_buff *skb, unsigned int iphoff)
38{
39 struct iphdr _iph;
40 const struct iphdr *ih;
41 unsigned int logflags;
42
43 if (info->type == NF_LOG_TYPE_LOG)
44 logflags = info->u.log.logflags;
45 else
46 logflags = NF_LOG_MASK;
47
48 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
49 if (ih == NULL) {
50 nf_log_buf_add(m, "TRUNCATED");
51 return;
52 }
53
54 /* Important fields:
55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
57 nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
58
59 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
60 nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
61 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
62 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
63
64 /* Max length: 6 "CE DF MF " */
65 if (ntohs(ih->frag_off) & IP_CE)
66 nf_log_buf_add(m, "CE ");
67 if (ntohs(ih->frag_off) & IP_DF)
68 nf_log_buf_add(m, "DF ");
69 if (ntohs(ih->frag_off) & IP_MF)
70 nf_log_buf_add(m, "MF ");
71
72 /* Max length: 11 "FRAG:65535 " */
73 if (ntohs(ih->frag_off) & IP_OFFSET)
74 nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
75
76 if ((logflags & XT_LOG_IPOPT) &&
77 ih->ihl * 4 > sizeof(struct iphdr)) {
78 const unsigned char *op;
79 unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
80 unsigned int i, optsize;
81
82 optsize = ih->ihl * 4 - sizeof(struct iphdr);
83 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
84 optsize, _opt);
85 if (op == NULL) {
86 nf_log_buf_add(m, "TRUNCATED");
87 return;
88 }
89
90 /* Max length: 127 "OPT (" 15*4*2chars ") " */
91 nf_log_buf_add(m, "OPT (");
92 for (i = 0; i < optsize; i++)
93 nf_log_buf_add(m, "%02X", op[i]);
94 nf_log_buf_add(m, ") ");
95 }
96
97 switch (ih->protocol) {
98 case IPPROTO_TCP:
99 if (nf_log_dump_tcp_header(m, skb, ih->protocol,
100 ntohs(ih->frag_off) & IP_OFFSET,
101 iphoff+ih->ihl*4, logflags))
102 return;
103 break;
104 case IPPROTO_UDP:
105 case IPPROTO_UDPLITE:
106 if (nf_log_dump_udp_header(m, skb, ih->protocol,
107 ntohs(ih->frag_off) & IP_OFFSET,
108 iphoff+ih->ihl*4))
109 return;
110 break;
111 case IPPROTO_ICMP: {
112 struct icmphdr _icmph;
113 const struct icmphdr *ich;
114 static const size_t required_len[NR_ICMP_TYPES+1]
115 = { [ICMP_ECHOREPLY] = 4,
116 [ICMP_DEST_UNREACH]
117 = 8 + sizeof(struct iphdr),
118 [ICMP_SOURCE_QUENCH]
119 = 8 + sizeof(struct iphdr),
120 [ICMP_REDIRECT]
121 = 8 + sizeof(struct iphdr),
122 [ICMP_ECHO] = 4,
123 [ICMP_TIME_EXCEEDED]
124 = 8 + sizeof(struct iphdr),
125 [ICMP_PARAMETERPROB]
126 = 8 + sizeof(struct iphdr),
127 [ICMP_TIMESTAMP] = 20,
128 [ICMP_TIMESTAMPREPLY] = 20,
129 [ICMP_ADDRESS] = 12,
130 [ICMP_ADDRESSREPLY] = 12 };
131
132 /* Max length: 11 "PROTO=ICMP " */
133 nf_log_buf_add(m, "PROTO=ICMP ");
134
135 if (ntohs(ih->frag_off) & IP_OFFSET)
136 break;
137
138 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
139 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
140 sizeof(_icmph), &_icmph);
141 if (ich == NULL) {
142 nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
143 skb->len - iphoff - ih->ihl*4);
144 break;
145 }
146
147 /* Max length: 18 "TYPE=255 CODE=255 " */
148 nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
149
150 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
151 if (ich->type <= NR_ICMP_TYPES &&
152 required_len[ich->type] &&
153 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
154 nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
155 skb->len - iphoff - ih->ihl*4);
156 break;
157 }
158
159 switch (ich->type) {
160 case ICMP_ECHOREPLY:
161 case ICMP_ECHO:
162 /* Max length: 19 "ID=65535 SEQ=65535 " */
163 nf_log_buf_add(m, "ID=%u SEQ=%u ",
164 ntohs(ich->un.echo.id),
165 ntohs(ich->un.echo.sequence));
166 break;
167
168 case ICMP_PARAMETERPROB:
169 /* Max length: 14 "PARAMETER=255 " */
170 nf_log_buf_add(m, "PARAMETER=%u ",
171 ntohl(ich->un.gateway) >> 24);
172 break;
173 case ICMP_REDIRECT:
174 /* Max length: 24 "GATEWAY=255.255.255.255 " */
175 nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
176 /* Fall through */
177 case ICMP_DEST_UNREACH:
178 case ICMP_SOURCE_QUENCH:
179 case ICMP_TIME_EXCEEDED:
180 /* Max length: 3+maxlen */
181 if (!iphoff) { /* Only recurse once. */
182 nf_log_buf_add(m, "[");
183 dump_ipv4_packet(m, info, skb,
184 iphoff + ih->ihl*4+sizeof(_icmph));
185 nf_log_buf_add(m, "] ");
186 }
187
188 /* Max length: 10 "MTU=65535 " */
189 if (ich->type == ICMP_DEST_UNREACH &&
190 ich->code == ICMP_FRAG_NEEDED) {
191 nf_log_buf_add(m, "MTU=%u ",
192 ntohs(ich->un.frag.mtu));
193 }
194 }
195 break;
196 }
197 /* Max Length */
198 case IPPROTO_AH: {
199 struct ip_auth_hdr _ahdr;
200 const struct ip_auth_hdr *ah;
201
202 if (ntohs(ih->frag_off) & IP_OFFSET)
203 break;
204
205 /* Max length: 9 "PROTO=AH " */
206 nf_log_buf_add(m, "PROTO=AH ");
207
208 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
209 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
210 sizeof(_ahdr), &_ahdr);
211 if (ah == NULL) {
212 nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
213 skb->len - iphoff - ih->ihl*4);
214 break;
215 }
216
217 /* Length: 15 "SPI=0xF1234567 " */
218 nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
219 break;
220 }
221 case IPPROTO_ESP: {
222 struct ip_esp_hdr _esph;
223 const struct ip_esp_hdr *eh;
224
225 /* Max length: 10 "PROTO=ESP " */
226 nf_log_buf_add(m, "PROTO=ESP ");
227
228 if (ntohs(ih->frag_off) & IP_OFFSET)
229 break;
230
231 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
232 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
233 sizeof(_esph), &_esph);
234 if (eh == NULL) {
235 nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
236 skb->len - iphoff - ih->ihl*4);
237 break;
238 }
239
240 /* Length: 15 "SPI=0xF1234567 " */
241 nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
242 break;
243 }
244 /* Max length: 10 "PROTO 255 " */
245 default:
246 nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
247 }
248
249 /* Max length: 15 "UID=4294967295 " */
250 if ((logflags & XT_LOG_UID) && !iphoff)
251 nf_log_dump_sk_uid_gid(m, skb->sk);
252
253 /* Max length: 16 "MARK=0xFFFFFFFF " */
254 if (!iphoff && skb->mark)
255 nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
256
257 /* Proto Max log string length */
258 /* IP: 40+46+6+11+127 = 230 */
259 /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
260 /* UDP: 10+max(25,20) = 35 */
261 /* UDPLITE: 14+max(25,20) = 39 */
262 /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
263 /* ESP: 10+max(25)+15 = 50 */
264 /* AH: 9+max(25)+15 = 49 */
265 /* unknown: 10 */
266
267 /* (ICMP allows recursion one level deep) */
268 /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
269 /* maxlen = 230+ 91 + 230 + 252 = 803 */
270}
271
272static void dump_ipv4_mac_header(struct nf_log_buf *m,
273 const struct nf_loginfo *info,
274 const struct sk_buff *skb)
275{
276 struct net_device *dev = skb->dev;
277 unsigned int logflags = 0;
278
279 if (info->type == NF_LOG_TYPE_LOG)
280 logflags = info->u.log.logflags;
281
282 if (!(logflags & XT_LOG_MACDECODE))
283 goto fallback;
284
285 switch (dev->type) {
286 case ARPHRD_ETHER:
287 nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
288 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
289 ntohs(eth_hdr(skb)->h_proto));
290 return;
291 default:
292 break;
293 }
294
295fallback:
296 nf_log_buf_add(m, "MAC=");
297 if (dev->hard_header_len &&
298 skb->mac_header != skb->network_header) {
299 const unsigned char *p = skb_mac_header(skb);
300 unsigned int i;
301
302 nf_log_buf_add(m, "%02x", *p++);
303 for (i = 1; i < dev->hard_header_len; i++, p++)
304 nf_log_buf_add(m, ":%02x", *p);
305 }
306 nf_log_buf_add(m, " ");
307}
308
309static void nf_log_ip_packet(struct net *net, u_int8_t pf,
310 unsigned int hooknum, const struct sk_buff *skb,
311 const struct net_device *in,
312 const struct net_device *out,
313 const struct nf_loginfo *loginfo,
314 const char *prefix)
315{
316 struct nf_log_buf *m;
317
318 /* FIXME: Disabled from containers until syslog ns is supported */
319 if (!net_eq(net, &init_net))
320 return;
321
322 m = nf_log_buf_open();
323
324 if (!loginfo)
325 loginfo = &default_loginfo;
326
327 nf_log_dump_packet_common(m, pf, hooknum, skb, in,
328 out, loginfo, prefix);
329
330 if (in != NULL)
331 dump_ipv4_mac_header(m, loginfo, skb);
332
333 dump_ipv4_packet(m, loginfo, skb, 0);
334
335 nf_log_buf_close(m);
336}
337
338static struct nf_logger nf_ip_logger __read_mostly = {
339 .name = "nf_log_ipv4",
340 .type = NF_LOG_TYPE_LOG,
341 .logfn = nf_log_ip_packet,
342 .me = THIS_MODULE,
343};
344
345static int __net_init nf_log_ipv4_net_init(struct net *net)
346{
347 nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
348 return 0;
349}
350
351static void __net_exit nf_log_ipv4_net_exit(struct net *net)
352{
353 nf_log_unset(net, &nf_ip_logger);
354}
355
356static struct pernet_operations nf_log_ipv4_net_ops = {
357 .init = nf_log_ipv4_net_init,
358 .exit = nf_log_ipv4_net_exit,
359};
360
361static int __init nf_log_ipv4_init(void)
362{
363 int ret;
364
365 ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
366 if (ret < 0)
367 return ret;
368
369 nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
370 return 0;
371}
372
373static void __exit nf_log_ipv4_exit(void)
374{
375 unregister_pernet_subsys(&nf_log_ipv4_net_ops);
376 nf_log_unregister(&nf_ip_logger);
377}
378
379module_init(nf_log_ipv4_init);
380module_exit(nf_log_ipv4_exit);
381
382MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
383MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
384MODULE_LICENSE("GPL");
385MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index d8b2e14efddc..14f5ccd06337 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -154,6 +154,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
154 htons(oldlen), htons(datalen), 1); 154 htons(oldlen), htons(datalen), 1);
155} 155}
156 156
157#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
157static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], 158static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
158 struct nf_nat_range *range) 159 struct nf_nat_range *range)
159{ 160{
@@ -169,6 +170,7 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
169 170
170 return 0; 171 return 0;
171} 172}
173#endif
172 174
173static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { 175static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
174 .l3proto = NFPROTO_IPV4, 176 .l3proto = NFPROTO_IPV4,
@@ -177,7 +179,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
177 .manip_pkt = nf_nat_ipv4_manip_pkt, 179 .manip_pkt = nf_nat_ipv4_manip_pkt,
178 .csum_update = nf_nat_ipv4_csum_update, 180 .csum_update = nf_nat_ipv4_csum_update,
179 .csum_recalc = nf_nat_ipv4_csum_recalc, 181 .csum_recalc = nf_nat_ipv4_csum_recalc,
182#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
180 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range, 183 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
184#endif
181#ifdef CONFIG_XFRM 185#ifdef CONFIG_XFRM
182 .decode_session = nf_nat_ipv4_decode_session, 186 .decode_session = nf_nat_ipv4_decode_session,
183#endif 187#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 690d890111bb..9414923f1e15 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -124,7 +124,7 @@ static const struct nf_nat_l4proto gre = {
124 .manip_pkt = gre_manip_pkt, 124 .manip_pkt = gre_manip_pkt,
125 .in_range = nf_nat_l4proto_in_range, 125 .in_range = nf_nat_l4proto_in_range,
126 .unique_tuple = gre_unique_tuple, 126 .unique_tuple = gre_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
128 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 128 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
129#endif 129#endif
130}; 130};
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index eb303471bcf6..4557b4ab8342 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -77,7 +77,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
77 .manip_pkt = icmp_manip_pkt, 77 .manip_pkt = icmp_manip_pkt,
78 .in_range = icmp_in_range, 78 .in_range = icmp_in_range,
79 .unique_tuple = icmp_unique_tuple, 79 .unique_tuple = icmp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
81 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 81 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
82#endif 82#endif
83}; 83};
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 044a0ddf6a79..a3c59a077a5f 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -911,7 +911,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
911 sin6->sin6_flowinfo = ip6_flowinfo(ip6); 911 sin6->sin6_flowinfo = ip6_flowinfo(ip6);
912 sin6->sin6_scope_id = 912 sin6->sin6_scope_id =
913 ipv6_iface_scope_id(&sin6->sin6_addr, 913 ipv6_iface_scope_id(&sin6->sin6_addr,
-						  IP6CB(skb)->iif);
+						  inet6_iif(skb));
915 *addr_len = sizeof(*sin6); 915 *addr_len = sizeof(*sin6);
916 } 916 }
917 917
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ae0af9386f7c..8e3eb39f84e7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -52,6 +52,7 @@
52static int sockstat_seq_show(struct seq_file *seq, void *v) 52static int sockstat_seq_show(struct seq_file *seq, void *v)
53{ 53{
54 struct net *net = seq->private; 54 struct net *net = seq->private;
55 unsigned int frag_mem;
55 int orphans, sockets; 56 int orphans, sockets;
56 57
57 local_bh_disable(); 58 local_bh_disable();
@@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
71 sock_prot_inuse_get(net, &udplite_prot)); 72 sock_prot_inuse_get(net, &udplite_prot));
72 seq_printf(seq, "RAW: inuse %d\n", 73 seq_printf(seq, "RAW: inuse %d\n",
73 sock_prot_inuse_get(net, &raw_prot)); 74 sock_prot_inuse_get(net, &raw_prot));
-	seq_printf(seq, "FRAG: inuse %d memory %d\n",
-		   ip_frag_nqueues(net), ip_frag_mem(net));
+	frag_mem = ip_frag_mem(net);
+	seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
76 return 0; 77 return 0;
77} 78}
78 79
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c65160565e1..739db3100c23 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -58,6 +58,7 @@
58#include <linux/in_route.h> 58#include <linux/in_route.h>
59#include <linux/route.h> 59#include <linux/route.h>
60#include <linux/skbuff.h> 60#include <linux/skbuff.h>
61#include <linux/igmp.h>
61#include <net/net_namespace.h> 62#include <net/net_namespace.h>
62#include <net/dst.h> 63#include <net/dst.h>
63#include <net/sock.h> 64#include <net/sock.h>
@@ -174,7 +175,9 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
174 175
175 while (sk) { 176 while (sk) {
176 delivered = 1; 177 delivered = 1;
-		if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
+		if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
+		    ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
+				   skb->dev->ifindex)) {
178 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); 181 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
179 182
180 /* Not releasing hash table! */ 183 /* Not releasing hash table! */
@@ -365,6 +368,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
365 368
366 skb->ip_summed = CHECKSUM_NONE; 369 skb->ip_summed = CHECKSUM_NONE;
367 370
371 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
372
368 skb->transport_header = skb->network_header; 373 skb->transport_header = skb->network_header;
369 err = -EFAULT; 374 err = -EFAULT;
370 if (memcpy_fromiovecend((void *)iph, from, 0, length)) 375 if (memcpy_fromiovecend((void *)iph, from, 0, length))
@@ -606,6 +611,8 @@ back_from_confirm:
606 &rt, msg->msg_flags); 611 &rt, msg->msg_flags);
607 612
608 else { 613 else {
614 sock_tx_timestamp(sk, &ipc.tx_flags);
615
609 if (!ipc.addr) 616 if (!ipc.addr)
610 ipc.addr = fl4.daddr; 617 ipc.addr = fl4.daddr;
611 lock_sock(sk); 618 lock_sock(sk);
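
The raw_v4_input() change above makes raw sockets honor per-socket multicast source filters via ip_mc_sf_allow(), so a raw socket that joins a source-specific group no longer sees traffic from senders it did not request. A rough userspace illustration of the kind of filter that now takes effect, using the standard IP_ADD_SOURCE_MEMBERSHIP API (addresses are placeholders, root is needed for SOCK_RAW):

/* raw_ssm_filter.c - join an SSM group on a raw ICMP socket (sketch). */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
	struct ip_mreq_source mreqs;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&mreqs, 0, sizeof(mreqs));
	mreqs.imr_multiaddr.s_addr  = inet_addr("232.1.1.1");    /* SSM group */
	mreqs.imr_sourceaddr.s_addr = inet_addr("198.51.100.1"); /* allowed source */
	mreqs.imr_interface.s_addr  = htonl(INADDR_ANY);

	/* With the patch above, group traffic from other sources is no longer
	 * delivered to this raw socket. */
	if (setsockopt(fd, IPPROTO_IP, IP_ADD_SOURCE_MEMBERSHIP,
		       &mreqs, sizeof(mreqs)) < 0) {
		perror("IP_ADD_SOURCE_MEMBERSHIP");
		return 1;
	}
	return 0;
}

Previously the raw socket received every matching datagram sent to the group regardless of source; the added ip_mc_sf_allow() check brings raw sockets in line with the source filtering UDP sockets already applied.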
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c86624b36a62..c0c75688896e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
170} 170}
171EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); 171EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
172 172
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+			      __u16 *mssp)
174{ 175{
175 const struct iphdr *iph = ip_hdr(skb); 176 const struct iphdr *iph = ip_hdr(skb);
176 const struct tcphdr *th = tcp_hdr(skb); 177 const struct tcphdr *th = tcp_hdr(skb);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d2118e5fbc7..744af67a5989 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -426,6 +426,15 @@ void tcp_init_sock(struct sock *sk)
426} 426}
427EXPORT_SYMBOL(tcp_init_sock); 427EXPORT_SYMBOL(tcp_init_sock);
428 428
429void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
430{
431 struct skb_shared_info *shinfo = skb_shinfo(skb);
432
433 sock_tx_timestamp(sk, &shinfo->tx_flags);
434 if (shinfo->tx_flags & SKBTX_ANY_SW_TSTAMP)
435 shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
436}
437
429/* 438/*
430 * Wait for a TCP event. 439 * Wait for a TCP event.
431 * 440 *
@@ -523,7 +532,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
523 } 532 }
524 /* This barrier is coupled with smp_wmb() in tcp_reset() */ 533 /* This barrier is coupled with smp_wmb() in tcp_reset() */
525 smp_rmb(); 534 smp_rmb();
-	if (sk->sk_err)
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
527 mask |= POLLERR; 536 mask |= POLLERR;
528 537
529 return mask; 538 return mask;
@@ -959,8 +968,10 @@ new_segment:
959 968
960 copied += copy; 969 copied += copy;
961 offset += copy; 970 offset += copy;
-		if (!(size -= copy))
+		if (!(size -= copy)) {
+			tcp_tx_timestamp(sk, skb);
 			goto out;
+		}
964 975
965 if (skb->len < size_goal || (flags & MSG_OOB)) 976 if (skb->len < size_goal || (flags & MSG_OOB))
966 continue; 977 continue;
@@ -1252,8 +1263,10 @@ new_segment:
1252 1263
1253 from += copy; 1264 from += copy;
1254 copied += copy; 1265 copied += copy;
-			if ((seglen -= copy) == 0 && iovlen == 0)
+			if ((seglen -= copy) == 0 && iovlen == 0) {
+				tcp_tx_timestamp(sk, skb);
 				goto out;
+			}
1257 1270
1258 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) 1271 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1259 continue; 1272 continue;
@@ -1617,6 +1630,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1617 struct sk_buff *skb; 1630 struct sk_buff *skb;
1618 u32 urg_hole = 0; 1631 u32 urg_hole = 0;
1619 1632
1633 if (unlikely(flags & MSG_ERRQUEUE))
1634 return ip_recv_error(sk, msg, len, addr_len);
1635
1620 if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && 1636 if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
1621 (sk->sk_state == TCP_ESTABLISHED)) 1637 (sk->sk_state == TCP_ESTABLISHED))
1622 sk_busy_loop(sk, nonblock); 1638 sk_busy_loop(sk, nonblock);
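
The tcp.c hunks above wire the new transmit timestamps into TCP: tcp_tx_timestamp() tags the last byte of each write (shinfo->tskey = seq + len - 1), tcp_poll() now signals POLLERR while the error queue is non-empty, and tcp_recvmsg() hands MSG_ERRQUEUE reads to ip_recv_error(). Below is a rough userspace sketch of how an application might consume these reports; it assumes the flags and the extended struct scm_timestamping introduced by the net-timestamp series in this merge (SOF_TIMESTAMPING_TX_SCHED, SOF_TIMESTAMPING_TX_ACK, SOF_TIMESTAMPING_OPT_ID) are exported through linux/net_tstamp.h and linux/errqueue.h:

/* tcp_tx_tstamp.c - enable and drain TCP transmit timestamps (sketch). */
#include <stdio.h>
#include <time.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/net_tstamp.h>
#include <linux/errqueue.h>

/* Ask for a timestamp when a segment enters the qdisc, when it leaves the
 * stack, and when the written byte range is fully ACKed by the peer. */
static int enable_tx_timestamps(int fd)
{
	int val = SOF_TIMESTAMPING_TX_SCHED |
		  SOF_TIMESTAMPING_TX_SOFTWARE |
		  SOF_TIMESTAMPING_TX_ACK |
		  SOF_TIMESTAMPING_SOFTWARE |
		  SOF_TIMESTAMPING_OPT_ID;

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}

/* Reports come back on the socket error queue, hence the tcp_poll() and
 * tcp_recvmsg(MSG_ERRQUEUE) changes above. */
static void drain_tx_timestamps(int fd)
{
	char data[256], control[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
		return;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_TIMESTAMPING) {
			struct scm_timestamping *ts = (void *)CMSG_DATA(cm);

			printf("sw tstamp %ld.%09ld\n",
			       (long)ts->ts[0].tv_sec, ts->ts[0].tv_nsec);
		} else if (cm->cmsg_level == SOL_IP &&
			   cm->cmsg_type == IP_RECVERR) {
			struct sock_extended_err *serr = (void *)CMSG_DATA(cm);

			/* In this series, ee_info/ee_data identify the report
			 * type and the byte key of the timestamped write. */
			if (serr->ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
				printf("type %u key %u\n",
				       serr->ee_info, serr->ee_data);
		}
	}
}

Because tcp_tx_timestamp() keys each report to the sequence number of the last byte of the write, an application with many writes in flight can match an ACK report back to the write that produced it through the returned key.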
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 40639c288dc2..a3d47af01906 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -74,6 +74,7 @@
74#include <linux/ipsec.h> 74#include <linux/ipsec.h>
75#include <asm/unaligned.h> 75#include <asm/unaligned.h>
76#include <net/netdma.h> 76#include <net/netdma.h>
77#include <linux/errqueue.h>
77 78
78int sysctl_tcp_timestamps __read_mostly = 1; 79int sysctl_tcp_timestamps __read_mostly = 1;
79int sysctl_tcp_window_scaling __read_mostly = 1; 80int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -1904,16 +1905,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
1904 tp->sacked_out = 0; 1905 tp->sacked_out = 0;
1905} 1906}
1906 1907
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
1908 * and reset tags completely, otherwise preserve SACKs. If receiver 1909 * and reset tags completely, otherwise preserve SACKs. If receiver
1909 * dropped its ofo queue, we will know this due to reneging detection. 1910 * dropped its ofo queue, we will know this due to reneging detection.
1910 */ 1911 */
-void tcp_enter_loss(struct sock *sk, int how)
+void tcp_enter_loss(struct sock *sk)
1912{ 1913{
1913 const struct inet_connection_sock *icsk = inet_csk(sk); 1914 const struct inet_connection_sock *icsk = inet_csk(sk);
1914 struct tcp_sock *tp = tcp_sk(sk); 1915 struct tcp_sock *tp = tcp_sk(sk);
1915 struct sk_buff *skb; 1916 struct sk_buff *skb;
1916 bool new_recovery = false; 1917 bool new_recovery = false;
1918 bool is_reneg; /* is receiver reneging on SACKs? */
1917 1919
1918 /* Reduce ssthresh if it has not yet been made inside this window. */ 1920 /* Reduce ssthresh if it has not yet been made inside this window. */
1919 if (icsk->icsk_ca_state <= TCP_CA_Disorder || 1921 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1934,7 +1936,11 @@ void tcp_enter_loss(struct sock *sk, int how)
1934 tcp_reset_reno_sack(tp); 1936 tcp_reset_reno_sack(tp);
1935 1937
1936 tp->undo_marker = tp->snd_una; 1938 tp->undo_marker = tp->snd_una;
-	if (how) {
+
+	skb = tcp_write_queue_head(sk);
+	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1938 tp->sacked_out = 0; 1944 tp->sacked_out = 0;
1939 tp->fackets_out = 0; 1945 tp->fackets_out = 0;
1940 } 1946 }
@@ -1948,7 +1954,7 @@ void tcp_enter_loss(struct sock *sk, int how)
1948 tp->undo_marker = 0; 1954 tp->undo_marker = 0;
1949 1955
1950 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; 1956 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
1952 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1958 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1953 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1959 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1954 tp->lost_out += tcp_skb_pcount(skb); 1960 tp->lost_out += tcp_skb_pcount(skb);
@@ -1981,19 +1987,21 @@ void tcp_enter_loss(struct sock *sk, int how)
1981 * remembered SACKs do not reflect real state of receiver i.e. 1987 * remembered SACKs do not reflect real state of receiver i.e.
1982 * receiver _host_ is heavily congested (or buggy). 1988 * receiver _host_ is heavily congested (or buggy).
1983 * 1989 *
- * Do processing similar to RTO timeout.
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
1985 */ 1995 */
1986static bool tcp_check_sack_reneging(struct sock *sk, int flag) 1996static bool tcp_check_sack_reneging(struct sock *sk, int flag)
1987{ 1997{
1988 if (flag & FLAG_SACK_RENEGING) { 1998 if (flag & FLAG_SACK_RENEGING) {
-		struct inet_connection_sock *icsk = inet_csk(sk);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		struct tcp_sock *tp = tcp_sk(sk);
+		unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+					  msecs_to_jiffies(10));
1991 2002
-		tcp_enter_loss(sk, 1);
-		icsk->icsk_retransmits++;
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  icsk->icsk_rto, TCP_RTO_MAX);
+					  delay, TCP_RTO_MAX);
1997 return true; 2005 return true;
1998 } 2006 }
1999 return false; 2007 return false;
@@ -2475,7 +2483,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2475 * losses and/or application stalls), do not perform any further cwnd 2483 * losses and/or application stalls), do not perform any further cwnd
2476 * reductions, but instead slow start up to ssthresh. 2484 * reductions, but instead slow start up to ssthresh.
2477 */ 2485 */
-static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+static void tcp_init_cwnd_reduction(struct sock *sk)
2479{ 2487{
2480 struct tcp_sock *tp = tcp_sk(sk); 2488 struct tcp_sock *tp = tcp_sk(sk);
2481 2489
@@ -2485,8 +2493,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2485 tp->prior_cwnd = tp->snd_cwnd; 2493 tp->prior_cwnd = tp->snd_cwnd;
2486 tp->prr_delivered = 0; 2494 tp->prr_delivered = 0;
2487 tp->prr_out = 0; 2495 tp->prr_out = 0;
-	if (set_ssthresh)
-		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2490 TCP_ECN_queue_cwr(tp); 2497 TCP_ECN_queue_cwr(tp);
2491} 2498}
2492 2499
@@ -2528,14 +2535,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
2528} 2535}
2529 2536
2530/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ 2537/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+void tcp_enter_cwr(struct sock *sk)
2532{ 2539{
2533 struct tcp_sock *tp = tcp_sk(sk); 2540 struct tcp_sock *tp = tcp_sk(sk);
2534 2541
2535 tp->prior_ssthresh = 0; 2542 tp->prior_ssthresh = 0;
2536 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2543 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2537 tp->undo_marker = 0; 2544 tp->undo_marker = 0;
-		tcp_init_cwnd_reduction(sk, set_ssthresh);
+		tcp_init_cwnd_reduction(sk);
2539 tcp_set_ca_state(sk, TCP_CA_CWR); 2546 tcp_set_ca_state(sk, TCP_CA_CWR);
2540 } 2547 }
2541} 2548}
@@ -2564,7 +2571,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
2564 tp->retrans_stamp = 0; 2571 tp->retrans_stamp = 0;
2565 2572
2566 if (flag & FLAG_ECE) 2573 if (flag & FLAG_ECE)
-		tcp_enter_cwr(sk, 1);
+		tcp_enter_cwr(sk);
2568 2575
2569 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2576 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2570 tcp_try_keep_open(sk); 2577 tcp_try_keep_open(sk);
@@ -2670,7 +2677,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2670 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2677 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2671 if (!ece_ack) 2678 if (!ece_ack)
2672 tp->prior_ssthresh = tcp_current_ssthresh(sk); 2679 tp->prior_ssthresh = tcp_current_ssthresh(sk);
-			tcp_init_cwnd_reduction(sk, true);
+			tcp_init_cwnd_reduction(sk);
2674 } 2681 }
2675 tcp_set_ca_state(sk, TCP_CA_Recovery); 2682 tcp_set_ca_state(sk, TCP_CA_Recovery);
2676} 2683}
@@ -3100,6 +3107,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3100 tp->retrans_stamp = 0; 3107 tp->retrans_stamp = 0;
3101 } 3108 }
3102 3109
3110 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) &&
3111 between(skb_shinfo(skb)->tskey, prior_snd_una,
3112 tp->snd_una + 1))
3113 __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
3114
3103 if (!fully_acked) 3115 if (!fully_acked)
3104 break; 3116 break;
3105 3117
@@ -3346,7 +3358,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3346 tp->tlp_high_seq = 0; 3358 tp->tlp_high_seq = 0;
3347 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ 3359 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
3348 if (!(flag & FLAG_DSACKING_ACK)) { 3360 if (!(flag & FLAG_DSACKING_ACK)) {
-		tcp_init_cwnd_reduction(sk, true);
+		tcp_init_cwnd_reduction(sk);
3350 tcp_set_ca_state(sk, TCP_CA_CWR); 3362 tcp_set_ca_state(sk, TCP_CA_CWR);
3351 tcp_end_cwnd_reduction(sk); 3363 tcp_end_cwnd_reduction(sk);
3352 tcp_try_keep_open(sk); 3364 tcp_try_keep_open(sk);
@@ -5877,3 +5889,153 @@ discard:
5877 return 0; 5889 return 0;
5878} 5890}
5879EXPORT_SYMBOL(tcp_rcv_state_process); 5891EXPORT_SYMBOL(tcp_rcv_state_process);
5892
5893static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
5894{
5895 struct inet_request_sock *ireq = inet_rsk(req);
5896
5897 if (family == AF_INET)
5898 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
5899 &ireq->ir_rmt_addr, port);
5900#if IS_ENABLED(CONFIG_IPV6)
5901 else if (family == AF_INET6)
5902 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
5903 &ireq->ir_v6_rmt_addr, port);
5904#endif
5905}
5906
5907int tcp_conn_request(struct request_sock_ops *rsk_ops,
5908 const struct tcp_request_sock_ops *af_ops,
5909 struct sock *sk, struct sk_buff *skb)
5910{
5911 struct tcp_options_received tmp_opt;
5912 struct request_sock *req;
5913 struct tcp_sock *tp = tcp_sk(sk);
5914 struct dst_entry *dst = NULL;
5915 __u32 isn = TCP_SKB_CB(skb)->when;
5916 bool want_cookie = false, fastopen;
5917 struct flowi fl;
5918 struct tcp_fastopen_cookie foc = { .len = -1 };
5919 int err;
5920
5921
5922 /* TW buckets are converted to open requests without
5923 * limitations, they conserve resources and peer is
5924 * evidently real one.
5925 */
5926 if ((sysctl_tcp_syncookies == 2 ||
5927 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
5928 want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
5929 if (!want_cookie)
5930 goto drop;
5931 }
5932
5933
5934 /* Accept backlog is full. If we have already queued enough
5935 * of warm entries in syn queue, drop request. It is better than
5936 * clogging syn queue with openreqs with exponentially increasing
5937 * timeout.
5938 */
5939 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
5940 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
5941 goto drop;
5942 }
5943
5944 req = inet_reqsk_alloc(rsk_ops);
5945 if (!req)
5946 goto drop;
5947
5948 tcp_rsk(req)->af_specific = af_ops;
5949
5950 tcp_clear_options(&tmp_opt);
5951 tmp_opt.mss_clamp = af_ops->mss_clamp;
5952 tmp_opt.user_mss = tp->rx_opt.user_mss;
5953 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
5954
5955 if (want_cookie && !tmp_opt.saw_tstamp)
5956 tcp_clear_options(&tmp_opt);
5957
5958 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
5959 tcp_openreq_init(req, &tmp_opt, skb, sk);
5960
5961 af_ops->init_req(req, sk, skb);
5962
5963 if (security_inet_conn_request(sk, skb, req))
5964 goto drop_and_free;
5965
5966 if (!want_cookie || tmp_opt.tstamp_ok)
5967 TCP_ECN_create_request(req, skb, sock_net(sk));
5968
5969 if (want_cookie) {
5970 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
5971 req->cookie_ts = tmp_opt.tstamp_ok;
5972 } else if (!isn) {
5973 /* VJ's idea. We save last timestamp seen
5974 * from the destination in peer table, when entering
5975 * state TIME-WAIT, and check against it before
5976 * accepting new connection request.
5977 *
5978 * If "isn" is not zero, this request hit alive
5979 * timewait bucket, so that all the necessary checks
5980 * are made in the function processing timewait state.
5981 */
5982 if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
5983 bool strict;
5984
5985 dst = af_ops->route_req(sk, &fl, req, &strict);
5986 if (dst && strict &&
5987 !tcp_peer_is_proven(req, dst, true)) {
5988 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
5989 goto drop_and_release;
5990 }
5991 }
5992 /* Kill the following clause, if you dislike this way. */
5993 else if (!sysctl_tcp_syncookies &&
5994 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
5995 (sysctl_max_syn_backlog >> 2)) &&
5996 !tcp_peer_is_proven(req, dst, false)) {
5997 /* Without syncookies last quarter of
5998 * backlog is filled with destinations,
5999 * proven to be alive.
6000 * It means that we continue to communicate
6001 * to destinations, already remembered
6002 * to the moment of synflood.
6003 */
6004 pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
6005 rsk_ops->family);
6006 goto drop_and_release;
6007 }
6008
6009 isn = af_ops->init_seq(skb);
6010 }
6011 if (!dst) {
6012 dst = af_ops->route_req(sk, &fl, req, NULL);
6013 if (!dst)
6014 goto drop_and_free;
6015 }
6016
6017 tcp_rsk(req)->snt_isn = isn;
6018 tcp_openreq_init_rwin(req, sk, dst);
6019 fastopen = !want_cookie &&
6020 tcp_try_fastopen(sk, skb, req, &foc, dst);
6021 err = af_ops->send_synack(sk, dst, &fl, req,
6022 skb_get_queue_mapping(skb), &foc);
6023 if (!fastopen) {
6024 if (err || want_cookie)
6025 goto drop_and_free;
6026
6027 tcp_rsk(req)->listener = NULL;
6028 af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6029 }
6030
6031 return 0;
6032
6033drop_and_release:
6034 dst_release(dst);
6035drop_and_free:
6036 reqsk_free(req);
6037drop:
6038 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6039 return 0;
6040}
6041EXPORT_SYMBOL(tcp_conn_request);
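
tcp_conn_request() above is the generic SYN-processing path shared by IPv4 and IPv6: every address-family-specific step (ISN generation, route lookup, SYN-ACK transmission, request-socket hashing) is reached through the const tcp_request_sock_ops table supplied by the caller. A small standalone sketch of that dispatch pattern, with made-up names, just to illustrate the shape of the consolidation:

/* Illustrative only, not kernel code: one generic handler, per-family ops. */
#include <stdio.h>

struct conn_req_ops {
	const char *family;
	unsigned int (*init_seq)(unsigned int salt);
	int (*send_synack)(unsigned int isn);
};

static unsigned int v4_init_seq(unsigned int salt) { return salt * 2654435761u; }
static int v4_send_synack(unsigned int isn) { printf("v4 SYN-ACK, isn=%u\n", isn); return 0; }

static unsigned int v6_init_seq(unsigned int salt) { return salt ^ 0x9e3779b9u; }
static int v6_send_synack(unsigned int isn) { printf("v6 SYN-ACK, isn=%u\n", isn); return 0; }

static const struct conn_req_ops v4_ops = { "ipv4", v4_init_seq, v4_send_synack };
static const struct conn_req_ops v6_ops = { "ipv6", v6_init_seq, v6_send_synack };

/* Everything family-specific goes through ops, as in tcp_conn_request(). */
static int conn_request(const struct conn_req_ops *ops, unsigned int salt)
{
	unsigned int isn = ops->init_seq(salt);
	return ops->send_synack(isn);
}

int main(void)
{
	conn_request(&v4_ops, 42);
	conn_request(&v6_ops, 42);
	return 0;
}

The real IPv4 table is tcp_request_sock_ipv4_ops, introduced further down in this diff in tcp_ipv4.c.
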
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77cccda1ad0c..992a1f926009 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -99,7 +99,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
99struct inet_hashinfo tcp_hashinfo; 99struct inet_hashinfo tcp_hashinfo;
100EXPORT_SYMBOL(tcp_hashinfo); 100EXPORT_SYMBOL(tcp_hashinfo);
101 101
102static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) 102static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
103{ 103{
104 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 104 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
105 ip_hdr(skb)->saddr, 105 ip_hdr(skb)->saddr,
@@ -208,6 +208,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
208 inet->inet_dport = usin->sin_port; 208 inet->inet_dport = usin->sin_port;
209 inet->inet_daddr = daddr; 209 inet->inet_daddr = daddr;
210 210
211 inet_set_txhash(sk);
212
211 inet_csk(sk)->icsk_ext_hdr_len = 0; 213 inet_csk(sk)->icsk_ext_hdr_len = 0;
212 if (inet_opt) 214 if (inet_opt)
213 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 215 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
@@ -342,11 +344,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
342 int err; 344 int err;
343 struct net *net = dev_net(icmp_skb->dev); 345 struct net *net = dev_net(icmp_skb->dev);
344 346
345 if (icmp_skb->len < (iph->ihl << 2) + 8) {
346 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
347 return;
348 }
349
350 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, 347 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
351 iph->saddr, th->source, inet_iif(icmp_skb)); 348 iph->saddr, th->source, inet_iif(icmp_skb));
352 if (!sk) { 349 if (!sk) {
@@ -814,6 +811,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
814 * socket. 811 * socket.
815 */ 812 */
816static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 813static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
814 struct flowi *fl,
817 struct request_sock *req, 815 struct request_sock *req,
818 u16 queue_mapping, 816 u16 queue_mapping,
819 struct tcp_fastopen_cookie *foc) 817 struct tcp_fastopen_cookie *foc)
@@ -837,24 +835,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
837 ireq->ir_rmt_addr, 835 ireq->ir_rmt_addr,
838 ireq->opt); 836 ireq->opt);
839 err = net_xmit_eval(err); 837 err = net_xmit_eval(err);
840 if (!tcp_rsk(req)->snt_synack && !err)
841 tcp_rsk(req)->snt_synack = tcp_time_stamp;
842 } 838 }
843 839
844 return err; 840 return err;
845} 841}
846 842
847static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
848{
849 int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
850
851 if (!res) {
852 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
853 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
854 }
855 return res;
856}
857
858/* 843/*
859 * IPv4 request_sock destructor. 844 * IPv4 request_sock destructor.
860 */ 845 */
@@ -1064,7 +1049,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1064 if (sin->sin_family != AF_INET) 1049 if (sin->sin_family != AF_INET)
1065 return -EINVAL; 1050 return -EINVAL;
1066 1051
1067 if (!cmd.tcpm_key || !cmd.tcpm_keylen) 1052 if (!cmd.tcpm_keylen)
1068 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1053 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1069 AF_INET); 1054 AF_INET);
1070 1055
@@ -1237,161 +1222,68 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1237 1222
1238#endif 1223#endif
1239 1224
1225static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1226 struct sk_buff *skb)
1227{
1228 struct inet_request_sock *ireq = inet_rsk(req);
1229
1230 ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1231 ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1232 ireq->no_srccheck = inet_sk(sk)->transparent;
1233 ireq->opt = tcp_v4_save_options(skb);
1234}
1235
1236static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1237 const struct request_sock *req,
1238 bool *strict)
1239{
1240 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1241
1242 if (strict) {
1243 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1244 *strict = true;
1245 else
1246 *strict = false;
1247 }
1248
1249 return dst;
1250}
1251
1240struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1252struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1241 .family = PF_INET, 1253 .family = PF_INET,
1242 .obj_size = sizeof(struct tcp_request_sock), 1254 .obj_size = sizeof(struct tcp_request_sock),
1243 .rtx_syn_ack = tcp_v4_rtx_synack, 1255 .rtx_syn_ack = tcp_rtx_synack,
1244 .send_ack = tcp_v4_reqsk_send_ack, 1256 .send_ack = tcp_v4_reqsk_send_ack,
1245 .destructor = tcp_v4_reqsk_destructor, 1257 .destructor = tcp_v4_reqsk_destructor,
1246 .send_reset = tcp_v4_send_reset, 1258 .send_reset = tcp_v4_send_reset,
1247 .syn_ack_timeout = tcp_syn_ack_timeout, 1259 .syn_ack_timeout = tcp_syn_ack_timeout,
1248}; 1260};
1249 1261
1250#ifdef CONFIG_TCP_MD5SIG
1251static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1262static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1263 .mss_clamp = TCP_MSS_DEFAULT,
1264#ifdef CONFIG_TCP_MD5SIG
1252 .md5_lookup = tcp_v4_reqsk_md5_lookup, 1265 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1253 .calc_md5_hash = tcp_v4_md5_hash_skb, 1266 .calc_md5_hash = tcp_v4_md5_hash_skb,
1254};
1255#endif 1267#endif
1268 .init_req = tcp_v4_init_req,
1269#ifdef CONFIG_SYN_COOKIES
1270 .cookie_init_seq = cookie_v4_init_sequence,
1271#endif
1272 .route_req = tcp_v4_route_req,
1273 .init_seq = tcp_v4_init_sequence,
1274 .send_synack = tcp_v4_send_synack,
1275 .queue_hash_add = inet_csk_reqsk_queue_hash_add,
1276};
1256 1277
1257int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1278int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1258{ 1279{
1259 struct tcp_options_received tmp_opt;
1260 struct request_sock *req;
1261 struct inet_request_sock *ireq;
1262 struct tcp_sock *tp = tcp_sk(sk);
1263 struct dst_entry *dst = NULL;
1264 __be32 saddr = ip_hdr(skb)->saddr;
1265 __be32 daddr = ip_hdr(skb)->daddr;
1266 __u32 isn = TCP_SKB_CB(skb)->when;
1267 bool want_cookie = false, fastopen;
1268 struct flowi4 fl4;
1269 struct tcp_fastopen_cookie foc = { .len = -1 };
1270 int err;
1271
1272 /* Never answer to SYNs send to broadcast or multicast */ 1280 /* Never answer to SYNs send to broadcast or multicast */
1273 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1281 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1274 goto drop; 1282 goto drop;
1275 1283
1276 /* TW buckets are converted to open requests without 1284 return tcp_conn_request(&tcp_request_sock_ops,
1277 * limitations, they conserve resources and peer is 1285 &tcp_request_sock_ipv4_ops, sk, skb);
1278 * evidently real one.
1279 */
1280 if ((sysctl_tcp_syncookies == 2 ||
1281 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
1282 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1283 if (!want_cookie)
1284 goto drop;
1285 }
1286
1287 /* Accept backlog is full. If we have already queued enough
1288 * of warm entries in syn queue, drop request. It is better than
1289 * clogging syn queue with openreqs with exponentially increasing
1290 * timeout.
1291 */
1292 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1293 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1294 goto drop;
1295 }
1296
1297 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1298 if (!req)
1299 goto drop;
1300
1301#ifdef CONFIG_TCP_MD5SIG
1302 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1303#endif
1304
1305 tcp_clear_options(&tmp_opt);
1306 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1307 tmp_opt.user_mss = tp->rx_opt.user_mss;
1308 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1309
1310 if (want_cookie && !tmp_opt.saw_tstamp)
1311 tcp_clear_options(&tmp_opt);
1312
1313 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1314 tcp_openreq_init(req, &tmp_opt, skb);
1315
1316 ireq = inet_rsk(req);
1317 ireq->ir_loc_addr = daddr;
1318 ireq->ir_rmt_addr = saddr;
1319 ireq->no_srccheck = inet_sk(sk)->transparent;
1320 ireq->opt = tcp_v4_save_options(skb);
1321 ireq->ir_mark = inet_request_mark(sk, skb);
1322
1323 if (security_inet_conn_request(sk, skb, req))
1324 goto drop_and_free;
1325
1326 if (!want_cookie || tmp_opt.tstamp_ok)
1327 TCP_ECN_create_request(req, skb, sock_net(sk));
1328
1329 if (want_cookie) {
1330 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1331 req->cookie_ts = tmp_opt.tstamp_ok;
1332 } else if (!isn) {
1333 /* VJ's idea. We save last timestamp seen
1334 * from the destination in peer table, when entering
1335 * state TIME-WAIT, and check against it before
1336 * accepting new connection request.
1337 *
1338 * If "isn" is not zero, this request hit alive
1339 * timewait bucket, so that all the necessary checks
1340 * are made in the function processing timewait state.
1341 */
1342 if (tmp_opt.saw_tstamp &&
1343 tcp_death_row.sysctl_tw_recycle &&
1344 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1345 fl4.daddr == saddr) {
1346 if (!tcp_peer_is_proven(req, dst, true)) {
1347 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1348 goto drop_and_release;
1349 }
1350 }
1351 /* Kill the following clause, if you dislike this way. */
1352 else if (!sysctl_tcp_syncookies &&
1353 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1354 (sysctl_max_syn_backlog >> 2)) &&
1355 !tcp_peer_is_proven(req, dst, false)) {
1356 /* Without syncookies last quarter of
1357 * backlog is filled with destinations,
1358 * proven to be alive.
1359 * It means that we continue to communicate
1360 * to destinations, already remembered
1361 * to the moment of synflood.
1362 */
1363 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1364 &saddr, ntohs(tcp_hdr(skb)->source));
1365 goto drop_and_release;
1366 }
1367
1368 isn = tcp_v4_init_sequence(skb);
1369 }
1370 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1371 goto drop_and_free;
1372
1373 tcp_rsk(req)->snt_isn = isn;
1374 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1375 tcp_openreq_init_rwin(req, sk, dst);
1376 fastopen = !want_cookie &&
1377 tcp_try_fastopen(sk, skb, req, &foc, dst);
1378 err = tcp_v4_send_synack(sk, dst, req,
1379 skb_get_queue_mapping(skb), &foc);
1380 if (!fastopen) {
1381 if (err || want_cookie)
1382 goto drop_and_free;
1383
1384 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1385 tcp_rsk(req)->listener = NULL;
1386 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1387 }
1388
1389 return 0;
1390 1286
1391drop_and_release:
1392 dst_release(dst);
1393drop_and_free:
1394 reqsk_free(req);
1395drop: 1287drop:
1396 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1288 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1397 return 0; 1289 return 0;
@@ -1439,6 +1331,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1439 newinet->mc_ttl = ip_hdr(skb)->ttl; 1331 newinet->mc_ttl = ip_hdr(skb)->ttl;
1440 newinet->rcv_tos = ip_hdr(skb)->tos; 1332 newinet->rcv_tos = ip_hdr(skb)->tos;
1441 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1333 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1334 inet_set_txhash(newsk);
1442 if (inet_opt) 1335 if (inet_opt)
1443 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1336 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1444 newinet->inet_id = newtp->write_seq ^ jiffies; 1337 newinet->inet_id = newtp->write_seq ^ jiffies;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4fe041805989..0d54e59b9ea8 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1093,7 +1093,6 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
1093 .doit = tcp_metrics_nl_cmd_get, 1093 .doit = tcp_metrics_nl_cmd_get,
1094 .dumpit = tcp_metrics_nl_dump, 1094 .dumpit = tcp_metrics_nl_dump,
1095 .policy = tcp_metrics_nl_policy, 1095 .policy = tcp_metrics_nl_policy,
1096 .flags = GENL_ADMIN_PERM,
1097 }, 1096 },
1098 { 1097 {
1099 .cmd = TCP_METRICS_CMD_DEL, 1098 .cmd = TCP_METRICS_CMD_DEL,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e68e0d4af6c9..1649988bd1b6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -298,7 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
298 tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 298 tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
299 tw->tw_tclass = np->tclass; 299 tw->tw_tclass = np->tclass;
300 tw->tw_flowlabel = np->flow_label >> 12; 300 tw->tw_flowlabel = np->flow_label >> 12;
301 tw->tw_ipv6only = np->ipv6only; 301 tw->tw_ipv6only = sk->sk_ipv6only;
302 } 302 }
303#endif 303#endif
304 304
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 55046ecd083e..f597119fc4e7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -14,6 +14,21 @@
14#include <net/tcp.h> 14#include <net/tcp.h>
15#include <net/protocol.h> 15#include <net/protocol.h>
16 16
17void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, unsigned int seq,
18 unsigned int mss)
19{
20 while (skb) {
21 if (ts_seq < (__u64) seq + mss) {
22 skb_shinfo(skb)->tx_flags = SKBTX_SW_TSTAMP;
23 skb_shinfo(skb)->tskey = ts_seq;
24 return;
25 }
26
27 skb = skb->next;
28 seq += mss;
29 }
30}
31
17struct sk_buff *tcp_gso_segment(struct sk_buff *skb, 32struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
18 netdev_features_t features) 33 netdev_features_t features)
19{ 34{
@@ -91,6 +106,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
91 th = tcp_hdr(skb); 106 th = tcp_hdr(skb);
92 seq = ntohl(th->seq); 107 seq = ntohl(th->seq);
93 108
109 if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
110 tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
111
94 newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + 112 newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
95 (__force u32)delta)); 113 (__force u32)delta));
96 114
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 179b51e6bda3..8fcfc91964ec 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -916,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
916 skb_orphan(skb); 916 skb_orphan(skb);
917 skb->sk = sk; 917 skb->sk = sk;
918 skb->destructor = tcp_wfree; 918 skb->destructor = tcp_wfree;
919 skb_set_hash_from_sk(skb, sk);
919 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 920 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
920 921
921 /* Build TCP header and checksum it. */ 922 /* Build TCP header and checksum it. */
@@ -978,7 +979,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
978 if (likely(err <= 0)) 979 if (likely(err <= 0))
979 return err; 980 return err;
980 981
981 tcp_enter_cwr(sk, 1); 982 tcp_enter_cwr(sk);
982 983
983 return net_xmit_eval(err); 984 return net_xmit_eval(err);
984} 985}
@@ -3301,3 +3302,18 @@ void tcp_send_probe0(struct sock *sk)
3301 TCP_RTO_MAX); 3302 TCP_RTO_MAX);
3302 } 3303 }
3303} 3304}
3305
3306int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
3307{
3308 const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
3309 struct flowi fl;
3310 int res;
3311
3312 res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
3313 if (!res) {
3314 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
3315 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3316 }
3317 return res;
3318}
3319EXPORT_SYMBOL(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 286227abed10..df90cd1ce37f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -391,7 +391,7 @@ void tcp_retransmit_timer(struct sock *sk)
391 tcp_write_err(sk); 391 tcp_write_err(sk);
392 goto out; 392 goto out;
393 } 393 }
394 tcp_enter_loss(sk, 0); 394 tcp_enter_loss(sk);
395 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); 395 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
396 __sk_dst_reset(sk); 396 __sk_dst_reset(sk);
397 goto out_reset_timer; 397 goto out_reset_timer;
@@ -422,7 +422,7 @@ void tcp_retransmit_timer(struct sock *sk)
422 NET_INC_STATS_BH(sock_net(sk), mib_idx); 422 NET_INC_STATS_BH(sock_net(sk), mib_idx);
423 } 423 }
424 424
425 tcp_enter_loss(sk, 0); 425 tcp_enter_loss(sk);
426 426
427 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { 427 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
428 /* Retransmission failed because of local congestion, 428 /* Retransmission failed because of local congestion,
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9a5e05f27f4f..b40ad897f945 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -218,7 +218,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
218 * This is: 218 * This is:
219 * (actual rate in segments) * baseRTT 219 * (actual rate in segments) * baseRTT
220 */ 220 */
221 target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt; 221 target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
222 do_div(target_cwnd, rtt);
222 223
223 /* Calculate the difference between the window we had, 224 /* Calculate the difference between the window we had,
224 * and the window we would like to have. This quantity 225 * and the window we would like to have. This quantity
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 27b9825753d1..8276977d2c85 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -144,7 +144,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
144 144
145 rtt = veno->minrtt; 145 rtt = veno->minrtt;
146 146
147 target_cwnd = (tp->snd_cwnd * veno->basertt); 147 target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
148 target_cwnd <<= V_PARAM_SHIFT; 148 target_cwnd <<= V_PARAM_SHIFT;
149 do_div(target_cwnd, rtt); 149 do_div(target_cwnd, rtt);
150 150
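
The Vegas and Veno hunks above fix the same class of bug: snd_cwnd * baseRTT was computed in 32 bits before dividing, so the product could wrap on large-BDP paths; casting to u64 and dividing with do_div() keeps the intermediate value exact. A quick userspace illustration of the difference, with made-up values:

/* Demonstrates the 32-bit wrap the (u64) cast + do_div() avoids. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t snd_cwnd = 90000;	/* segments, large-BDP path (illustrative) */
	uint32_t base_rtt = 60000;
	uint32_t rtt      = 65000;

	uint32_t wrapped = snd_cwnd * base_rtt / rtt;          /* old: product wraps at 2^32 */
	uint64_t widened = (uint64_t)snd_cwnd * base_rtt / rtt; /* new: 64-bit intermediate */

	printf("32-bit target_cwnd: %u\n", wrapped);
	printf("64-bit target_cwnd: %llu\n", (unsigned long long)widened);
	return 0;
}
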
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7d5a8661df76..f57c0e4c2326 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,27 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
594 return true; 594 return true;
595} 595}
596 596
597static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
598 __be16 loc_port, __be32 loc_addr,
599 __be16 rmt_port, __be32 rmt_addr,
600 int dif)
601{
602 struct hlist_nulls_node *node;
603 struct sock *s = sk;
604 unsigned short hnum = ntohs(loc_port);
605
606 sk_nulls_for_each_from(s, node) {
607 if (__udp_is_mcast_sock(net, s,
608 loc_port, loc_addr,
609 rmt_port, rmt_addr,
610 dif, hnum))
611 goto found;
612 }
613 s = NULL;
614found:
615 return s;
616}
617
618/* 597/*
619 * This routine is called by the ICMP module when it gets some 598 * This routine is called by the ICMP module when it gets some
620 * sort of error condition. If err < 0 then the socket should 599 * sort of error condition. If err < 0 then the socket should
@@ -1588,7 +1567,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1588 goto csum_error; 1567 goto csum_error;
1589 1568
1590 1569
1591 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { 1570 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
1592 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, 1571 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1593 is_udplite); 1572 is_udplite);
1594 goto drop; 1573 goto drop;
@@ -1640,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
1640 1619
1641 if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) 1620 if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
1642 skb1 = NULL; 1621 skb1 = NULL;
1622
1623 sock_put(sk);
1643 } 1624 }
1644 if (unlikely(skb1)) 1625 if (unlikely(skb1))
1645 kfree_skb(skb1); 1626 kfree_skb(skb1);
@@ -1668,41 +1649,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1668 struct udp_table *udptable) 1649 struct udp_table *udptable)
1669{ 1650{
1670 struct sock *sk, *stack[256 / sizeof(struct sock *)]; 1651 struct sock *sk, *stack[256 / sizeof(struct sock *)];
1671 struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); 1652 struct hlist_nulls_node *node;
1672 int dif; 1653 unsigned short hnum = ntohs(uh->dest);
1673 unsigned int i, count = 0; 1654 struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
1655 int dif = skb->dev->ifindex;
1656 unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
1657 unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
1658
1659 if (use_hash2) {
1660 hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
1661 udp_table.mask;
1662 hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
1663start_lookup:
1664 hslot = &udp_table.hash2[hash2];
1665 offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
1666 }
1674 1667
1675 spin_lock(&hslot->lock); 1668 spin_lock(&hslot->lock);
1676 sk = sk_nulls_head(&hslot->head); 1669 sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
1677 dif = skb->dev->ifindex; 1670 if (__udp_is_mcast_sock(net, sk,
1678 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); 1671 uh->dest, daddr,
1679 while (sk) { 1672 uh->source, saddr,
1680 stack[count++] = sk; 1673 dif, hnum)) {
1681 sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, 1674 if (unlikely(count == ARRAY_SIZE(stack))) {
1682 daddr, uh->source, saddr, dif); 1675 flush_stack(stack, count, skb, ~0);
1683 if (unlikely(count == ARRAY_SIZE(stack))) { 1676 count = 0;
1684 if (!sk) 1677 }
1685 break; 1678 stack[count++] = sk;
1686 flush_stack(stack, count, skb, ~0); 1679 sock_hold(sk);
1687 count = 0;
1688 } 1680 }
1689 } 1681 }
1690 /*
1691 * before releasing chain lock, we must take a reference on sockets
1692 */
1693 for (i = 0; i < count; i++)
1694 sock_hold(stack[i]);
1695 1682
1696 spin_unlock(&hslot->lock); 1683 spin_unlock(&hslot->lock);
1697 1684
1685 /* Also lookup *:port if we are using hash2 and haven't done so yet. */
1686 if (use_hash2 && hash2 != hash2_any) {
1687 hash2 = hash2_any;
1688 goto start_lookup;
1689 }
1690
1698 /* 1691 /*
1699 * do the slow work with no lock held 1692 * do the slow work with no lock held
1700 */ 1693 */
1701 if (count) { 1694 if (count) {
1702 flush_stack(stack, count, skb, count - 1); 1695 flush_stack(stack, count, skb, count - 1);
1703
1704 for (i = 0; i < count; i++)
1705 sock_put(stack[i]);
1706 } else { 1696 } else {
1707 kfree_skb(skb); 1697 kfree_skb(skb);
1708 } 1698 }
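
The multicast-delivery hunk above replaces the linear walk of the port-only chain: when that chain is long (hslot->count > 10), delivery switches to the hash2 table, probing the (local address, port) bucket first and then, via the start_lookup jump, the (INADDR_ANY, port) bucket, so only plausible receivers are visited and each match is held before the bucket lock is dropped. A rough standalone sketch of the two-pass lookup idea, with stand-in types and a toy hash:

/* Not kernel code: toy model of the hash2 two-pass multicast delivery. */
#include <stdio.h>
#include <stdint.h>

#define NBUCKETS 16
#define MASK (NBUCKETS - 1)

struct sock_stub {
	uint32_t bound_addr;		/* specific address, or 0 for INADDR_ANY */
	uint16_t bound_port;
	struct sock_stub *next;		/* chain within a hash2-style bucket */
};

static struct sock_stub *hash2[NBUCKETS];

static unsigned int portaddr_hash(uint32_t addr, uint16_t port)
{
	return (addr * 2654435761u + port) & MASK;
}

static void deliver_bucket(unsigned int slot, uint32_t daddr, uint16_t dport)
{
	struct sock_stub *s;

	for (s = hash2[slot]; s; s = s->next)
		if (s->bound_port == dport &&
		    (s->bound_addr == daddr || s->bound_addr == 0))
			printf("deliver to socket bound to %u:%u\n",
			       (unsigned)s->bound_addr, (unsigned)s->bound_port);
}

static void mcast_deliver(uint32_t daddr, uint16_t dport)
{
	unsigned int specific = portaddr_hash(daddr, dport);
	unsigned int wildcard = portaddr_hash(0, dport);

	deliver_bucket(specific, daddr, dport);
	if (wildcard != specific)	/* second pass, like the start_lookup jump */
		deliver_bucket(wildcard, daddr, dport);
}

int main(void)
{
	struct sock_stub a = { 0xe0000001, 5000, NULL };	/* 224.0.0.1:5000 */
	struct sock_stub b = { 0,          5000, NULL };	/* *:5000 */
	unsigned int ha = portaddr_hash(a.bound_addr, a.bound_port);
	unsigned int hb = portaddr_hash(b.bound_addr, b.bound_port);

	a.next = hash2[ha]; hash2[ha] = &a;
	b.next = hash2[hb]; hash2[hb] = &b;

	mcast_deliver(0xe0000001, 5000);
	return 0;
}
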
@@ -2526,79 +2516,3 @@ void __init udp_init(void)
2526 sysctl_udp_rmem_min = SK_MEM_QUANTUM; 2516 sysctl_udp_rmem_min = SK_MEM_QUANTUM;
2527 sysctl_udp_wmem_min = SK_MEM_QUANTUM; 2517 sysctl_udp_wmem_min = SK_MEM_QUANTUM;
2528} 2518}
2529
2530struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2531 netdev_features_t features)
2532{
2533 struct sk_buff *segs = ERR_PTR(-EINVAL);
2534 u16 mac_offset = skb->mac_header;
2535 int mac_len = skb->mac_len;
2536 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
2537 __be16 protocol = skb->protocol;
2538 netdev_features_t enc_features;
2539 int udp_offset, outer_hlen;
2540 unsigned int oldlen;
2541 bool need_csum;
2542
2543 oldlen = (u16)~skb->len;
2544
2545 if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
2546 goto out;
2547
2548 skb->encapsulation = 0;
2549 __skb_pull(skb, tnl_hlen);
2550 skb_reset_mac_header(skb);
2551 skb_set_network_header(skb, skb_inner_network_offset(skb));
2552 skb->mac_len = skb_inner_network_offset(skb);
2553 skb->protocol = htons(ETH_P_TEB);
2554
2555 need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
2556 if (need_csum)
2557 skb->encap_hdr_csum = 1;
2558
2559 /* segment inner packet. */
2560 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
2561 segs = skb_mac_gso_segment(skb, enc_features);
2562 if (!segs || IS_ERR(segs)) {
2563 skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
2564 mac_len);
2565 goto out;
2566 }
2567
2568 outer_hlen = skb_tnl_header_len(skb);
2569 udp_offset = outer_hlen - tnl_hlen;
2570 skb = segs;
2571 do {
2572 struct udphdr *uh;
2573 int len;
2574
2575 skb_reset_inner_headers(skb);
2576 skb->encapsulation = 1;
2577
2578 skb->mac_len = mac_len;
2579
2580 skb_push(skb, outer_hlen);
2581 skb_reset_mac_header(skb);
2582 skb_set_network_header(skb, mac_len);
2583 skb_set_transport_header(skb, udp_offset);
2584 len = skb->len - udp_offset;
2585 uh = udp_hdr(skb);
2586 uh->len = htons(len);
2587
2588 if (need_csum) {
2589 __be32 delta = htonl(oldlen + len);
2590
2591 uh->check = ~csum_fold((__force __wsum)
2592 ((__force u32)uh->check +
2593 (__force u32)delta));
2594 uh->check = gso_make_checksum(skb, ~uh->check);
2595
2596 if (uh->check == 0)
2597 uh->check = CSUM_MANGLED_0;
2598 }
2599
2600 skb->protocol = protocol;
2601 } while ((skb = skb->next));
2602out:
2603 return segs;
2604}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 546d2d439dda..59035bc3008d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -47,6 +47,82 @@ static int udp4_ufo_send_check(struct sk_buff *skb)
47 return 0; 47 return 0;
48} 48}
49 49
50struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
51 netdev_features_t features)
52{
53 struct sk_buff *segs = ERR_PTR(-EINVAL);
54 u16 mac_offset = skb->mac_header;
55 int mac_len = skb->mac_len;
56 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
57 __be16 protocol = skb->protocol;
58 netdev_features_t enc_features;
59 int udp_offset, outer_hlen;
60 unsigned int oldlen;
61 bool need_csum;
62
63 oldlen = (u16)~skb->len;
64
65 if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
66 goto out;
67
68 skb->encapsulation = 0;
69 __skb_pull(skb, tnl_hlen);
70 skb_reset_mac_header(skb);
71 skb_set_network_header(skb, skb_inner_network_offset(skb));
72 skb->mac_len = skb_inner_network_offset(skb);
73 skb->protocol = htons(ETH_P_TEB);
74
75 need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
76 if (need_csum)
77 skb->encap_hdr_csum = 1;
78
79 /* segment inner packet. */
80 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
81 segs = skb_mac_gso_segment(skb, enc_features);
82 if (IS_ERR_OR_NULL(segs)) {
83 skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
84 mac_len);
85 goto out;
86 }
87
88 outer_hlen = skb_tnl_header_len(skb);
89 udp_offset = outer_hlen - tnl_hlen;
90 skb = segs;
91 do {
92 struct udphdr *uh;
93 int len;
94
95 skb_reset_inner_headers(skb);
96 skb->encapsulation = 1;
97
98 skb->mac_len = mac_len;
99
100 skb_push(skb, outer_hlen);
101 skb_reset_mac_header(skb);
102 skb_set_network_header(skb, mac_len);
103 skb_set_transport_header(skb, udp_offset);
104 len = skb->len - udp_offset;
105 uh = udp_hdr(skb);
106 uh->len = htons(len);
107
108 if (need_csum) {
109 __be32 delta = htonl(oldlen + len);
110
111 uh->check = ~csum_fold((__force __wsum)
112 ((__force u32)uh->check +
113 (__force u32)delta));
114 uh->check = gso_make_checksum(skb, ~uh->check);
115
116 if (uh->check == 0)
117 uh->check = CSUM_MANGLED_0;
118 }
119
120 skb->protocol = protocol;
121 } while ((skb = skb->next));
122out:
123 return segs;
124}
125
50static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, 126static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
51 netdev_features_t features) 127 netdev_features_t features)
52{ 128{
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
new file mode 100644
index 000000000000..61ec1a65207e
--- /dev/null
+++ b/net/ipv4/udp_tunnel.c
@@ -0,0 +1,100 @@
1#include <linux/module.h>
2#include <linux/errno.h>
3#include <linux/socket.h>
4#include <linux/udp.h>
5#include <linux/types.h>
6#include <linux/kernel.h>
7#include <net/udp.h>
8#include <net/udp_tunnel.h>
9#include <net/net_namespace.h>
10
11int udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
12 struct socket **sockp)
13{
14 int err = -EINVAL;
15 struct socket *sock = NULL;
16
17#if IS_ENABLED(CONFIG_IPV6)
18 if (cfg->family == AF_INET6) {
19 struct sockaddr_in6 udp6_addr;
20
21 err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock);
22 if (err < 0)
23 goto error;
24
25 sk_change_net(sock->sk, net);
26
27 udp6_addr.sin6_family = AF_INET6;
28 memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
29 sizeof(udp6_addr.sin6_addr));
30 udp6_addr.sin6_port = cfg->local_udp_port;
31 err = kernel_bind(sock, (struct sockaddr *)&udp6_addr,
32 sizeof(udp6_addr));
33 if (err < 0)
34 goto error;
35
36 if (cfg->peer_udp_port) {
37 udp6_addr.sin6_family = AF_INET6;
38 memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
39 sizeof(udp6_addr.sin6_addr));
40 udp6_addr.sin6_port = cfg->peer_udp_port;
41 err = kernel_connect(sock,
42 (struct sockaddr *)&udp6_addr,
43 sizeof(udp6_addr), 0);
44 }
45 if (err < 0)
46 goto error;
47
48 udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums);
49 udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums);
50 } else
51#endif
52 if (cfg->family == AF_INET) {
53 struct sockaddr_in udp_addr;
54
55 err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
56 if (err < 0)
57 goto error;
58
59 sk_change_net(sock->sk, net);
60
61 udp_addr.sin_family = AF_INET;
62 udp_addr.sin_addr = cfg->local_ip;
63 udp_addr.sin_port = cfg->local_udp_port;
64 err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
65 sizeof(udp_addr));
66 if (err < 0)
67 goto error;
68
69 if (cfg->peer_udp_port) {
70 udp_addr.sin_family = AF_INET;
71 udp_addr.sin_addr = cfg->peer_ip;
72 udp_addr.sin_port = cfg->peer_udp_port;
73 err = kernel_connect(sock,
74 (struct sockaddr *)&udp_addr,
75 sizeof(udp_addr), 0);
76 if (err < 0)
77 goto error;
78 }
79
80 sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
81 } else {
82 return -EPFNOSUPPORT;
83 }
84
85
86 *sockp = sock;
87
88 return 0;
89
90error:
91 if (sock) {
92 kernel_sock_shutdown(sock, SHUT_RDWR);
93 sk_release_kernel(sock->sk);
94 }
95 *sockp = NULL;
96 return err;
97}
98EXPORT_SYMBOL(udp_sock_create);
99
100MODULE_LICENSE("GPL");
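
udp_sock_create() above gives tunnel drivers a single helper for creating and binding their kernel UDP socket instead of open-coding sock_create_kern()/kernel_bind(). A hedged sketch of how a caller might use it, based only on the udp_port_cfg fields visible in the code above; tnl_open()/tnl_close() are invented names, this is a kernel-side fragment rather than a standalone program, and the encapsulation/receive setup a real tunnel driver needs is omitted:

/* Hedged caller sketch for the new helper; fields as used in udp_tunnel.c. */
#include <net/udp_tunnel.h>
#include <net/net_namespace.h>

static struct socket *tnl_sock;

static int tnl_open(struct net *net)
{
	struct udp_port_cfg cfg = {
		.family            = AF_INET,
		.local_ip          = { .s_addr = 0 },	/* wildcard bind */
		.local_udp_port    = htons(4789),	/* example port */
		.use_udp_checksums = true,
	};

	return udp_sock_create(net, &cfg, &tnl_sock);
}

static void tnl_close(void)
{
	if (tnl_sock) {
		kernel_sock_shutdown(tnl_sock, SHUT_RDWR);
		sk_release_kernel(tnl_sock->sk);
		tnl_sock = NULL;
	}
}
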
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index a2ce0101eaac..dccefa9d84cf 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -124,7 +124,7 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
124 124
125 for_each_protocol_rcu(ah4_handlers, handler) 125 for_each_protocol_rcu(ah4_handlers, handler)
126 if ((ret = handler->handler(skb)) != -EINVAL) 126 if ((ret = handler->handler(skb)) != -EINVAL)
127 return ret;; 127 return ret;
128 128
129 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 129 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
130 130