aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig4
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c3
-rw-r--r--net/ipv4/datagram.c1
-rw-r--r--net/ipv4/devinet.c36
-rw-r--r--net/ipv4/gre_demux.c1
-rw-r--r--net/ipv4/gre_offload.c5
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/igmp.c12
-rw-r--r--net/ipv4/inet_fragment.c283
-rw-r--r--net/ipv4/ip_fragment.c56
-rw-r--r--net/ipv4/ip_options.c4
-rw-r--r--net/ipv4/ip_output.c7
-rw-r--r--net/ipv4/ip_sockglue.c2
-rw-r--r--net/ipv4/ip_tunnel.c28
-rw-r--r--net/ipv4/ipconfig.c5
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/netfilter/Kconfig29
-rw-r--r--net/ipv4/netfilter/Makefile4
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c498
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c4
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c149
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c385
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c2
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/raw.c9
-rw-r--r--net/ipv4/route.c47
-rw-r--r--net/ipv4/syncookies.c3
-rw-r--r--net/ipv4/tcp.c3
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_input.c175
-rw-r--r--net/ipv4/tcp_ipv4.c193
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_offload.c2
-rw-r--r--net/ipv4/tcp_output.c24
-rw-r--r--net/ipv4/udp.c159
-rw-r--r--net/ipv4/udp_offload.c76
-rw-r--r--net/ipv4/udp_tunnel.c100
42 files changed, 1342 insertions, 999 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 05c57f0fcabe..dbc10d84161f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -307,6 +307,10 @@ config NET_IPVTI
307 the notion of a secure tunnel for IPSEC and then use routing protocol 307 the notion of a secure tunnel for IPSEC and then use routing protocol
308 on top. 308 on top.
309 309
310config NET_UDP_TUNNEL
311 tristate
312 default n
313
310config INET_AH 314config INET_AH
311 tristate "IP: AH transformation" 315 tristate "IP: AH transformation"
312 select XFRM_ALGO 316 select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f032688d20d3..8ee1cd4053ee 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_NET_IPIP) += ipip.o
22gre-y := gre_demux.o 22gre-y := gre_demux.o
23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o 23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
24obj-$(CONFIG_NET_IPGRE) += ip_gre.o 24obj-$(CONFIG_NET_IPGRE) += ip_gre.o
25obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
25obj-$(CONFIG_NET_IPVTI) += ip_vti.o 26obj-$(CONFIG_NET_IPVTI) += ip_vti.o
26obj-$(CONFIG_SYN_COOKIES) += syncookies.o 27obj-$(CONFIG_SYN_COOKIES) += syncookies.o
27obj-$(CONFIG_INET_AH) += ah4.o 28obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d5e6836cf772..d156b3c5f363 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1429,6 +1429,9 @@ static int inet_gro_complete(struct sk_buff *skb, int nhoff)
1429 int proto = iph->protocol; 1429 int proto = iph->protocol;
1430 int err = -ENOSYS; 1430 int err = -ENOSYS;
1431 1431
1432 if (skb->encapsulation)
1433 skb_set_inner_network_header(skb, nhoff);
1434
1432 csum_replace2(&iph->check, iph->tot_len, newlen); 1435 csum_replace2(&iph->check, iph->tot_len, newlen);
1433 iph->tot_len = newlen; 1436 iph->tot_len = newlen;
1434 1437
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index a3095fdefbed..90c0e8386116 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
76 inet->inet_daddr = fl4->daddr; 76 inet->inet_daddr = fl4->daddr;
77 inet->inet_dport = usin->sin_port; 77 inet->inet_dport = usin->sin_port;
78 sk->sk_state = TCP_ESTABLISHED; 78 sk->sk_state = TCP_ESTABLISHED;
79 inet_set_txhash(sk);
79 inet->inet_id = jiffies; 80 inet->inet_id = jiffies;
80 81
81 sk_dst_set(sk, &rt->dst); 82 sk_dst_set(sk, &rt->dst);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e9449376b58e..214882e7d6de 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -180,11 +180,12 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
180static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, 180static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
181 int destroy); 181 int destroy);
182#ifdef CONFIG_SYSCTL 182#ifdef CONFIG_SYSCTL
183static void devinet_sysctl_register(struct in_device *idev); 183static int devinet_sysctl_register(struct in_device *idev);
184static void devinet_sysctl_unregister(struct in_device *idev); 184static void devinet_sysctl_unregister(struct in_device *idev);
185#else 185#else
186static void devinet_sysctl_register(struct in_device *idev) 186static int devinet_sysctl_register(struct in_device *idev)
187{ 187{
188 return 0;
188} 189}
189static void devinet_sysctl_unregister(struct in_device *idev) 190static void devinet_sysctl_unregister(struct in_device *idev)
190{ 191{
@@ -232,6 +233,7 @@ EXPORT_SYMBOL(in_dev_finish_destroy);
232static struct in_device *inetdev_init(struct net_device *dev) 233static struct in_device *inetdev_init(struct net_device *dev)
233{ 234{
234 struct in_device *in_dev; 235 struct in_device *in_dev;
236 int err = -ENOMEM;
235 237
236 ASSERT_RTNL(); 238 ASSERT_RTNL();
237 239
@@ -252,7 +254,13 @@ static struct in_device *inetdev_init(struct net_device *dev)
252 /* Account for reference dev->ip_ptr (below) */ 254 /* Account for reference dev->ip_ptr (below) */
253 in_dev_hold(in_dev); 255 in_dev_hold(in_dev);
254 256
255 devinet_sysctl_register(in_dev); 257 err = devinet_sysctl_register(in_dev);
258 if (err) {
259 in_dev->dead = 1;
260 in_dev_put(in_dev);
261 in_dev = NULL;
262 goto out;
263 }
256 ip_mc_init_dev(in_dev); 264 ip_mc_init_dev(in_dev);
257 if (dev->flags & IFF_UP) 265 if (dev->flags & IFF_UP)
258 ip_mc_up(in_dev); 266 ip_mc_up(in_dev);
@@ -260,7 +268,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
260 /* we can receive as soon as ip_ptr is set -- do this last */ 268 /* we can receive as soon as ip_ptr is set -- do this last */
261 rcu_assign_pointer(dev->ip_ptr, in_dev); 269 rcu_assign_pointer(dev->ip_ptr, in_dev);
262out: 270out:
263 return in_dev; 271 return in_dev ?: ERR_PTR(err);
264out_kfree: 272out_kfree:
265 kfree(in_dev); 273 kfree(in_dev);
266 in_dev = NULL; 274 in_dev = NULL;
@@ -1347,8 +1355,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1347 if (!in_dev) { 1355 if (!in_dev) {
1348 if (event == NETDEV_REGISTER) { 1356 if (event == NETDEV_REGISTER) {
1349 in_dev = inetdev_init(dev); 1357 in_dev = inetdev_init(dev);
1350 if (!in_dev) 1358 if (IS_ERR(in_dev))
1351 return notifier_from_errno(-ENOMEM); 1359 return notifier_from_errno(PTR_ERR(in_dev));
1352 if (dev->flags & IFF_LOOPBACK) { 1360 if (dev->flags & IFF_LOOPBACK) {
1353 IN_DEV_CONF_SET(in_dev, NOXFRM, 1); 1361 IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
1354 IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); 1362 IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
@@ -2182,11 +2190,21 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
2182 kfree(t); 2190 kfree(t);
2183} 2191}
2184 2192
2185static void devinet_sysctl_register(struct in_device *idev) 2193static int devinet_sysctl_register(struct in_device *idev)
2186{ 2194{
2187 neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); 2195 int err;
2188 __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, 2196
2197 if (!sysctl_dev_name_is_allowed(idev->dev->name))
2198 return -EINVAL;
2199
2200 err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
2201 if (err)
2202 return err;
2203 err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
2189 &idev->cnf); 2204 &idev->cnf);
2205 if (err)
2206 neigh_sysctl_unregister(idev->arp_parms);
2207 return err;
2190} 2208}
2191 2209
2192static void devinet_sysctl_unregister(struct in_device *idev) 2210static void devinet_sysctl_unregister(struct in_device *idev)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 4e9619bca732..0485bf7f8f03 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -68,6 +68,7 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
68 68
69 skb_push(skb, hdr_len); 69 skb_push(skb, hdr_len);
70 70
71 skb_reset_transport_header(skb);
71 greh = (struct gre_base_hdr *)skb->data; 72 greh = (struct gre_base_hdr *)skb->data;
72 greh->flags = tnl_flags_to_gre_flags(tpi->flags); 73 greh->flags = tnl_flags_to_gre_flags(tpi->flags);
73 greh->protocol = tpi->proto; 74 greh->protocol = tpi->proto;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index eb92deb12666..6556263c8fa5 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -74,7 +74,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
74 /* segment inner packet. */ 74 /* segment inner packet. */
75 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); 75 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
76 segs = skb_mac_gso_segment(skb, enc_features); 76 segs = skb_mac_gso_segment(skb, enc_features);
77 if (!segs || IS_ERR(segs)) { 77 if (IS_ERR_OR_NULL(segs)) {
78 skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); 78 skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
79 goto out; 79 goto out;
80 } 80 }
@@ -263,6 +263,9 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
263 int err = -ENOENT; 263 int err = -ENOENT;
264 __be16 type; 264 __be16 type;
265 265
266 skb->encapsulation = 1;
267 skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
268
266 type = greh->protocol; 269 type = greh->protocol;
267 if (greh->flags & GRE_KEY) 270 if (greh->flags & GRE_KEY)
268 grehlen += GRE_HEADER_SECTION; 271 grehlen += GRE_HEADER_SECTION;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 79c3d947a481..42b7bcf8045b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -739,8 +739,6 @@ static void icmp_unreach(struct sk_buff *skb)
739 /* fall through */ 739 /* fall through */
740 case 0: 740 case 0:
741 info = ntohs(icmph->un.frag.mtu); 741 info = ntohs(icmph->un.frag.mtu);
742 if (!info)
743 goto out;
744 } 742 }
745 break; 743 break;
746 case ICMP_SR_FAILED: 744 case ICMP_SR_FAILED:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6748d420f714..f10eab462282 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1321,7 +1321,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1321 atomic_set(&im->refcnt, 1); 1321 atomic_set(&im->refcnt, 1);
1322 spin_lock_init(&im->lock); 1322 spin_lock_init(&im->lock);
1323#ifdef CONFIG_IP_MULTICAST 1323#ifdef CONFIG_IP_MULTICAST
1324 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); 1324 setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
1325 im->unsolicit_count = IGMP_Unsolicited_Report_Count; 1325 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1326#endif 1326#endif
1327 1327
@@ -1944,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1944 1944
1945 rtnl_lock(); 1945 rtnl_lock();
1946 in_dev = ip_mc_find_dev(net, imr); 1946 in_dev = ip_mc_find_dev(net, imr);
1947 if (!in_dev) {
1948 ret = -ENODEV;
1949 goto out;
1950 }
1947 ifindex = imr->imr_ifindex; 1951 ifindex = imr->imr_ifindex;
1948 for (imlp = &inet->mc_list; 1952 for (imlp = &inet->mc_list;
1949 (iml = rtnl_dereference(*imlp)) != NULL; 1953 (iml = rtnl_dereference(*imlp)) != NULL;
@@ -1961,16 +1965,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1961 1965
1962 *imlp = iml->next_rcu; 1966 *imlp = iml->next_rcu;
1963 1967
1964 if (in_dev) 1968 ip_mc_dec_group(in_dev, group);
1965 ip_mc_dec_group(in_dev, group);
1966 rtnl_unlock(); 1969 rtnl_unlock();
1967 /* decrease mem now to avoid the memleak warning */ 1970 /* decrease mem now to avoid the memleak warning */
1968 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); 1971 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
1969 kfree_rcu(iml, rcu); 1972 kfree_rcu(iml, rcu);
1970 return 0; 1973 return 0;
1971 } 1974 }
1972 if (!in_dev) 1975out:
1973 ret = -ENODEV;
1974 rtnl_unlock(); 1976 rtnl_unlock();
1975 return ret; 1977 return ret;
1976} 1978}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 3b01959bf4bb..62b1f73749dc 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,6 +25,12 @@
25#include <net/inet_frag.h> 25#include <net/inet_frag.h>
26#include <net/inet_ecn.h> 26#include <net/inet_ecn.h>
27 27
28#define INETFRAGS_EVICT_BUCKETS 128
29#define INETFRAGS_EVICT_MAX 512
30
31/* don't rebuild inetfrag table with new secret more often than this */
32#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
33
28/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements 34/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
29 * Value : 0xff if frame should be dropped. 35 * Value : 0xff if frame should be dropped.
30 * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field 36 * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -46,24 +52,39 @@ const u8 ip_frag_ecn_table[16] = {
46}; 52};
47EXPORT_SYMBOL(ip_frag_ecn_table); 53EXPORT_SYMBOL(ip_frag_ecn_table);
48 54
49static void inet_frag_secret_rebuild(unsigned long dummy) 55static unsigned int
56inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
57{
58 return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
59}
60
61static bool inet_frag_may_rebuild(struct inet_frags *f)
62{
63 return time_after(jiffies,
64 f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
65}
66
67static void inet_frag_secret_rebuild(struct inet_frags *f)
50{ 68{
51 struct inet_frags *f = (struct inet_frags *)dummy;
52 unsigned long now = jiffies;
53 int i; 69 int i;
54 70
55 /* Per bucket lock NOT needed here, due to write lock protection */ 71 write_seqlock_bh(&f->rnd_seqlock);
56 write_lock(&f->lock); 72
73 if (!inet_frag_may_rebuild(f))
74 goto out;
57 75
58 get_random_bytes(&f->rnd, sizeof(u32)); 76 get_random_bytes(&f->rnd, sizeof(u32));
77
59 for (i = 0; i < INETFRAGS_HASHSZ; i++) { 78 for (i = 0; i < INETFRAGS_HASHSZ; i++) {
60 struct inet_frag_bucket *hb; 79 struct inet_frag_bucket *hb;
61 struct inet_frag_queue *q; 80 struct inet_frag_queue *q;
62 struct hlist_node *n; 81 struct hlist_node *n;
63 82
64 hb = &f->hash[i]; 83 hb = &f->hash[i];
84 spin_lock(&hb->chain_lock);
85
65 hlist_for_each_entry_safe(q, n, &hb->chain, list) { 86 hlist_for_each_entry_safe(q, n, &hb->chain, list) {
66 unsigned int hval = f->hashfn(q); 87 unsigned int hval = inet_frag_hashfn(f, q);
67 88
68 if (hval != i) { 89 if (hval != i) {
69 struct inet_frag_bucket *hb_dest; 90 struct inet_frag_bucket *hb_dest;
@@ -72,76 +93,195 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
72 93
73 /* Relink to new hash chain. */ 94 /* Relink to new hash chain. */
74 hb_dest = &f->hash[hval]; 95 hb_dest = &f->hash[hval];
96
97 /* This is the only place where we take
98 * another chain_lock while already holding
99 * one. As this will not run concurrently,
100 * we cannot deadlock on hb_dest lock below, if its
101 * already locked it will be released soon since
102 * other caller cannot be waiting for hb lock
103 * that we've taken above.
104 */
105 spin_lock_nested(&hb_dest->chain_lock,
106 SINGLE_DEPTH_NESTING);
75 hlist_add_head(&q->list, &hb_dest->chain); 107 hlist_add_head(&q->list, &hb_dest->chain);
108 spin_unlock(&hb_dest->chain_lock);
76 } 109 }
77 } 110 }
111 spin_unlock(&hb->chain_lock);
78 } 112 }
79 write_unlock(&f->lock);
80 113
81 mod_timer(&f->secret_timer, now + f->secret_interval); 114 f->rebuild = false;
115 f->last_rebuild_jiffies = jiffies;
116out:
117 write_sequnlock_bh(&f->rnd_seqlock);
118}
119
120static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
121{
122 return q->net->low_thresh == 0 ||
123 frag_mem_limit(q->net) >= q->net->low_thresh;
124}
125
126static unsigned int
127inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
128{
129 struct inet_frag_queue *fq;
130 struct hlist_node *n;
131 unsigned int evicted = 0;
132 HLIST_HEAD(expired);
133
134evict_again:
135 spin_lock(&hb->chain_lock);
136
137 hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
138 if (!inet_fragq_should_evict(fq))
139 continue;
140
141 if (!del_timer(&fq->timer)) {
142 /* q expiring right now thus increment its refcount so
143 * it won't be freed under us and wait until the timer
144 * has finished executing then destroy it
145 */
146 atomic_inc(&fq->refcnt);
147 spin_unlock(&hb->chain_lock);
148 del_timer_sync(&fq->timer);
149 WARN_ON(atomic_read(&fq->refcnt) != 1);
150 inet_frag_put(fq, f);
151 goto evict_again;
152 }
153
154 /* suppress xmit of (icmp) error packet */
155 fq->last_in &= ~INET_FRAG_FIRST_IN;
156 fq->last_in |= INET_FRAG_EVICTED;
157 hlist_del(&fq->list);
158 hlist_add_head(&fq->list, &expired);
159 ++evicted;
160 }
161
162 spin_unlock(&hb->chain_lock);
163
164 hlist_for_each_entry_safe(fq, n, &expired, list)
165 f->frag_expire((unsigned long) fq);
166
167 return evicted;
168}
169
170static void inet_frag_worker(struct work_struct *work)
171{
172 unsigned int budget = INETFRAGS_EVICT_BUCKETS;
173 unsigned int i, evicted = 0;
174 struct inet_frags *f;
175
176 f = container_of(work, struct inet_frags, frags_work);
177
178 BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
179
180 local_bh_disable();
181
182 for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
183 evicted += inet_evict_bucket(f, &f->hash[i]);
184 i = (i + 1) & (INETFRAGS_HASHSZ - 1);
185 if (evicted > INETFRAGS_EVICT_MAX)
186 break;
187 }
188
189 f->next_bucket = i;
190
191 local_bh_enable();
192
193 if (f->rebuild && inet_frag_may_rebuild(f))
194 inet_frag_secret_rebuild(f);
195}
196
197static void inet_frag_schedule_worker(struct inet_frags *f)
198{
199 if (unlikely(!work_pending(&f->frags_work)))
200 schedule_work(&f->frags_work);
82} 201}
83 202
84void inet_frags_init(struct inet_frags *f) 203void inet_frags_init(struct inet_frags *f)
85{ 204{
86 int i; 205 int i;
87 206
207 INIT_WORK(&f->frags_work, inet_frag_worker);
208
88 for (i = 0; i < INETFRAGS_HASHSZ; i++) { 209 for (i = 0; i < INETFRAGS_HASHSZ; i++) {
89 struct inet_frag_bucket *hb = &f->hash[i]; 210 struct inet_frag_bucket *hb = &f->hash[i];
90 211
91 spin_lock_init(&hb->chain_lock); 212 spin_lock_init(&hb->chain_lock);
92 INIT_HLIST_HEAD(&hb->chain); 213 INIT_HLIST_HEAD(&hb->chain);
93 } 214 }
94 rwlock_init(&f->lock);
95 215
96 setup_timer(&f->secret_timer, inet_frag_secret_rebuild, 216 seqlock_init(&f->rnd_seqlock);
97 (unsigned long)f); 217 f->last_rebuild_jiffies = 0;
98 f->secret_timer.expires = jiffies + f->secret_interval;
99 add_timer(&f->secret_timer);
100} 218}
101EXPORT_SYMBOL(inet_frags_init); 219EXPORT_SYMBOL(inet_frags_init);
102 220
103void inet_frags_init_net(struct netns_frags *nf) 221void inet_frags_init_net(struct netns_frags *nf)
104{ 222{
105 nf->nqueues = 0;
106 init_frag_mem_limit(nf); 223 init_frag_mem_limit(nf);
107 INIT_LIST_HEAD(&nf->lru_list);
108 spin_lock_init(&nf->lru_lock);
109} 224}
110EXPORT_SYMBOL(inet_frags_init_net); 225EXPORT_SYMBOL(inet_frags_init_net);
111 226
112void inet_frags_fini(struct inet_frags *f) 227void inet_frags_fini(struct inet_frags *f)
113{ 228{
114 del_timer(&f->secret_timer); 229 cancel_work_sync(&f->frags_work);
115} 230}
116EXPORT_SYMBOL(inet_frags_fini); 231EXPORT_SYMBOL(inet_frags_fini);
117 232
118void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) 233void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
119{ 234{
120 nf->low_thresh = 0; 235 unsigned int seq;
236 int i;
121 237
238 nf->low_thresh = 0;
122 local_bh_disable(); 239 local_bh_disable();
123 inet_frag_evictor(nf, f, true); 240
241evict_again:
242 seq = read_seqbegin(&f->rnd_seqlock);
243
244 for (i = 0; i < INETFRAGS_HASHSZ ; i++)
245 inet_evict_bucket(f, &f->hash[i]);
246
247 if (read_seqretry(&f->rnd_seqlock, seq))
248 goto evict_again;
249
124 local_bh_enable(); 250 local_bh_enable();
125 251
126 percpu_counter_destroy(&nf->mem); 252 percpu_counter_destroy(&nf->mem);
127} 253}
128EXPORT_SYMBOL(inet_frags_exit_net); 254EXPORT_SYMBOL(inet_frags_exit_net);
129 255
130static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) 256static struct inet_frag_bucket *
257get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
258__acquires(hb->chain_lock)
131{ 259{
132 struct inet_frag_bucket *hb; 260 struct inet_frag_bucket *hb;
133 unsigned int hash; 261 unsigned int seq, hash;
262
263 restart:
264 seq = read_seqbegin(&f->rnd_seqlock);
134 265
135 read_lock(&f->lock); 266 hash = inet_frag_hashfn(f, fq);
136 hash = f->hashfn(fq);
137 hb = &f->hash[hash]; 267 hb = &f->hash[hash];
138 268
139 spin_lock(&hb->chain_lock); 269 spin_lock(&hb->chain_lock);
270 if (read_seqretry(&f->rnd_seqlock, seq)) {
271 spin_unlock(&hb->chain_lock);
272 goto restart;
273 }
274
275 return hb;
276}
277
278static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
279{
280 struct inet_frag_bucket *hb;
281
282 hb = get_frag_bucket_locked(fq, f);
140 hlist_del(&fq->list); 283 hlist_del(&fq->list);
141 spin_unlock(&hb->chain_lock); 284 spin_unlock(&hb->chain_lock);
142
143 read_unlock(&f->lock);
144 inet_frag_lru_del(fq);
145} 285}
146 286
147void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) 287void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -165,8 +305,7 @@ static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
165 kfree_skb(skb); 305 kfree_skb(skb);
166} 306}
167 307
168void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, 308void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
169 int *work)
170{ 309{
171 struct sk_buff *fp; 310 struct sk_buff *fp;
172 struct netns_frags *nf; 311 struct netns_frags *nf;
@@ -186,86 +325,30 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
186 fp = xp; 325 fp = xp;
187 } 326 }
188 sum = sum_truesize + f->qsize; 327 sum = sum_truesize + f->qsize;
189 if (work)
190 *work -= sum;
191 sub_frag_mem_limit(q, sum); 328 sub_frag_mem_limit(q, sum);
192 329
193 if (f->destructor) 330 if (f->destructor)
194 f->destructor(q); 331 f->destructor(q);
195 kfree(q); 332 kfree(q);
196
197} 333}
198EXPORT_SYMBOL(inet_frag_destroy); 334EXPORT_SYMBOL(inet_frag_destroy);
199 335
200int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
201{
202 struct inet_frag_queue *q;
203 int work, evicted = 0;
204
205 if (!force) {
206 if (frag_mem_limit(nf) <= nf->high_thresh)
207 return 0;
208 }
209
210 work = frag_mem_limit(nf) - nf->low_thresh;
211 while (work > 0 || force) {
212 spin_lock(&nf->lru_lock);
213
214 if (list_empty(&nf->lru_list)) {
215 spin_unlock(&nf->lru_lock);
216 break;
217 }
218
219 q = list_first_entry(&nf->lru_list,
220 struct inet_frag_queue, lru_list);
221 atomic_inc(&q->refcnt);
222 /* Remove q from list to avoid several CPUs grabbing it */
223 list_del_init(&q->lru_list);
224
225 spin_unlock(&nf->lru_lock);
226
227 spin_lock(&q->lock);
228 if (!(q->last_in & INET_FRAG_COMPLETE))
229 inet_frag_kill(q, f);
230 spin_unlock(&q->lock);
231
232 if (atomic_dec_and_test(&q->refcnt))
233 inet_frag_destroy(q, f, &work);
234 evicted++;
235 }
236
237 return evicted;
238}
239EXPORT_SYMBOL(inet_frag_evictor);
240
241static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, 336static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
242 struct inet_frag_queue *qp_in, struct inet_frags *f, 337 struct inet_frag_queue *qp_in, struct inet_frags *f,
243 void *arg) 338 void *arg)
244{ 339{
245 struct inet_frag_bucket *hb; 340 struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
246 struct inet_frag_queue *qp; 341 struct inet_frag_queue *qp;
247 unsigned int hash;
248
249 read_lock(&f->lock); /* Protects against hash rebuild */
250 /*
251 * While we stayed w/o the lock other CPU could update
252 * the rnd seed, so we need to re-calculate the hash
253 * chain. Fortunatelly the qp_in can be used to get one.
254 */
255 hash = f->hashfn(qp_in);
256 hb = &f->hash[hash];
257 spin_lock(&hb->chain_lock);
258 342
259#ifdef CONFIG_SMP 343#ifdef CONFIG_SMP
260 /* With SMP race we have to recheck hash table, because 344 /* With SMP race we have to recheck hash table, because
261 * such entry could be created on other cpu, while we 345 * such entry could have been created on other cpu before
262 * released the hash bucket lock. 346 * we acquired hash bucket lock.
263 */ 347 */
264 hlist_for_each_entry(qp, &hb->chain, list) { 348 hlist_for_each_entry(qp, &hb->chain, list) {
265 if (qp->net == nf && f->match(qp, arg)) { 349 if (qp->net == nf && f->match(qp, arg)) {
266 atomic_inc(&qp->refcnt); 350 atomic_inc(&qp->refcnt);
267 spin_unlock(&hb->chain_lock); 351 spin_unlock(&hb->chain_lock);
268 read_unlock(&f->lock);
269 qp_in->last_in |= INET_FRAG_COMPLETE; 352 qp_in->last_in |= INET_FRAG_COMPLETE;
270 inet_frag_put(qp_in, f); 353 inet_frag_put(qp_in, f);
271 return qp; 354 return qp;
@@ -278,9 +361,8 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
278 361
279 atomic_inc(&qp->refcnt); 362 atomic_inc(&qp->refcnt);
280 hlist_add_head(&qp->list, &hb->chain); 363 hlist_add_head(&qp->list, &hb->chain);
281 inet_frag_lru_add(nf, qp); 364
282 spin_unlock(&hb->chain_lock); 365 spin_unlock(&hb->chain_lock);
283 read_unlock(&f->lock);
284 366
285 return qp; 367 return qp;
286} 368}
@@ -290,6 +372,11 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
290{ 372{
291 struct inet_frag_queue *q; 373 struct inet_frag_queue *q;
292 374
375 if (frag_mem_limit(nf) > nf->high_thresh) {
376 inet_frag_schedule_worker(f);
377 return NULL;
378 }
379
293 q = kzalloc(f->qsize, GFP_ATOMIC); 380 q = kzalloc(f->qsize, GFP_ATOMIC);
294 if (q == NULL) 381 if (q == NULL)
295 return NULL; 382 return NULL;
@@ -301,7 +388,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
301 setup_timer(&q->timer, f->frag_expire, (unsigned long)q); 388 setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
302 spin_lock_init(&q->lock); 389 spin_lock_init(&q->lock);
303 atomic_set(&q->refcnt, 1); 390 atomic_set(&q->refcnt, 1);
304 INIT_LIST_HEAD(&q->lru_list);
305 391
306 return q; 392 return q;
307} 393}
@@ -320,12 +406,15 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
320 406
321struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, 407struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
322 struct inet_frags *f, void *key, unsigned int hash) 408 struct inet_frags *f, void *key, unsigned int hash)
323 __releases(&f->lock)
324{ 409{
325 struct inet_frag_bucket *hb; 410 struct inet_frag_bucket *hb;
326 struct inet_frag_queue *q; 411 struct inet_frag_queue *q;
327 int depth = 0; 412 int depth = 0;
328 413
414 if (frag_mem_limit(nf) > nf->low_thresh)
415 inet_frag_schedule_worker(f);
416
417 hash &= (INETFRAGS_HASHSZ - 1);
329 hb = &f->hash[hash]; 418 hb = &f->hash[hash];
330 419
331 spin_lock(&hb->chain_lock); 420 spin_lock(&hb->chain_lock);
@@ -333,18 +422,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
333 if (q->net == nf && f->match(q, key)) { 422 if (q->net == nf && f->match(q, key)) {
334 atomic_inc(&q->refcnt); 423 atomic_inc(&q->refcnt);
335 spin_unlock(&hb->chain_lock); 424 spin_unlock(&hb->chain_lock);
336 read_unlock(&f->lock);
337 return q; 425 return q;
338 } 426 }
339 depth++; 427 depth++;
340 } 428 }
341 spin_unlock(&hb->chain_lock); 429 spin_unlock(&hb->chain_lock);
342 read_unlock(&f->lock);
343 430
344 if (depth <= INETFRAGS_MAXDEPTH) 431 if (depth <= INETFRAGS_MAXDEPTH)
345 return inet_frag_create(nf, f, key); 432 return inet_frag_create(nf, f, key);
346 else 433
347 return ERR_PTR(-ENOBUFS); 434 if (inet_frag_may_rebuild(f)) {
435 if (!f->rebuild)
436 f->rebuild = true;
437 inet_frag_schedule_worker(f);
438 }
439
440 return ERR_PTR(-ENOBUFS);
348} 441}
349EXPORT_SYMBOL(inet_frag_find); 442EXPORT_SYMBOL(inet_frag_find);
350 443
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ed32313e307c..634fc31aa243 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -86,11 +86,6 @@ static inline u8 ip4_frag_ecn(u8 tos)
86 86
87static struct inet_frags ip4_frags; 87static struct inet_frags ip4_frags;
88 88
89int ip_frag_nqueues(struct net *net)
90{
91 return net->ipv4.frags.nqueues;
92}
93
94int ip_frag_mem(struct net *net) 89int ip_frag_mem(struct net *net)
95{ 90{
96 return sum_frag_mem_limit(&net->ipv4.frags); 91 return sum_frag_mem_limit(&net->ipv4.frags);
@@ -109,21 +104,21 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
109 net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); 104 net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
110 return jhash_3words((__force u32)id << 16 | prot, 105 return jhash_3words((__force u32)id << 16 | prot,
111 (__force u32)saddr, (__force u32)daddr, 106 (__force u32)saddr, (__force u32)daddr,
112 ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); 107 ip4_frags.rnd);
113} 108}
114 109
115static unsigned int ip4_hashfn(struct inet_frag_queue *q) 110static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
116{ 111{
117 struct ipq *ipq; 112 const struct ipq *ipq;
118 113
119 ipq = container_of(q, struct ipq, q); 114 ipq = container_of(q, struct ipq, q);
120 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); 115 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
121} 116}
122 117
123static bool ip4_frag_match(struct inet_frag_queue *q, void *a) 118static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
124{ 119{
125 struct ipq *qp; 120 const struct ipq *qp;
126 struct ip4_create_arg *arg = a; 121 const struct ip4_create_arg *arg = a;
127 122
128 qp = container_of(q, struct ipq, q); 123 qp = container_of(q, struct ipq, q);
129 return qp->id == arg->iph->id && 124 return qp->id == arg->iph->id &&
@@ -133,14 +128,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
133 qp->user == arg->user; 128 qp->user == arg->user;
134} 129}
135 130
136static void ip4_frag_init(struct inet_frag_queue *q, void *a) 131static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
137{ 132{
138 struct ipq *qp = container_of(q, struct ipq, q); 133 struct ipq *qp = container_of(q, struct ipq, q);
139 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, 134 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
140 frags); 135 frags);
141 struct net *net = container_of(ipv4, struct net, ipv4); 136 struct net *net = container_of(ipv4, struct net, ipv4);
142 137
143 struct ip4_create_arg *arg = a; 138 const struct ip4_create_arg *arg = a;
144 139
145 qp->protocol = arg->iph->protocol; 140 qp->protocol = arg->iph->protocol;
146 qp->id = arg->iph->id; 141 qp->id = arg->iph->id;
@@ -177,18 +172,6 @@ static void ipq_kill(struct ipq *ipq)
177 inet_frag_kill(&ipq->q, &ip4_frags); 172 inet_frag_kill(&ipq->q, &ip4_frags);
178} 173}
179 174
180/* Memory limiting on fragments. Evictor trashes the oldest
181 * fragment queue until we are back under the threshold.
182 */
183static void ip_evictor(struct net *net)
184{
185 int evicted;
186
187 evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
188 if (evicted)
189 IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
190}
191
192/* 175/*
193 * Oops, a fragment queue timed out. Kill it and send an ICMP reply. 176 * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
194 */ 177 */
@@ -207,7 +190,8 @@ static void ip_expire(unsigned long arg)
207 190
208 ipq_kill(qp); 191 ipq_kill(qp);
209 192
210 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); 193 if (!(qp->q.last_in & INET_FRAG_EVICTED))
194 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
211 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); 195 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
212 196
213 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { 197 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
@@ -260,7 +244,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
260 arg.iph = iph; 244 arg.iph = iph;
261 arg.user = user; 245 arg.user = user;
262 246
263 read_lock(&ip4_frags.lock);
264 hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); 247 hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
265 248
266 q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); 249 q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
@@ -505,7 +488,6 @@ found:
505 } 488 }
506 489
507 skb_dst_drop(skb); 490 skb_dst_drop(skb);
508 inet_frag_lru_move(&qp->q);
509 return -EINPROGRESS; 491 return -EINPROGRESS;
510 492
511err: 493err:
@@ -655,9 +637,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
655 net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); 637 net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
656 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); 638 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
657 639
658 /* Start by cleaning up the memory. */
659 ip_evictor(net);
660
661 /* Lookup (or create) queue header */ 640 /* Lookup (or create) queue header */
662 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { 641 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
663 int ret; 642 int ret;
@@ -721,14 +700,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
721 .data = &init_net.ipv4.frags.high_thresh, 700 .data = &init_net.ipv4.frags.high_thresh,
722 .maxlen = sizeof(int), 701 .maxlen = sizeof(int),
723 .mode = 0644, 702 .mode = 0644,
724 .proc_handler = proc_dointvec 703 .proc_handler = proc_dointvec_minmax,
704 .extra1 = &init_net.ipv4.frags.low_thresh
725 }, 705 },
726 { 706 {
727 .procname = "ipfrag_low_thresh", 707 .procname = "ipfrag_low_thresh",
728 .data = &init_net.ipv4.frags.low_thresh, 708 .data = &init_net.ipv4.frags.low_thresh,
729 .maxlen = sizeof(int), 709 .maxlen = sizeof(int),
730 .mode = 0644, 710 .mode = 0644,
731 .proc_handler = proc_dointvec 711 .proc_handler = proc_dointvec_minmax,
712 .extra1 = &zero,
713 .extra2 = &init_net.ipv4.frags.high_thresh
732 }, 714 },
733 { 715 {
734 .procname = "ipfrag_time", 716 .procname = "ipfrag_time",
@@ -740,10 +722,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
740 { } 722 { }
741}; 723};
742 724
725/* secret interval has been deprecated */
726static int ip4_frags_secret_interval_unused;
743static struct ctl_table ip4_frags_ctl_table[] = { 727static struct ctl_table ip4_frags_ctl_table[] = {
744 { 728 {
745 .procname = "ipfrag_secret_interval", 729 .procname = "ipfrag_secret_interval",
746 .data = &ip4_frags.secret_interval, 730 .data = &ip4_frags_secret_interval_unused,
747 .maxlen = sizeof(int), 731 .maxlen = sizeof(int),
748 .mode = 0644, 732 .mode = 0644,
749 .proc_handler = proc_dointvec_jiffies, 733 .proc_handler = proc_dointvec_jiffies,
@@ -771,7 +755,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
771 goto err_alloc; 755 goto err_alloc;
772 756
773 table[0].data = &net->ipv4.frags.high_thresh; 757 table[0].data = &net->ipv4.frags.high_thresh;
758 table[0].extra1 = &net->ipv4.frags.low_thresh;
759 table[0].extra2 = &init_net.ipv4.frags.high_thresh;
774 table[1].data = &net->ipv4.frags.low_thresh; 760 table[1].data = &net->ipv4.frags.low_thresh;
761 table[1].extra2 = &net->ipv4.frags.high_thresh;
775 table[2].data = &net->ipv4.frags.timeout; 762 table[2].data = &net->ipv4.frags.timeout;
776 763
777 /* Don't export sysctls to unprivileged users */ 764 /* Don't export sysctls to unprivileged users */
@@ -873,6 +860,5 @@ void __init ipfrag_init(void)
873 ip4_frags.qsize = sizeof(struct ipq); 860 ip4_frags.qsize = sizeof(struct ipq);
874 ip4_frags.match = ip4_frag_match; 861 ip4_frags.match = ip4_frag_match;
875 ip4_frags.frag_expire = ip_expire; 862 ip4_frags.frag_expire = ip_expire;
876 ip4_frags.secret_interval = 10 * 60 * HZ;
877 inet_frags_init(&ip4_frags); 863 inet_frags_init(&ip4_frags);
878} 864}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 5e7aecea05cd..ad382499bace 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -288,6 +288,10 @@ int ip_options_compile(struct net *net,
288 optptr++; 288 optptr++;
289 continue; 289 continue;
290 } 290 }
291 if (unlikely(l < 2)) {
292 pp_ptr = optptr;
293 goto error;
294 }
291 optlen = optptr[1]; 295 optlen = optptr[1];
292 if (optlen < 2 || optlen > l) { 296 if (optlen < 2 || optlen > l) {
293 pp_ptr = optptr; 297 pp_ptr = optptr;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8d3b6b0e9857..b16556836d66 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -962,10 +962,6 @@ alloc_new_skb:
962 sk->sk_allocation); 962 sk->sk_allocation);
963 if (unlikely(skb == NULL)) 963 if (unlikely(skb == NULL))
964 err = -ENOBUFS; 964 err = -ENOBUFS;
965 else
966 /* only the initial fragment is
967 time stamped */
968 cork->tx_flags = 0;
969 } 965 }
970 if (skb == NULL) 966 if (skb == NULL)
971 goto error; 967 goto error;
@@ -976,7 +972,10 @@ alloc_new_skb:
976 skb->ip_summed = csummode; 972 skb->ip_summed = csummode;
977 skb->csum = 0; 973 skb->csum = 0;
978 skb_reserve(skb, hh_len); 974 skb_reserve(skb, hh_len);
975
976 /* only the initial fragment is time stamped */
979 skb_shinfo(skb)->tx_flags = cork->tx_flags; 977 skb_shinfo(skb)->tx_flags = cork->tx_flags;
978 cork->tx_flags = 0;
980 979
981 /* 980 /*
982 * Find where to start putting bytes. 981 * Find where to start putting bytes.
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64741b938632..5cb830c78990 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1319,7 +1319,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1319 if (sk->sk_type != SOCK_STREAM) 1319 if (sk->sk_type != SOCK_STREAM)
1320 return -ENOPROTOOPT; 1320 return -ENOPROTOOPT;
1321 1321
1322 msg.msg_control = optval; 1322 msg.msg_control = (__force void *) optval;
1323 msg.msg_controllen = len; 1323 msg.msg_controllen = len;
1324 msg.msg_flags = flags; 1324 msg.msg_flags = flags;
1325 1325
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 097b3e7c1e8f..dd8c8c765799 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -73,12 +73,7 @@ static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
73{ 73{
74 struct dst_entry *old_dst; 74 struct dst_entry *old_dst;
75 75
76 if (dst) { 76 dst_clone(dst);
77 if (dst->flags & DST_NOCACHE)
78 dst = NULL;
79 else
80 dst_clone(dst);
81 }
82 old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); 77 old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
83 dst_release(old_dst); 78 dst_release(old_dst);
84} 79}
@@ -108,13 +103,14 @@ static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
108 103
109 rcu_read_lock(); 104 rcu_read_lock();
110 dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); 105 dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
106 if (dst && !atomic_inc_not_zero(&dst->__refcnt))
107 dst = NULL;
111 if (dst) { 108 if (dst) {
112 if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { 109 if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
113 rcu_read_unlock();
114 tunnel_dst_reset(t); 110 tunnel_dst_reset(t);
115 return NULL; 111 dst_release(dst);
112 dst = NULL;
116 } 113 }
117 dst_hold(dst);
118 } 114 }
119 rcu_read_unlock(); 115 rcu_read_unlock();
120 return (struct rtable *)dst; 116 return (struct rtable *)dst;
@@ -173,6 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
173 169
174 hlist_for_each_entry_rcu(t, head, hash_node) { 170 hlist_for_each_entry_rcu(t, head, hash_node) {
175 if (remote != t->parms.iph.daddr || 171 if (remote != t->parms.iph.daddr ||
172 t->parms.iph.saddr != 0 ||
176 !(t->dev->flags & IFF_UP)) 173 !(t->dev->flags & IFF_UP))
177 continue; 174 continue;
178 175
@@ -189,10 +186,11 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
189 head = &itn->tunnels[hash]; 186 head = &itn->tunnels[hash];
190 187
191 hlist_for_each_entry_rcu(t, head, hash_node) { 188 hlist_for_each_entry_rcu(t, head, hash_node) {
192 if ((local != t->parms.iph.saddr && 189 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
193 (local != t->parms.iph.daddr || 190 (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
194 !ipv4_is_multicast(local))) || 191 continue;
195 !(t->dev->flags & IFF_UP)) 192
193 if (!(t->dev->flags & IFF_UP))
196 continue; 194 continue;
197 195
198 if (!ip_tunnel_key_match(&t->parms, flags, key)) 196 if (!ip_tunnel_key_match(&t->parms, flags, key))
@@ -209,6 +207,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
209 207
210 hlist_for_each_entry_rcu(t, head, hash_node) { 208 hlist_for_each_entry_rcu(t, head, hash_node) {
211 if (t->parms.i_key != key || 209 if (t->parms.i_key != key ||
210 t->parms.iph.saddr != 0 ||
211 t->parms.iph.daddr != 0 ||
212 !(t->dev->flags & IFF_UP)) 212 !(t->dev->flags & IFF_UP))
213 continue; 213 continue;
214 214
@@ -305,7 +305,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
305 } 305 }
306 306
307 ASSERT_RTNL(); 307 ASSERT_RTNL();
308 dev = alloc_netdev(ops->priv_size, name, ops->setup); 308 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
309 if (!dev) { 309 if (!dev) {
310 err = -ENOMEM; 310 err = -ENOMEM;
311 goto failed; 311 goto failed;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b3e86ea7b71b..5bbef4fdcb43 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -143,8 +143,6 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */
143__be32 root_server_addr = NONE; /* Address of NFS server */ 143__be32 root_server_addr = NONE; /* Address of NFS server */
144u8 root_server_path[256] = { 0, }; /* Path to mount as root */ 144u8 root_server_path[256] = { 0, }; /* Path to mount as root */
145 145
146__be32 ic_dev_xid; /* Device under configuration */
147
148/* vendor class identifier */ 146/* vendor class identifier */
149static char vendor_class_identifier[253] __initdata; 147static char vendor_class_identifier[253] __initdata;
150 148
@@ -654,6 +652,7 @@ static struct packet_type bootp_packet_type __initdata = {
654 .func = ic_bootp_recv, 652 .func = ic_bootp_recv,
655}; 653};
656 654
655static __be32 ic_dev_xid; /* Device under configuration */
657 656
658/* 657/*
659 * Initialize DHCP/BOOTP extension fields in the request. 658 * Initialize DHCP/BOOTP extension fields in the request.
@@ -1218,10 +1217,10 @@ static int __init ic_dynamic(void)
1218 get_random_bytes(&timeout, sizeof(timeout)); 1217 get_random_bytes(&timeout, sizeof(timeout));
1219 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); 1218 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
1220 for (;;) { 1219 for (;;) {
1220#ifdef IPCONFIG_BOOTP
1221 /* Track the device we are configuring */ 1221 /* Track the device we are configuring */
1222 ic_dev_xid = d->xid; 1222 ic_dev_xid = d->xid;
1223 1223
1224#ifdef IPCONFIG_BOOTP
1225 if (do_bootp && (d->able & IC_BOOTP)) 1224 if (do_bootp && (d->able & IC_BOOTP))
1226 ic_bootp_send_if(d, jiffies - start_jiffies); 1225 ic_bootp_send_if(d, jiffies - start_jiffies);
1227#endif 1226#endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 65bcaa789043..c8034587859d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -500,7 +500,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
500 else 500 else
501 sprintf(name, "pimreg%u", mrt->id); 501 sprintf(name, "pimreg%u", mrt->id);
502 502
503 dev = alloc_netdev(0, name, reg_vif_setup); 503 dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
504 504
505 if (dev == NULL) 505 if (dev == NULL)
506 return NULL; 506 return NULL;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a26ce035e3fa..fb173126f03d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,6 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT
36 36
37 If unsure, say Y. 37 If unsure, say Y.
38 38
39config NF_LOG_ARP
40 tristate "ARP packet logging"
41 default m if NETFILTER_ADVANCED=n
42 select NF_LOG_COMMON
43
44config NF_LOG_IPV4
45 tristate "IPv4 packet logging"
46 default m if NETFILTER_ADVANCED=n
47 select NF_LOG_COMMON
48
39config NF_TABLES_IPV4 49config NF_TABLES_IPV4
40 depends on NF_TABLES 50 depends on NF_TABLES
41 tristate "IPv4 nf_tables support" 51 tristate "IPv4 nf_tables support"
@@ -159,25 +169,6 @@ config IP_NF_TARGET_SYNPROXY
159 169
160 To compile it as a module, choose M here. If unsure, say N. 170 To compile it as a module, choose M here. If unsure, say N.
161 171
162config IP_NF_TARGET_ULOG
163 tristate "ULOG target support (obsolete)"
164 default m if NETFILTER_ADVANCED=n
165 ---help---
166
167 This option enables the old IPv4-only "ipt_ULOG" implementation
168 which has been obsoleted by the new "nfnetlink_log" code (see
169 CONFIG_NETFILTER_NETLINK_LOG).
170
171 This option adds a `ULOG' target, which allows you to create rules in
172 any iptables table. The packet is passed to a userspace logging
173 daemon using netlink multicast sockets; unlike the LOG target
174 which can only be viewed through syslog.
175
176 The appropriate userspace logging daemon (ulogd) may be obtained from
177 <http://www.netfilter.org/projects/ulogd/index.html>
178
179 To compile it as a module, choose M here. If unsure, say N.
180
181# NAT + specific targets: nf_conntrack 172# NAT + specific targets: nf_conntrack
182config NF_NAT_IPV4 173config NF_NAT_IPV4
183 tristate "IPv4 NAT" 174 tristate "IPv4 NAT"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 90b82405331e..245db9df3337 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -19,6 +19,10 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
19# defrag 19# defrag
20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
21 21
22# logging
23obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
24obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
25
22# NAT helpers (nf_conntrack) 26# NAT helpers (nf_conntrack)
23obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o 27obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
24obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o 28obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
deleted file mode 100644
index 9cb993cd224b..000000000000
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ /dev/null
@@ -1,498 +0,0 @@
1/*
2 * netfilter module for userspace packet logging daemons
3 *
4 * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
5 * (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 * (C) 2005-2007 Patrick McHardy <kaber@trash.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This module accepts two parameters:
14 *
15 * nlbufsiz:
16 * The parameter specifies how big the buffer for each netlink multicast
17 * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
18 * get accumulated in the kernel until they are sent to userspace. It is
19 * NOT possible to allocate more than 128kB, and it is strongly discouraged,
20 * because atomically allocating 128kB inside the network rx softirq is not
21 * reliable. Please also keep in mind that this buffer size is allocated for
22 * each nlgroup you are using, so the total kernel memory usage increases
23 * by that factor.
24 *
25 * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since
26 * nlbufsiz is used with alloc_skb, which adds another
27 * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead.
28 *
29 * flushtimeout:
30 * Specify, after how many hundredths of a second the queue should be
31 * flushed even if it is not full yet.
32 */
33#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
34#include <linux/module.h>
35#include <linux/spinlock.h>
36#include <linux/socket.h>
37#include <linux/slab.h>
38#include <linux/skbuff.h>
39#include <linux/kernel.h>
40#include <linux/timer.h>
41#include <net/netlink.h>
42#include <linux/netdevice.h>
43#include <linux/mm.h>
44#include <linux/moduleparam.h>
45#include <linux/netfilter.h>
46#include <linux/netfilter/x_tables.h>
47#include <linux/netfilter_ipv4/ipt_ULOG.h>
48#include <net/netfilter/nf_log.h>
49#include <net/netns/generic.h>
50#include <net/sock.h>
51#include <linux/bitops.h>
52#include <asm/unaligned.h>
53
54MODULE_LICENSE("GPL");
55MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
56MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG");
57MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
58
59#define ULOG_NL_EVENT 111 /* Harald's favorite number */
60#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
61
62static unsigned int nlbufsiz = NLMSG_GOODSIZE;
63module_param(nlbufsiz, uint, 0400);
64MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
65
66static unsigned int flushtimeout = 10;
67module_param(flushtimeout, uint, 0600);
68MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
69
70static bool nflog = true;
71module_param(nflog, bool, 0400);
72MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
73
74/* global data structures */
75
76typedef struct {
77 unsigned int qlen; /* number of nlmsgs' in the skb */
78 struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
79 struct sk_buff *skb; /* the pre-allocated skb */
80 struct timer_list timer; /* the timer function */
81} ulog_buff_t;
82
83static int ulog_net_id __read_mostly;
84struct ulog_net {
85 unsigned int nlgroup[ULOG_MAXNLGROUPS];
86 ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
87 struct sock *nflognl;
88 spinlock_t lock;
89};
90
91static struct ulog_net *ulog_pernet(struct net *net)
92{
93 return net_generic(net, ulog_net_id);
94}
95
96/* send one ulog_buff_t to userspace */
97static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
98{
99 ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
100
101 pr_debug("ulog_send: timer is deleting\n");
102 del_timer(&ub->timer);
103
104 if (!ub->skb) {
105 pr_debug("ulog_send: nothing to send\n");
106 return;
107 }
108
109 /* last nlmsg needs NLMSG_DONE */
110 if (ub->qlen > 1)
111 ub->lastnlh->nlmsg_type = NLMSG_DONE;
112
113 NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
114 pr_debug("throwing %d packets to netlink group %u\n",
115 ub->qlen, nlgroupnum + 1);
116 netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
117 GFP_ATOMIC);
118
119 ub->qlen = 0;
120 ub->skb = NULL;
121 ub->lastnlh = NULL;
122}
123
124
125/* timer function to flush queue in flushtimeout time */
126static void ulog_timer(unsigned long data)
127{
128 unsigned int groupnum = *((unsigned int *)data);
129 struct ulog_net *ulog = container_of((void *)data,
130 struct ulog_net,
131 nlgroup[groupnum]);
132 pr_debug("timer function called, calling ulog_send\n");
133
134 /* lock to protect against somebody modifying our structure
135 * from ipt_ulog_target at the same time */
136 spin_lock_bh(&ulog->lock);
137 ulog_send(ulog, groupnum);
138 spin_unlock_bh(&ulog->lock);
139}
140
141static struct sk_buff *ulog_alloc_skb(unsigned int size)
142{
143 struct sk_buff *skb;
144 unsigned int n;
145
146 /* alloc skb which should be big enough for a whole
147 * multipart message. WARNING: has to be <= 131000
148 * due to slab allocator restrictions */
149
150 n = max(size, nlbufsiz);
151 skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
152 if (!skb) {
153 if (n > size) {
154 /* try to allocate only as much as we need for
155 * current packet */
156
157 skb = alloc_skb(size, GFP_ATOMIC);
158 if (!skb)
159 pr_debug("cannot even allocate %ub\n", size);
160 }
161 }
162
163 return skb;
164}
165
166static void ipt_ulog_packet(struct net *net,
167 unsigned int hooknum,
168 const struct sk_buff *skb,
169 const struct net_device *in,
170 const struct net_device *out,
171 const struct ipt_ulog_info *loginfo,
172 const char *prefix)
173{
174 ulog_buff_t *ub;
175 ulog_packet_msg_t *pm;
176 size_t size, copy_len;
177 struct nlmsghdr *nlh;
178 struct timeval tv;
179 struct ulog_net *ulog = ulog_pernet(net);
180
181 /* ffs == find first bit set, necessary because userspace
182 * is already shifting groupnumber, but we need unshifted.
183 * ffs() returns [1..32], we need [0..31] */
184 unsigned int groupnum = ffs(loginfo->nl_group) - 1;
185
186 /* calculate the size of the skb needed */
187 if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len)
188 copy_len = skb->len;
189 else
190 copy_len = loginfo->copy_range;
191
192 size = nlmsg_total_size(sizeof(*pm) + copy_len);
193
194 ub = &ulog->ulog_buffers[groupnum];
195
196 spin_lock_bh(&ulog->lock);
197
198 if (!ub->skb) {
199 if (!(ub->skb = ulog_alloc_skb(size)))
200 goto alloc_failure;
201 } else if (ub->qlen >= loginfo->qthreshold ||
202 size > skb_tailroom(ub->skb)) {
203 /* either the queue len is too high or we don't have
204 * enough room in nlskb left. send it to userspace. */
205
206 ulog_send(ulog, groupnum);
207
208 if (!(ub->skb = ulog_alloc_skb(size)))
209 goto alloc_failure;
210 }
211
212 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
213
214 nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
215 sizeof(*pm)+copy_len, 0);
216 if (!nlh) {
217 pr_debug("error during nlmsg_put\n");
218 goto out_unlock;
219 }
220 ub->qlen++;
221
222 pm = nlmsg_data(nlh);
223 memset(pm, 0, sizeof(*pm));
224
225 /* We might not have a timestamp, get one */
226 if (skb->tstamp.tv64 == 0)
227 __net_timestamp((struct sk_buff *)skb);
228
229 /* copy hook, prefix, timestamp, payload, etc. */
230 pm->data_len = copy_len;
231 tv = ktime_to_timeval(skb->tstamp);
232 put_unaligned(tv.tv_sec, &pm->timestamp_sec);
233 put_unaligned(tv.tv_usec, &pm->timestamp_usec);
234 put_unaligned(skb->mark, &pm->mark);
235 pm->hook = hooknum;
236 if (prefix != NULL) {
237 strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1);
238 pm->prefix[sizeof(pm->prefix) - 1] = '\0';
239 }
240 else if (loginfo->prefix[0] != '\0')
241 strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
242
243 if (in && in->hard_header_len > 0 &&
244 skb->mac_header != skb->network_header &&
245 in->hard_header_len <= ULOG_MAC_LEN) {
246 memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
247 pm->mac_len = in->hard_header_len;
248 } else
249 pm->mac_len = 0;
250
251 if (in)
252 strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
253
254 if (out)
255 strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
256
257 /* copy_len <= skb->len, so can't fail. */
258 if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
259 BUG();
260
261 /* check if we are building multi-part messages */
262 if (ub->qlen > 1)
263 ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
264
265 ub->lastnlh = nlh;
266
267 /* if timer isn't already running, start it */
268 if (!timer_pending(&ub->timer)) {
269 ub->timer.expires = jiffies + flushtimeout * HZ / 100;
270 add_timer(&ub->timer);
271 }
272
273 /* if threshold is reached, send message to userspace */
274 if (ub->qlen >= loginfo->qthreshold) {
275 if (loginfo->qthreshold > 1)
276 nlh->nlmsg_type = NLMSG_DONE;
277 ulog_send(ulog, groupnum);
278 }
279out_unlock:
280 spin_unlock_bh(&ulog->lock);
281
282 return;
283
284alloc_failure:
285 pr_debug("Error building netlink message\n");
286 spin_unlock_bh(&ulog->lock);
287}
288
289static unsigned int
290ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
291{
292 struct net *net = dev_net(par->in ? par->in : par->out);
293
294 ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
295 par->targinfo, NULL);
296 return XT_CONTINUE;
297}
298
299static void ipt_logfn(struct net *net,
300 u_int8_t pf,
301 unsigned int hooknum,
302 const struct sk_buff *skb,
303 const struct net_device *in,
304 const struct net_device *out,
305 const struct nf_loginfo *li,
306 const char *prefix)
307{
308 struct ipt_ulog_info loginfo;
309
310 if (!li || li->type != NF_LOG_TYPE_ULOG) {
311 loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
312 loginfo.copy_range = 0;
313 loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
314 loginfo.prefix[0] = '\0';
315 } else {
316 loginfo.nl_group = li->u.ulog.group;
317 loginfo.copy_range = li->u.ulog.copy_len;
318 loginfo.qthreshold = li->u.ulog.qthreshold;
319 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
320 }
321
322 ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
323}
324
325static int ulog_tg_check(const struct xt_tgchk_param *par)
326{
327 const struct ipt_ulog_info *loginfo = par->targinfo;
328
329 if (!par->net->xt.ulog_warn_deprecated) {
330 pr_info("ULOG is deprecated and it will be removed soon, "
331 "use NFLOG instead\n");
332 par->net->xt.ulog_warn_deprecated = true;
333 }
334
335 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
336 pr_debug("prefix not null-terminated\n");
337 return -EINVAL;
338 }
339 if (loginfo->qthreshold > ULOG_MAX_QLEN) {
340 pr_debug("queue threshold %Zu > MAX_QLEN\n",
341 loginfo->qthreshold);
342 return -EINVAL;
343 }
344 return 0;
345}
346
347#ifdef CONFIG_COMPAT
348struct compat_ipt_ulog_info {
349 compat_uint_t nl_group;
350 compat_size_t copy_range;
351 compat_size_t qthreshold;
352 char prefix[ULOG_PREFIX_LEN];
353};
354
355static void ulog_tg_compat_from_user(void *dst, const void *src)
356{
357 const struct compat_ipt_ulog_info *cl = src;
358 struct ipt_ulog_info l = {
359 .nl_group = cl->nl_group,
360 .copy_range = cl->copy_range,
361 .qthreshold = cl->qthreshold,
362 };
363
364 memcpy(l.prefix, cl->prefix, sizeof(l.prefix));
365 memcpy(dst, &l, sizeof(l));
366}
367
368static int ulog_tg_compat_to_user(void __user *dst, const void *src)
369{
370 const struct ipt_ulog_info *l = src;
371 struct compat_ipt_ulog_info cl = {
372 .nl_group = l->nl_group,
373 .copy_range = l->copy_range,
374 .qthreshold = l->qthreshold,
375 };
376
377 memcpy(cl.prefix, l->prefix, sizeof(cl.prefix));
378 return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0;
379}
380#endif /* CONFIG_COMPAT */
381
382static struct xt_target ulog_tg_reg __read_mostly = {
383 .name = "ULOG",
384 .family = NFPROTO_IPV4,
385 .target = ulog_tg,
386 .targetsize = sizeof(struct ipt_ulog_info),
387 .checkentry = ulog_tg_check,
388#ifdef CONFIG_COMPAT
389 .compatsize = sizeof(struct compat_ipt_ulog_info),
390 .compat_from_user = ulog_tg_compat_from_user,
391 .compat_to_user = ulog_tg_compat_to_user,
392#endif
393 .me = THIS_MODULE,
394};
395
396static struct nf_logger ipt_ulog_logger __read_mostly = {
397 .name = "ipt_ULOG",
398 .logfn = ipt_logfn,
399 .me = THIS_MODULE,
400};
401
402static int __net_init ulog_tg_net_init(struct net *net)
403{
404 int i;
405 struct ulog_net *ulog = ulog_pernet(net);
406 struct netlink_kernel_cfg cfg = {
407 .groups = ULOG_MAXNLGROUPS,
408 };
409
410 spin_lock_init(&ulog->lock);
411 /* initialize ulog_buffers */
412 for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
413 ulog->nlgroup[i] = i;
414 setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer,
415 (unsigned long)&ulog->nlgroup[i]);
416 }
417
418 ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
419 if (!ulog->nflognl)
420 return -ENOMEM;
421
422 if (nflog)
423 nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
424
425 return 0;
426}
427
428static void __net_exit ulog_tg_net_exit(struct net *net)
429{
430 ulog_buff_t *ub;
431 int i;
432 struct ulog_net *ulog = ulog_pernet(net);
433
434 if (nflog)
435 nf_log_unset(net, &ipt_ulog_logger);
436
437 netlink_kernel_release(ulog->nflognl);
438
439 /* remove pending timers and free allocated skb's */
440 for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
441 ub = &ulog->ulog_buffers[i];
442 pr_debug("timer is deleting\n");
443 del_timer(&ub->timer);
444
445 if (ub->skb) {
446 kfree_skb(ub->skb);
447 ub->skb = NULL;
448 }
449 }
450}
451
452static struct pernet_operations ulog_tg_net_ops = {
453 .init = ulog_tg_net_init,
454 .exit = ulog_tg_net_exit,
455 .id = &ulog_net_id,
456 .size = sizeof(struct ulog_net),
457};
458
459static int __init ulog_tg_init(void)
460{
461 int ret;
462 pr_debug("init module\n");
463
464 if (nlbufsiz > 128*1024) {
465 pr_warn("Netlink buffer has to be <= 128kB\n");
466 return -EINVAL;
467 }
468
469 ret = register_pernet_subsys(&ulog_tg_net_ops);
470 if (ret)
471 goto out_pernet;
472
473 ret = xt_register_target(&ulog_tg_reg);
474 if (ret < 0)
475 goto out_target;
476
477 if (nflog)
478 nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
479
480 return 0;
481
482out_target:
483 unregister_pernet_subsys(&ulog_tg_net_ops);
484out_pernet:
485 return ret;
486}
487
488static void __exit ulog_tg_exit(void)
489{
490 pr_debug("cleanup_module\n");
491 if (nflog)
492 nf_log_unregister(&ipt_ulog_logger);
493 xt_unregister_target(&ulog_tg_reg);
494 unregister_pernet_subsys(&ulog_tg_net_ops);
495}
496
497module_init(ulog_tg_init);
498module_exit(ulog_tg_exit);
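The ULOG target above hands packets to userspace over a NETLINK_NFLOG socket, one ulog_packet_msg per netlink message. As an illustration only (not part of this patch), a minimal userspace listener might look like the sketch below; it assumes the default netlink group 1 (iptables ... -j ULOG --ulog-nlgroup 1) and the ulog_packet_msg_t layout exported by <linux/netfilter_ipv4/ipt_ULOG.h>.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>

int main(void)
{
        char buf[65536];
        struct sockaddr_nl addr = {
                .nl_family = AF_NETLINK,
                .nl_groups = 1,                 /* --ulog-nlgroup 1 */
        };
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_NFLOG);

        if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                return 1;

        for (;;) {
                int len = recv(fd, buf, sizeof(buf), 0);
                struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

                if (len <= 0)
                        break;
                for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
                        ulog_packet_msg_t *pm = NLMSG_DATA(nlh);

                        /* hook, prefix and data_len mirror what
                         * ipt_ulog_packet() filled in above. */
                        printf("hook=%u prefix=%s len=%zu\n",
                               pm->hook, pm->prefix, pm->data_len);
                }
        }
        close(fd);
        return 0;
}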
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 8127dc802865..4ce44c4bc57b 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -314,7 +314,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
314 return -ENOENT; 314 return -ENOENT;
315} 315}
316 316
317#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 317#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
318 318
319#include <linux/netfilter/nfnetlink.h> 319#include <linux/netfilter/nfnetlink.h>
320#include <linux/netfilter/nfnetlink_conntrack.h> 320#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -388,7 +388,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
388 .invert_tuple = ipv4_invert_tuple, 388 .invert_tuple = ipv4_invert_tuple,
389 .print_tuple = ipv4_print_tuple, 389 .print_tuple = ipv4_print_tuple,
390 .get_l4proto = ipv4_get_l4proto, 390 .get_l4proto = ipv4_get_l4proto,
391#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 391#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
392 .tuple_to_nlattr = ipv4_tuple_to_nlattr, 392 .tuple_to_nlattr = ipv4_tuple_to_nlattr,
393 .nlattr_tuple_size = ipv4_nlattr_tuple_size, 393 .nlattr_tuple_size = ipv4_nlattr_tuple_size,
394 .nlattr_to_tuple = ipv4_nlattr_to_tuple, 394 .nlattr_to_tuple = ipv4_nlattr_to_tuple,
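The conversion in this hunk (and the identical ones in the files below) replaces the two-macro test with IS_ENABLED() from include/linux/kconfig.h, which is true when the option is built in (=y) or built as a module (=m). A minimal sketch of the equivalence, kernel build context assumed:

#include <linux/kconfig.h>

/* IS_ENABLED(CONFIG_NF_CT_NETLINK) is non-zero for both
 * CONFIG_NF_CT_NETLINK=y and CONFIG_NF_CT_NETLINK=m, so this single
 * test matches the older
 * defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE). */
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#define CT_NETLINK_AVAILABLE 1
#else
#define CT_NETLINK_AVAILABLE 0
#endif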
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index a338dad41b7d..b91b2641adda 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -226,7 +226,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
226 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); 226 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
227} 227}
228 228
229#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 229#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
230 230
231#include <linux/netfilter/nfnetlink.h> 231#include <linux/netfilter/nfnetlink.h>
232#include <linux/netfilter/nfnetlink_conntrack.h> 232#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -408,7 +408,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
408 .error = icmp_error, 408 .error = icmp_error,
409 .destroy = NULL, 409 .destroy = NULL,
410 .me = NULL, 410 .me = NULL,
411#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 411#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
412 .tuple_to_nlattr = icmp_tuple_to_nlattr, 412 .tuple_to_nlattr = icmp_tuple_to_nlattr,
413 .nlattr_tuple_size = icmp_nlattr_tuple_size, 413 .nlattr_tuple_size = icmp_nlattr_tuple_size,
414 .nlattr_to_tuple = icmp_nlattr_to_tuple, 414 .nlattr_to_tuple = icmp_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index b8f6381c7d0b..76bd1aef257f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -17,7 +17,7 @@
17#include <linux/netfilter_bridge.h> 17#include <linux/netfilter_bridge.h>
18#include <linux/netfilter_ipv4.h> 18#include <linux/netfilter_ipv4.h>
19#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 19#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
20#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 20#if IS_ENABLED(CONFIG_NF_CONNTRACK)
21#include <net/netfilter/nf_conntrack.h> 21#include <net/netfilter/nf_conntrack.h>
22#endif 22#endif
23#include <net/netfilter/nf_conntrack_zones.h> 23#include <net/netfilter/nf_conntrack_zones.h>
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
45{ 45{
46 u16 zone = NF_CT_DEFAULT_ZONE; 46 u16 zone = NF_CT_DEFAULT_ZONE;
47 47
48#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 48#if IS_ENABLED(CONFIG_NF_CONNTRACK)
49 if (skb->nfct) 49 if (skb->nfct)
50 zone = nf_ct_zone((struct nf_conn *)skb->nfct); 50 zone = nf_ct_zone((struct nf_conn *)skb->nfct);
51#endif 51#endif
@@ -74,8 +74,8 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
74 inet->nodefrag) 74 inet->nodefrag)
75 return NF_ACCEPT; 75 return NF_ACCEPT;
76 76
77#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 77#if IS_ENABLED(CONFIG_NF_CONNTRACK)
78#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) 78#if !IS_ENABLED(CONFIG_NF_NAT)
79 /* Previously seen (loopback)? Ignore. Do this before 79 /* Previously seen (loopback)? Ignore. Do this before
80 fragment check. */ 80 fragment check. */
81 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) 81 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
new file mode 100644
index 000000000000..ccfc78db12ee
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -0,0 +1,149 @@
1/*
2 * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * Based on code from ebt_log from:
5 *
6 * Bart De Schuymer <bdschuym@pandora.be>
7 * Harald Welte <laforge@netfilter.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/skbuff.h>
17#include <linux/if_arp.h>
18#include <linux/ip.h>
19#include <net/route.h>
20
21#include <linux/netfilter.h>
22#include <linux/netfilter/xt_LOG.h>
23#include <net/netfilter/nf_log.h>
24
25static struct nf_loginfo default_loginfo = {
26 .type = NF_LOG_TYPE_LOG,
27 .u = {
28 .log = {
29 .level = 5,
30 .logflags = NF_LOG_MASK,
31 },
32 },
33};
34
35struct arppayload {
36 unsigned char mac_src[ETH_ALEN];
37 unsigned char ip_src[4];
38 unsigned char mac_dst[ETH_ALEN];
39 unsigned char ip_dst[4];
40};
41
42static void dump_arp_packet(struct nf_log_buf *m,
43 const struct nf_loginfo *info,
44 const struct sk_buff *skb, unsigned int nhoff)
45{
46 const struct arphdr *ah;
47 struct arphdr _arph;
48 const struct arppayload *ap;
49 struct arppayload _arpp;
50
51 ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
52 if (ah == NULL) {
53 nf_log_buf_add(m, "TRUNCATED");
54 return;
55 }
56 nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
57 ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
58
59 /* If it's for Ethernet and the lengths are OK, then log the ARP
60 * payload.
61 */
62 if (ah->ar_hrd != htons(1) ||
63 ah->ar_hln != ETH_ALEN ||
64 ah->ar_pln != sizeof(__be32))
65 return;
66
67 ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
68 if (ap == NULL) {
69 nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
70 skb->len - sizeof(_arph));
71 return;
72 }
73 nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
74 ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
75}
76
77void nf_log_arp_packet(struct net *net, u_int8_t pf,
78 unsigned int hooknum, const struct sk_buff *skb,
79 const struct net_device *in,
80 const struct net_device *out,
81 const struct nf_loginfo *loginfo,
82 const char *prefix)
83{
84 struct nf_log_buf *m;
85
86 /* FIXME: Disabled from containers until syslog ns is supported */
87 if (!net_eq(net, &init_net))
88 return;
89
90 m = nf_log_buf_open();
91
92 if (!loginfo)
93 loginfo = &default_loginfo;
94
95 nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
96 prefix);
97 dump_arp_packet(m, loginfo, skb, 0);
98
99 nf_log_buf_close(m);
100}
101
102static struct nf_logger nf_arp_logger __read_mostly = {
103 .name = "nf_log_arp",
104 .type = NF_LOG_TYPE_LOG,
105 .logfn = nf_log_arp_packet,
106 .me = THIS_MODULE,
107};
108
109static int __net_init nf_log_arp_net_init(struct net *net)
110{
111 nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
112 return 0;
113}
114
115static void __net_exit nf_log_arp_net_exit(struct net *net)
116{
117 nf_log_unset(net, &nf_arp_logger);
118}
119
120static struct pernet_operations nf_log_arp_net_ops = {
121 .init = nf_log_arp_net_init,
122 .exit = nf_log_arp_net_exit,
123};
124
125static int __init nf_log_arp_init(void)
126{
127 int ret;
128
129 ret = register_pernet_subsys(&nf_log_arp_net_ops);
130 if (ret < 0)
131 return ret;
132
133 nf_log_register(NFPROTO_ARP, &nf_arp_logger);
134 return 0;
135}
136
137static void __exit nf_log_arp_exit(void)
138{
139 unregister_pernet_subsys(&nf_log_arp_net_ops);
140 nf_log_unregister(&nf_arp_logger);
141}
142
143module_init(nf_log_arp_init);
144module_exit(nf_log_arp_exit);
145
146MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
147MODULE_DESCRIPTION("Netfilter ARP packet logging");
148MODULE_LICENSE("GPL");
149MODULE_ALIAS_NF_LOGGER(3, 0);
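For reference, the arppayload structure in this new file mirrors the fixed ARP body used for Ethernet/IPv4: the 8-byte arphdr is followed by sender MAC, sender IP, target MAC and target IP (20 bytes), which is why dump_arp_packet() only logs it when ar_hrd is Ethernet, ar_hln is ETH_ALEN and ar_pln is 4. A standalone size check, illustrative only:

#include <stdio.h>

#define ETH_ALEN 6      /* as in <linux/if_ether.h> */

struct arppayload {
        unsigned char mac_src[ETH_ALEN];
        unsigned char ip_src[4];
        unsigned char mac_dst[ETH_ALEN];
        unsigned char ip_dst[4];
};

int main(void)
{
        /* Ethernet/IPv4 ARP body: 6 + 4 + 6 + 4 = 20 bytes, no padding. */
        printf("arppayload = %zu bytes\n", sizeof(struct arppayload));
        return 0;
}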
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
new file mode 100644
index 000000000000..078bdca1b607
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -0,0 +1,385 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <linux/skbuff.h>
12#include <linux/if_arp.h>
13#include <linux/ip.h>
14#include <net/ipv6.h>
15#include <net/icmp.h>
16#include <net/udp.h>
17#include <net/tcp.h>
18#include <net/route.h>
19
20#include <linux/netfilter.h>
21#include <linux/netfilter/xt_LOG.h>
22#include <net/netfilter/nf_log.h>
23
24static struct nf_loginfo default_loginfo = {
25 .type = NF_LOG_TYPE_LOG,
26 .u = {
27 .log = {
28 .level = 5,
29 .logflags = NF_LOG_MASK,
30 },
31 },
32};
33
34/* One level of recursion won't kill us */
35static void dump_ipv4_packet(struct nf_log_buf *m,
36 const struct nf_loginfo *info,
37 const struct sk_buff *skb, unsigned int iphoff)
38{
39 struct iphdr _iph;
40 const struct iphdr *ih;
41 unsigned int logflags;
42
43 if (info->type == NF_LOG_TYPE_LOG)
44 logflags = info->u.log.logflags;
45 else
46 logflags = NF_LOG_MASK;
47
48 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
49 if (ih == NULL) {
50 nf_log_buf_add(m, "TRUNCATED");
51 return;
52 }
53
54 /* Important fields:
55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
57 nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
58
59 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
60 nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
61 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
62 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
63
64 /* Max length: 6 "CE DF MF " */
65 if (ntohs(ih->frag_off) & IP_CE)
66 nf_log_buf_add(m, "CE ");
67 if (ntohs(ih->frag_off) & IP_DF)
68 nf_log_buf_add(m, "DF ");
69 if (ntohs(ih->frag_off) & IP_MF)
70 nf_log_buf_add(m, "MF ");
71
72 /* Max length: 11 "FRAG:65535 " */
73 if (ntohs(ih->frag_off) & IP_OFFSET)
74 nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
75
76 if ((logflags & XT_LOG_IPOPT) &&
77 ih->ihl * 4 > sizeof(struct iphdr)) {
78 const unsigned char *op;
79 unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
80 unsigned int i, optsize;
81
82 optsize = ih->ihl * 4 - sizeof(struct iphdr);
83 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
84 optsize, _opt);
85 if (op == NULL) {
86 nf_log_buf_add(m, "TRUNCATED");
87 return;
88 }
89
90 /* Max length: 127 "OPT (" 15*4*2chars ") " */
91 nf_log_buf_add(m, "OPT (");
92 for (i = 0; i < optsize; i++)
93 nf_log_buf_add(m, "%02X", op[i]);
94 nf_log_buf_add(m, ") ");
95 }
96
97 switch (ih->protocol) {
98 case IPPROTO_TCP:
99 if (nf_log_dump_tcp_header(m, skb, ih->protocol,
100 ntohs(ih->frag_off) & IP_OFFSET,
101 iphoff+ih->ihl*4, logflags))
102 return;
103 break;
104 case IPPROTO_UDP:
105 case IPPROTO_UDPLITE:
106 if (nf_log_dump_udp_header(m, skb, ih->protocol,
107 ntohs(ih->frag_off) & IP_OFFSET,
108 iphoff+ih->ihl*4))
109 return;
110 break;
111 case IPPROTO_ICMP: {
112 struct icmphdr _icmph;
113 const struct icmphdr *ich;
114 static const size_t required_len[NR_ICMP_TYPES+1]
115 = { [ICMP_ECHOREPLY] = 4,
116 [ICMP_DEST_UNREACH]
117 = 8 + sizeof(struct iphdr),
118 [ICMP_SOURCE_QUENCH]
119 = 8 + sizeof(struct iphdr),
120 [ICMP_REDIRECT]
121 = 8 + sizeof(struct iphdr),
122 [ICMP_ECHO] = 4,
123 [ICMP_TIME_EXCEEDED]
124 = 8 + sizeof(struct iphdr),
125 [ICMP_PARAMETERPROB]
126 = 8 + sizeof(struct iphdr),
127 [ICMP_TIMESTAMP] = 20,
128 [ICMP_TIMESTAMPREPLY] = 20,
129 [ICMP_ADDRESS] = 12,
130 [ICMP_ADDRESSREPLY] = 12 };
131
132 /* Max length: 11 "PROTO=ICMP " */
133 nf_log_buf_add(m, "PROTO=ICMP ");
134
135 if (ntohs(ih->frag_off) & IP_OFFSET)
136 break;
137
138 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
139 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
140 sizeof(_icmph), &_icmph);
141 if (ich == NULL) {
142 nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
143 skb->len - iphoff - ih->ihl*4);
144 break;
145 }
146
147 /* Max length: 18 "TYPE=255 CODE=255 " */
148 nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
149
150 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
151 if (ich->type <= NR_ICMP_TYPES &&
152 required_len[ich->type] &&
153 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
154 nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
155 skb->len - iphoff - ih->ihl*4);
156 break;
157 }
158
159 switch (ich->type) {
160 case ICMP_ECHOREPLY:
161 case ICMP_ECHO:
162 /* Max length: 19 "ID=65535 SEQ=65535 " */
163 nf_log_buf_add(m, "ID=%u SEQ=%u ",
164 ntohs(ich->un.echo.id),
165 ntohs(ich->un.echo.sequence));
166 break;
167
168 case ICMP_PARAMETERPROB:
169 /* Max length: 14 "PARAMETER=255 " */
170 nf_log_buf_add(m, "PARAMETER=%u ",
171 ntohl(ich->un.gateway) >> 24);
172 break;
173 case ICMP_REDIRECT:
174 /* Max length: 24 "GATEWAY=255.255.255.255 " */
175 nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
176 /* Fall through */
177 case ICMP_DEST_UNREACH:
178 case ICMP_SOURCE_QUENCH:
179 case ICMP_TIME_EXCEEDED:
180 /* Max length: 3+maxlen */
181 if (!iphoff) { /* Only recurse once. */
182 nf_log_buf_add(m, "[");
183 dump_ipv4_packet(m, info, skb,
184 iphoff + ih->ihl*4+sizeof(_icmph));
185 nf_log_buf_add(m, "] ");
186 }
187
188 /* Max length: 10 "MTU=65535 " */
189 if (ich->type == ICMP_DEST_UNREACH &&
190 ich->code == ICMP_FRAG_NEEDED) {
191 nf_log_buf_add(m, "MTU=%u ",
192 ntohs(ich->un.frag.mtu));
193 }
194 }
195 break;
196 }
197 /* Max Length */
198 case IPPROTO_AH: {
199 struct ip_auth_hdr _ahdr;
200 const struct ip_auth_hdr *ah;
201
202 if (ntohs(ih->frag_off) & IP_OFFSET)
203 break;
204
205 /* Max length: 9 "PROTO=AH " */
206 nf_log_buf_add(m, "PROTO=AH ");
207
208 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
209 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
210 sizeof(_ahdr), &_ahdr);
211 if (ah == NULL) {
212 nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
213 skb->len - iphoff - ih->ihl*4);
214 break;
215 }
216
217 /* Length: 15 "SPI=0xF1234567 " */
218 nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
219 break;
220 }
221 case IPPROTO_ESP: {
222 struct ip_esp_hdr _esph;
223 const struct ip_esp_hdr *eh;
224
225 /* Max length: 10 "PROTO=ESP " */
226 nf_log_buf_add(m, "PROTO=ESP ");
227
228 if (ntohs(ih->frag_off) & IP_OFFSET)
229 break;
230
231 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
232 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
233 sizeof(_esph), &_esph);
234 if (eh == NULL) {
235 nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
236 skb->len - iphoff - ih->ihl*4);
237 break;
238 }
239
240 /* Length: 15 "SPI=0xF1234567 " */
241 nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
242 break;
243 }
244 /* Max length: 10 "PROTO 255 " */
245 default:
246 nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
247 }
248
249 /* Max length: 15 "UID=4294967295 " */
250 if ((logflags & XT_LOG_UID) && !iphoff)
251 nf_log_dump_sk_uid_gid(m, skb->sk);
252
253 /* Max length: 16 "MARK=0xFFFFFFFF " */
254 if (!iphoff && skb->mark)
255 nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
256
257 /* Proto Max log string length */
258 /* IP: 40+46+6+11+127 = 230 */
259 /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
260 /* UDP: 10+max(25,20) = 35 */
261 /* UDPLITE: 14+max(25,20) = 39 */
262 /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
263 /* ESP: 10+max(25)+15 = 50 */
264 /* AH: 9+max(25)+15 = 49 */
265 /* unknown: 10 */
266
267 /* (ICMP allows recursion one level deep) */
268 /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
269 /* maxlen = 230+ 91 + 230 + 252 = 803 */
270}
271
272static void dump_ipv4_mac_header(struct nf_log_buf *m,
273 const struct nf_loginfo *info,
274 const struct sk_buff *skb)
275{
276 struct net_device *dev = skb->dev;
277 unsigned int logflags = 0;
278
279 if (info->type == NF_LOG_TYPE_LOG)
280 logflags = info->u.log.logflags;
281
282 if (!(logflags & XT_LOG_MACDECODE))
283 goto fallback;
284
285 switch (dev->type) {
286 case ARPHRD_ETHER:
287 nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
288 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
289 ntohs(eth_hdr(skb)->h_proto));
290 return;
291 default:
292 break;
293 }
294
295fallback:
296 nf_log_buf_add(m, "MAC=");
297 if (dev->hard_header_len &&
298 skb->mac_header != skb->network_header) {
299 const unsigned char *p = skb_mac_header(skb);
300 unsigned int i;
301
302 nf_log_buf_add(m, "%02x", *p++);
303 for (i = 1; i < dev->hard_header_len; i++, p++)
304 nf_log_buf_add(m, ":%02x", *p);
305 }
306 nf_log_buf_add(m, " ");
307}
308
309static void nf_log_ip_packet(struct net *net, u_int8_t pf,
310 unsigned int hooknum, const struct sk_buff *skb,
311 const struct net_device *in,
312 const struct net_device *out,
313 const struct nf_loginfo *loginfo,
314 const char *prefix)
315{
316 struct nf_log_buf *m;
317
318 /* FIXME: Disabled from containers until syslog ns is supported */
319 if (!net_eq(net, &init_net))
320 return;
321
322 m = nf_log_buf_open();
323
324 if (!loginfo)
325 loginfo = &default_loginfo;
326
327 nf_log_dump_packet_common(m, pf, hooknum, skb, in,
328 out, loginfo, prefix);
329
330 if (in != NULL)
331 dump_ipv4_mac_header(m, loginfo, skb);
332
333 dump_ipv4_packet(m, loginfo, skb, 0);
334
335 nf_log_buf_close(m);
336}
337
338static struct nf_logger nf_ip_logger __read_mostly = {
339 .name = "nf_log_ipv4",
340 .type = NF_LOG_TYPE_LOG,
341 .logfn = nf_log_ip_packet,
342 .me = THIS_MODULE,
343};
344
345static int __net_init nf_log_ipv4_net_init(struct net *net)
346{
347 nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
348 return 0;
349}
350
351static void __net_exit nf_log_ipv4_net_exit(struct net *net)
352{
353 nf_log_unset(net, &nf_ip_logger);
354}
355
356static struct pernet_operations nf_log_ipv4_net_ops = {
357 .init = nf_log_ipv4_net_init,
358 .exit = nf_log_ipv4_net_exit,
359};
360
361static int __init nf_log_ipv4_init(void)
362{
363 int ret;
364
365 ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
366 if (ret < 0)
367 return ret;
368
369 nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
370 return 0;
371}
372
373static void __exit nf_log_ipv4_exit(void)
374{
375 unregister_pernet_subsys(&nf_log_ipv4_net_ops);
376 nf_log_unregister(&nf_ip_logger);
377}
378
379module_init(nf_log_ipv4_init);
380module_exit(nf_log_ipv4_exit);
381
382MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
383MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
384MODULE_LICENSE("GPL");
385MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
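dump_ipv4_packet() above decodes the 16-bit frag_off word into the CE/DF/MF flag bits and the 13-bit fragment offset; note that the FRAG: value it prints is the offset in 8-byte units, not bytes. A standalone illustration, using the constant values from <net/ip.h>:

#include <stdio.h>
#include <arpa/inet.h>

#define IP_CE           0x8000          /* congestion (historic) */
#define IP_DF           0x4000          /* don't fragment */
#define IP_MF           0x2000          /* more fragments follow */
#define IP_OFFSET       0x1FFF          /* 13-bit offset, 8-byte units */

int main(void)
{
        /* e.g. the second fragment on a 1500-byte MTU path: MF set,
         * offset 185 * 8 = 1480 bytes into the original datagram. */
        unsigned short frag_off = htons(IP_MF | 185);
        unsigned short host = ntohs(frag_off);

        printf("DF=%d MF=%d FRAG=%d (%d bytes)\n",
               !!(host & IP_DF), !!(host & IP_MF),
               host & IP_OFFSET, (host & IP_OFFSET) * 8);
        return 0;
}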
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index d8b2e14efddc..14f5ccd06337 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -154,6 +154,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
154 htons(oldlen), htons(datalen), 1); 154 htons(oldlen), htons(datalen), 1);
155} 155}
156 156
157#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
157static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], 158static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
158 struct nf_nat_range *range) 159 struct nf_nat_range *range)
159{ 160{
@@ -169,6 +170,7 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
169 170
170 return 0; 171 return 0;
171} 172}
173#endif
172 174
173static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { 175static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
174 .l3proto = NFPROTO_IPV4, 176 .l3proto = NFPROTO_IPV4,
@@ -177,7 +179,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
177 .manip_pkt = nf_nat_ipv4_manip_pkt, 179 .manip_pkt = nf_nat_ipv4_manip_pkt,
178 .csum_update = nf_nat_ipv4_csum_update, 180 .csum_update = nf_nat_ipv4_csum_update,
179 .csum_recalc = nf_nat_ipv4_csum_recalc, 181 .csum_recalc = nf_nat_ipv4_csum_recalc,
182#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
180 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range, 183 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
184#endif
181#ifdef CONFIG_XFRM 185#ifdef CONFIG_XFRM
182 .decode_session = nf_nat_ipv4_decode_session, 186 .decode_session = nf_nat_ipv4_decode_session,
183#endif 187#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 690d890111bb..9414923f1e15 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -124,7 +124,7 @@ static const struct nf_nat_l4proto gre = {
124 .manip_pkt = gre_manip_pkt, 124 .manip_pkt = gre_manip_pkt,
125 .in_range = nf_nat_l4proto_in_range, 125 .in_range = nf_nat_l4proto_in_range,
126 .unique_tuple = gre_unique_tuple, 126 .unique_tuple = gre_unique_tuple,
127#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 127#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
128 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 128 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
129#endif 129#endif
130}; 130};
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index eb303471bcf6..4557b4ab8342 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -77,7 +77,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
77 .manip_pkt = icmp_manip_pkt, 77 .manip_pkt = icmp_manip_pkt,
78 .in_range = icmp_in_range, 78 .in_range = icmp_in_range,
79 .unique_tuple = icmp_unique_tuple, 79 .unique_tuple = icmp_unique_tuple,
80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 80#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
81 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 81 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
82#endif 82#endif
83}; 83};
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ae0af9386f7c..8e3eb39f84e7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -52,6 +52,7 @@
52static int sockstat_seq_show(struct seq_file *seq, void *v) 52static int sockstat_seq_show(struct seq_file *seq, void *v)
53{ 53{
54 struct net *net = seq->private; 54 struct net *net = seq->private;
55 unsigned int frag_mem;
55 int orphans, sockets; 56 int orphans, sockets;
56 57
57 local_bh_disable(); 58 local_bh_disable();
@@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
71 sock_prot_inuse_get(net, &udplite_prot)); 72 sock_prot_inuse_get(net, &udplite_prot));
72 seq_printf(seq, "RAW: inuse %d\n", 73 seq_printf(seq, "RAW: inuse %d\n",
73 sock_prot_inuse_get(net, &raw_prot)); 74 sock_prot_inuse_get(net, &raw_prot));
74 seq_printf(seq, "FRAG: inuse %d memory %d\n", 75 frag_mem = ip_frag_mem(net);
75 ip_frag_nqueues(net), ip_frag_mem(net)); 76 seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
76 return 0; 77 return 0;
77} 78}
78 79
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c65160565e1..739db3100c23 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -58,6 +58,7 @@
58#include <linux/in_route.h> 58#include <linux/in_route.h>
59#include <linux/route.h> 59#include <linux/route.h>
60#include <linux/skbuff.h> 60#include <linux/skbuff.h>
61#include <linux/igmp.h>
61#include <net/net_namespace.h> 62#include <net/net_namespace.h>
62#include <net/dst.h> 63#include <net/dst.h>
63#include <net/sock.h> 64#include <net/sock.h>
@@ -174,7 +175,9 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
174 175
175 while (sk) { 176 while (sk) {
176 delivered = 1; 177 delivered = 1;
177 if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { 178 if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
179 ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
180 skb->dev->ifindex)) {
178 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); 181 struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
179 182
180 /* Not releasing hash table! */ 183 /* Not releasing hash table! */
@@ -365,6 +368,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
365 368
366 skb->ip_summed = CHECKSUM_NONE; 369 skb->ip_summed = CHECKSUM_NONE;
367 370
371 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
372
368 skb->transport_header = skb->network_header; 373 skb->transport_header = skb->network_header;
369 err = -EFAULT; 374 err = -EFAULT;
370 if (memcpy_fromiovecend((void *)iph, from, 0, length)) 375 if (memcpy_fromiovecend((void *)iph, from, 0, length))
@@ -606,6 +611,8 @@ back_from_confirm:
606 &rt, msg->msg_flags); 611 &rt, msg->msg_flags);
607 612
608 else { 613 else {
614 sock_tx_timestamp(sk, &ipc.tx_flags);
615
609 if (!ipc.addr) 616 if (!ipc.addr)
610 ipc.addr = fl4.daddr; 617 ipc.addr = fl4.daddr;
611 lock_sock(sk); 618 lock_sock(sk);
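The sock_tx_timestamp() calls added above wire transmit timestamping into the raw-socket send paths; userspace still has to opt in per socket. A hedged sketch of that opt-in (assumes SO_TIMESTAMPING plus the SOF_* flags from <linux/net_tstamp.h>, and CAP_NET_RAW to open the socket):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/net_tstamp.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
        int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
                    SOF_TIMESTAMPING_SOFTWARE;

        if (fd < 0) {
                perror("socket");               /* needs CAP_NET_RAW */
                return 1;
        }
        if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
                       &flags, sizeof(flags)) < 0) {
                perror("setsockopt");
                return 1;
        }
        /* Timestamps for sent packets are then read back from the
         * socket error queue (recvmsg() with MSG_ERRQUEUE) as
         * SCM_TIMESTAMPING control messages. */
        return 0;
}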
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 082239ffe34a..190199851c9a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -457,8 +457,31 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
457 return neigh_create(&arp_tbl, pkey, dev); 457 return neigh_create(&arp_tbl, pkey, dev);
458} 458}
459 459
460atomic_t *ip_idents __read_mostly; 460#define IP_IDENTS_SZ 2048u
461EXPORT_SYMBOL(ip_idents); 461struct ip_ident_bucket {
462 atomic_t id;
463 u32 stamp32;
464};
465
466static struct ip_ident_bucket *ip_idents __read_mostly;
467
468/* In order to protect privacy, we add a perturbation to identifiers
469 * if one generator is seldom used. This makes hard for an attacker
470 * to infer how many packets were sent between two points in time.
471 */
472u32 ip_idents_reserve(u32 hash, int segs)
473{
474 struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
475 u32 old = ACCESS_ONCE(bucket->stamp32);
476 u32 now = (u32)jiffies;
477 u32 delta = 0;
478
479 if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
480 delta = prandom_u32_max(now - old);
481
482 return atomic_add_return(segs + delta, &bucket->id) - segs;
483}
484EXPORT_SYMBOL(ip_idents_reserve);
462 485
463void __ip_select_ident(struct iphdr *iph, int segs) 486void __ip_select_ident(struct iphdr *iph, int segs)
464{ 487{
@@ -467,7 +490,10 @@ void __ip_select_ident(struct iphdr *iph, int segs)
467 490
468 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); 491 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
469 492
470 hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd); 493 hash = jhash_3words((__force u32)iph->daddr,
494 (__force u32)iph->saddr,
495 iph->protocol,
496 ip_idents_hashrnd);
471 id = ip_idents_reserve(hash, segs); 497 id = ip_idents_reserve(hash, segs);
472 iph->id = htons(id); 498 iph->id = htons(id);
473} 499}
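As a toy userspace model only (plain rand() instead of prandom_u32_max(), no atomics or cmpxchg), the reservation scheme introduced above works like the sketch below: when a bucket has been idle, the next identifier jumps forward by a random amount bounded by the idle time, so consecutive observed IP IDs no longer reveal how many packets were sent in between.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct ident_bucket {
        unsigned int id;
        unsigned int stamp;     /* last use, in "jiffies" */
};

/* Assumes now >= stamp; returns the first identifier of the range. */
static unsigned int ident_reserve(struct ident_bucket *b,
                                  unsigned int now, unsigned int segs)
{
        unsigned int delta = 0;

        if (b->stamp != now) {
                delta = rand() % (now - b->stamp);      /* perturbation */
                b->stamp = now;
        }
        b->id += segs + delta;
        return b->id - segs;
}

int main(void)
{
        struct ident_bucket b = { .id = 0, .stamp = 0 };

        srand((unsigned int)time(NULL));
        printf("t=100: %u\n", ident_reserve(&b, 100, 1));
        printf("t=100: %u\n", ident_reserve(&b, 100, 1)); /* previous + 1 */
        printf("t=600: %u\n", ident_reserve(&b, 600, 1)); /* + random gap */
        return 0;
}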
@@ -1010,7 +1036,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1010 const struct iphdr *iph = (const struct iphdr *) skb->data; 1036 const struct iphdr *iph = (const struct iphdr *) skb->data;
1011 struct flowi4 fl4; 1037 struct flowi4 fl4;
1012 struct rtable *rt; 1038 struct rtable *rt;
1013 struct dst_entry *dst; 1039 struct dst_entry *odst = NULL;
1014 bool new = false; 1040 bool new = false;
1015 1041
1016 bh_lock_sock(sk); 1042 bh_lock_sock(sk);
@@ -1018,16 +1044,17 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1018 if (!ip_sk_accept_pmtu(sk)) 1044 if (!ip_sk_accept_pmtu(sk))
1019 goto out; 1045 goto out;
1020 1046
1021 rt = (struct rtable *) __sk_dst_get(sk); 1047 odst = sk_dst_get(sk);
1022 1048
1023 if (sock_owned_by_user(sk) || !rt) { 1049 if (sock_owned_by_user(sk) || !odst) {
1024 __ipv4_sk_update_pmtu(skb, sk, mtu); 1050 __ipv4_sk_update_pmtu(skb, sk, mtu);
1025 goto out; 1051 goto out;
1026 } 1052 }
1027 1053
1028 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1054 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1029 1055
1030 if (!__sk_dst_check(sk, 0)) { 1056 rt = (struct rtable *)odst;
1057 if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
1031 rt = ip_route_output_flow(sock_net(sk), &fl4, sk); 1058 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1032 if (IS_ERR(rt)) 1059 if (IS_ERR(rt))
1033 goto out; 1060 goto out;
@@ -1037,8 +1064,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1037 1064
1038 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); 1065 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1039 1066
1040 dst = dst_check(&rt->dst, 0); 1067 if (!dst_check(&rt->dst, 0)) {
1041 if (!dst) {
1042 if (new) 1068 if (new)
1043 dst_release(&rt->dst); 1069 dst_release(&rt->dst);
1044 1070
@@ -1050,10 +1076,11 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1050 } 1076 }
1051 1077
1052 if (new) 1078 if (new)
1053 __sk_dst_set(sk, &rt->dst); 1079 sk_dst_set(sk, &rt->dst);
1054 1080
1055out: 1081out:
1056 bh_unlock_sock(sk); 1082 bh_unlock_sock(sk);
1083 dst_release(odst);
1057} 1084}
1058EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); 1085EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1059 1086
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c86624b36a62..c0c75688896e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
170} 170}
171EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); 171EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
172 172
173__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) 173__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
174 __u16 *mssp)
174{ 175{
175 const struct iphdr *iph = ip_hdr(skb); 176 const struct iphdr *iph = ip_hdr(skb);
176 const struct tcphdr *th = tcp_hdr(skb); 177 const struct tcphdr *th = tcp_hdr(skb);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index eb1dde37e678..9d2118e5fbc7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1108,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1108 if (unlikely(tp->repair)) { 1108 if (unlikely(tp->repair)) {
1109 if (tp->repair_queue == TCP_RECV_QUEUE) { 1109 if (tp->repair_queue == TCP_RECV_QUEUE) {
1110 copied = tcp_send_rcvq(sk, msg, size); 1110 copied = tcp_send_rcvq(sk, msg, size);
1111 goto out; 1111 goto out_nopush;
1112 } 1112 }
1113 1113
1114 err = -EINVAL; 1114 err = -EINVAL;
@@ -1282,6 +1282,7 @@ wait_for_memory:
1282out: 1282out:
1283 if (copied) 1283 if (copied)
1284 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 1284 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1285out_nopush:
1285 release_sock(sk); 1286 release_sock(sk);
1286 return copied + copied_syn; 1287 return copied + copied_syn;
1287 1288
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 62e48cf84e60..9771563ab564 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -131,7 +131,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
131 struct dst_entry *dst, 131 struct dst_entry *dst,
132 struct request_sock *req) 132 struct request_sock *req)
133{ 133{
134 struct tcp_sock *tp = tcp_sk(sk); 134 struct tcp_sock *tp;
135 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; 135 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
136 struct sock *child; 136 struct sock *child;
137 137
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 40661fc1e233..7832d941dbcd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1106,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1106 } 1106 }
1107 1107
1108 /* D-SACK for already forgotten data... Do dumb counting. */ 1108 /* D-SACK for already forgotten data... Do dumb counting. */
1109 if (dup_sack && tp->undo_marker && tp->undo_retrans && 1109 if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1110 !after(end_seq_0, prior_snd_una) && 1110 !after(end_seq_0, prior_snd_una) &&
1111 after(end_seq_0, tp->undo_marker)) 1111 after(end_seq_0, tp->undo_marker))
1112 tp->undo_retrans--; 1112 tp->undo_retrans--;
@@ -1162,7 +1162,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1162 unsigned int new_len = (pkt_len / mss) * mss; 1162 unsigned int new_len = (pkt_len / mss) * mss;
1163 if (!in_sack && new_len < pkt_len) { 1163 if (!in_sack && new_len < pkt_len) {
1164 new_len += mss; 1164 new_len += mss;
1165 if (new_len > skb->len) 1165 if (new_len >= skb->len)
1166 return 0; 1166 return 0;
1167 } 1167 }
1168 pkt_len = new_len; 1168 pkt_len = new_len;
@@ -1187,7 +1187,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
1187 1187
1188 /* Account D-SACK for retransmitted packet. */ 1188 /* Account D-SACK for retransmitted packet. */
1189 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1189 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1190 if (tp->undo_marker && tp->undo_retrans && 1190 if (tp->undo_marker && tp->undo_retrans > 0 &&
1191 after(end_seq, tp->undo_marker)) 1191 after(end_seq, tp->undo_marker))
1192 tp->undo_retrans--; 1192 tp->undo_retrans--;
1193 if (sacked & TCPCB_SACKED_ACKED) 1193 if (sacked & TCPCB_SACKED_ACKED)
@@ -1893,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp)
1893 tp->lost_out = 0; 1893 tp->lost_out = 0;
1894 1894
1895 tp->undo_marker = 0; 1895 tp->undo_marker = 0;
1896 tp->undo_retrans = 0; 1896 tp->undo_retrans = -1;
1897} 1897}
1898 1898
1899void tcp_clear_retrans(struct tcp_sock *tp) 1899void tcp_clear_retrans(struct tcp_sock *tp)
@@ -2475,7 +2475,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2475 * losses and/or application stalls), do not perform any further cwnd 2475 * losses and/or application stalls), do not perform any further cwnd
2476 * reductions, but instead slow start up to ssthresh. 2476 * reductions, but instead slow start up to ssthresh.
2477 */ 2477 */
2478static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) 2478static void tcp_init_cwnd_reduction(struct sock *sk)
2479{ 2479{
2480 struct tcp_sock *tp = tcp_sk(sk); 2480 struct tcp_sock *tp = tcp_sk(sk);
2481 2481
@@ -2485,8 +2485,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2485 tp->prior_cwnd = tp->snd_cwnd; 2485 tp->prior_cwnd = tp->snd_cwnd;
2486 tp->prr_delivered = 0; 2486 tp->prr_delivered = 0;
2487 tp->prr_out = 0; 2487 tp->prr_out = 0;
2488 if (set_ssthresh) 2488 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2489 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2490 TCP_ECN_queue_cwr(tp); 2489 TCP_ECN_queue_cwr(tp);
2491} 2490}
2492 2491
@@ -2528,14 +2527,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
2528} 2527}
2529 2528
2530/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ 2529/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2531void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) 2530void tcp_enter_cwr(struct sock *sk)
2532{ 2531{
2533 struct tcp_sock *tp = tcp_sk(sk); 2532 struct tcp_sock *tp = tcp_sk(sk);
2534 2533
2535 tp->prior_ssthresh = 0; 2534 tp->prior_ssthresh = 0;
2536 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2535 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2537 tp->undo_marker = 0; 2536 tp->undo_marker = 0;
2538 tcp_init_cwnd_reduction(sk, set_ssthresh); 2537 tcp_init_cwnd_reduction(sk);
2539 tcp_set_ca_state(sk, TCP_CA_CWR); 2538 tcp_set_ca_state(sk, TCP_CA_CWR);
2540 } 2539 }
2541} 2540}
@@ -2564,7 +2563,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
2564 tp->retrans_stamp = 0; 2563 tp->retrans_stamp = 0;
2565 2564
2566 if (flag & FLAG_ECE) 2565 if (flag & FLAG_ECE)
2567 tcp_enter_cwr(sk, 1); 2566 tcp_enter_cwr(sk);
2568 2567
2569 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2568 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2570 tcp_try_keep_open(sk); 2569 tcp_try_keep_open(sk);
@@ -2665,12 +2664,12 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2665 2664
2666 tp->prior_ssthresh = 0; 2665 tp->prior_ssthresh = 0;
2667 tp->undo_marker = tp->snd_una; 2666 tp->undo_marker = tp->snd_una;
2668 tp->undo_retrans = tp->retrans_out; 2667 tp->undo_retrans = tp->retrans_out ? : -1;
2669 2668
2670 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2669 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2671 if (!ece_ack) 2670 if (!ece_ack)
2672 tp->prior_ssthresh = tcp_current_ssthresh(sk); 2671 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2673 tcp_init_cwnd_reduction(sk, true); 2672 tcp_init_cwnd_reduction(sk);
2674 } 2673 }
2675 tcp_set_ca_state(sk, TCP_CA_Recovery); 2674 tcp_set_ca_state(sk, TCP_CA_Recovery);
2676} 2675}
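A side note on the undo_retrans changes in these hunks: -1 now acts as a "no retransmission recorded yet" sentinel (set in tcp_clear_retrans_partial() and via the tp->retrans_out ? : -1 assignment above), and the D-SACK accounting only decrements the counter while it is positive. The ? : form is GCC's shorthand for repeating the tested expression, as the small illustration below shows.

#include <stdio.h>

int main(void)
{
        int retrans_out = 0;
        /* GCC extension: "a ? : b" means "a ? a : b". */
        int undo_retrans = retrans_out ? : -1;

        printf("undo_retrans = %d\n", undo_retrans);    /* -1, nothing to undo */

        retrans_out = 3;
        undo_retrans = retrans_out ? : -1;
        printf("undo_retrans = %d\n", undo_retrans);    /* 3 */
        return 0;
}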
@@ -3346,7 +3345,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3346 tp->tlp_high_seq = 0; 3345 tp->tlp_high_seq = 0;
3347 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ 3346 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
3348 if (!(flag & FLAG_DSACKING_ACK)) { 3347 if (!(flag & FLAG_DSACKING_ACK)) {
3349 tcp_init_cwnd_reduction(sk, true); 3348 tcp_init_cwnd_reduction(sk);
3350 tcp_set_ca_state(sk, TCP_CA_CWR); 3349 tcp_set_ca_state(sk, TCP_CA_CWR);
3351 tcp_end_cwnd_reduction(sk); 3350 tcp_end_cwnd_reduction(sk);
3352 tcp_try_keep_open(sk); 3351 tcp_try_keep_open(sk);
@@ -5877,3 +5876,153 @@ discard:
5877 return 0; 5876 return 0;
5878} 5877}
5879EXPORT_SYMBOL(tcp_rcv_state_process); 5878EXPORT_SYMBOL(tcp_rcv_state_process);
5879
5880static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
5881{
5882 struct inet_request_sock *ireq = inet_rsk(req);
5883
5884 if (family == AF_INET)
5885 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
5886 &ireq->ir_rmt_addr, port);
5887#if IS_ENABLED(CONFIG_IPV6)
5888 else if (family == AF_INET6)
5889 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
5890 &ireq->ir_v6_rmt_addr, port);
5891#endif
5892}
5893
5894int tcp_conn_request(struct request_sock_ops *rsk_ops,
5895 const struct tcp_request_sock_ops *af_ops,
5896 struct sock *sk, struct sk_buff *skb)
5897{
5898 struct tcp_options_received tmp_opt;
5899 struct request_sock *req;
5900 struct tcp_sock *tp = tcp_sk(sk);
5901 struct dst_entry *dst = NULL;
5902 __u32 isn = TCP_SKB_CB(skb)->when;
5903 bool want_cookie = false, fastopen;
5904 struct flowi fl;
5905 struct tcp_fastopen_cookie foc = { .len = -1 };
5906 int err;
5907
5908
5909 /* TW buckets are converted to open requests without
5910 * limitations, they conserve resources and peer is
5911 * evidently real one.
5912 */
5913 if ((sysctl_tcp_syncookies == 2 ||
5914 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
5915 want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
5916 if (!want_cookie)
5917 goto drop;
5918 }
5919
5920
5921 /* Accept backlog is full. If we have already queued enough
5922 * of warm entries in syn queue, drop request. It is better than
5923 * clogging syn queue with openreqs with exponentially increasing
5924 * timeout.
5925 */
5926 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
5927 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
5928 goto drop;
5929 }
5930
5931 req = inet_reqsk_alloc(rsk_ops);
5932 if (!req)
5933 goto drop;
5934
5935 tcp_rsk(req)->af_specific = af_ops;
5936
5937 tcp_clear_options(&tmp_opt);
5938 tmp_opt.mss_clamp = af_ops->mss_clamp;
5939 tmp_opt.user_mss = tp->rx_opt.user_mss;
5940 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
5941
5942 if (want_cookie && !tmp_opt.saw_tstamp)
5943 tcp_clear_options(&tmp_opt);
5944
5945 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
5946 tcp_openreq_init(req, &tmp_opt, skb, sk);
5947
5948 af_ops->init_req(req, sk, skb);
5949
5950 if (security_inet_conn_request(sk, skb, req))
5951 goto drop_and_free;
5952
5953 if (!want_cookie || tmp_opt.tstamp_ok)
5954 TCP_ECN_create_request(req, skb, sock_net(sk));
5955
5956 if (want_cookie) {
5957 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
5958 req->cookie_ts = tmp_opt.tstamp_ok;
5959 } else if (!isn) {
5960 /* VJ's idea. We save last timestamp seen
5961 * from the destination in peer table, when entering
5962 * state TIME-WAIT, and check against it before
5963 * accepting new connection request.
5964 *
5965 * If "isn" is not zero, this request hit alive
5966 * timewait bucket, so that all the necessary checks
5967 * are made in the function processing timewait state.
5968 */
5969 if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
5970 bool strict;
5971
5972 dst = af_ops->route_req(sk, &fl, req, &strict);
5973 if (dst && strict &&
5974 !tcp_peer_is_proven(req, dst, true)) {
5975 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
5976 goto drop_and_release;
5977 }
5978 }
5979 /* Kill the following clause, if you dislike this way. */
5980 else if (!sysctl_tcp_syncookies &&
5981 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
5982 (sysctl_max_syn_backlog >> 2)) &&
5983 !tcp_peer_is_proven(req, dst, false)) {
5984 /* Without syncookies last quarter of
5985 * backlog is filled with destinations,
5986 * proven to be alive.
5987 * It means that we continue to communicate
5988 * to destinations, already remembered
5989 * to the moment of synflood.
5990 */
5991 pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
5992 rsk_ops->family);
5993 goto drop_and_release;
5994 }
5995
5996 isn = af_ops->init_seq(skb);
5997 }
5998 if (!dst) {
5999 dst = af_ops->route_req(sk, &fl, req, NULL);
6000 if (!dst)
6001 goto drop_and_free;
6002 }
6003
6004 tcp_rsk(req)->snt_isn = isn;
6005 tcp_openreq_init_rwin(req, sk, dst);
6006 fastopen = !want_cookie &&
6007 tcp_try_fastopen(sk, skb, req, &foc, dst);
6008 err = af_ops->send_synack(sk, dst, &fl, req,
6009 skb_get_queue_mapping(skb), &foc);
6010 if (!fastopen) {
6011 if (err || want_cookie)
6012 goto drop_and_free;
6013
6014 tcp_rsk(req)->listener = NULL;
6015 af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6016 }
6017
6018 return 0;
6019
6020drop_and_release:
6021 dst_release(dst);
6022drop_and_free:
6023 reqsk_free(req);
6024drop:
6025 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6026 return 0;
6027}
6028EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 180336d47df6..1edc739b9da5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -99,7 +99,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
99struct inet_hashinfo tcp_hashinfo; 99struct inet_hashinfo tcp_hashinfo;
100EXPORT_SYMBOL(tcp_hashinfo); 100EXPORT_SYMBOL(tcp_hashinfo);
101 101
102static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) 102static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
103{ 103{
104 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 104 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
105 ip_hdr(skb)->saddr, 105 ip_hdr(skb)->saddr,
@@ -208,6 +208,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
208 inet->inet_dport = usin->sin_port; 208 inet->inet_dport = usin->sin_port;
209 inet->inet_daddr = daddr; 209 inet->inet_daddr = daddr;
210 210
211 inet_set_txhash(sk);
212
211 inet_csk(sk)->icsk_ext_hdr_len = 0; 213 inet_csk(sk)->icsk_ext_hdr_len = 0;
212 if (inet_opt) 214 if (inet_opt)
213 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 215 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
@@ -814,6 +816,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
814 * socket. 816 * socket.
815 */ 817 */
816static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 818static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
819 struct flowi *fl,
817 struct request_sock *req, 820 struct request_sock *req,
818 u16 queue_mapping, 821 u16 queue_mapping,
819 struct tcp_fastopen_cookie *foc) 822 struct tcp_fastopen_cookie *foc)
@@ -837,24 +840,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
837 ireq->ir_rmt_addr, 840 ireq->ir_rmt_addr,
838 ireq->opt); 841 ireq->opt);
839 err = net_xmit_eval(err); 842 err = net_xmit_eval(err);
840 if (!tcp_rsk(req)->snt_synack && !err)
841 tcp_rsk(req)->snt_synack = tcp_time_stamp;
842 } 843 }
843 844
844 return err; 845 return err;
845} 846}
846 847
847static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
848{
849 int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
850
851 if (!res) {
852 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
853 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
854 }
855 return res;
856}
857
858/* 848/*
859 * IPv4 request_sock destructor. 849 * IPv4 request_sock destructor.
860 */ 850 */
@@ -1237,160 +1227,68 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1237 1227
1238#endif 1228#endif
1239 1229
1230static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1231 struct sk_buff *skb)
1232{
1233 struct inet_request_sock *ireq = inet_rsk(req);
1234
1235 ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1236 ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1237 ireq->no_srccheck = inet_sk(sk)->transparent;
1238 ireq->opt = tcp_v4_save_options(skb);
1239}
1240
1241static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1242 const struct request_sock *req,
1243 bool *strict)
1244{
1245 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1246
1247 if (strict) {
1248 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1249 *strict = true;
1250 else
1251 *strict = false;
1252 }
1253
1254 return dst;
1255}
1256
1240struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1257struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1241 .family = PF_INET, 1258 .family = PF_INET,
1242 .obj_size = sizeof(struct tcp_request_sock), 1259 .obj_size = sizeof(struct tcp_request_sock),
1243 .rtx_syn_ack = tcp_v4_rtx_synack, 1260 .rtx_syn_ack = tcp_rtx_synack,
1244 .send_ack = tcp_v4_reqsk_send_ack, 1261 .send_ack = tcp_v4_reqsk_send_ack,
1245 .destructor = tcp_v4_reqsk_destructor, 1262 .destructor = tcp_v4_reqsk_destructor,
1246 .send_reset = tcp_v4_send_reset, 1263 .send_reset = tcp_v4_send_reset,
1247 .syn_ack_timeout = tcp_syn_ack_timeout, 1264 .syn_ack_timeout = tcp_syn_ack_timeout,
1248}; 1265};
1249 1266
1250#ifdef CONFIG_TCP_MD5SIG
1251static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1267static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1268 .mss_clamp = TCP_MSS_DEFAULT,
1269#ifdef CONFIG_TCP_MD5SIG
1252 .md5_lookup = tcp_v4_reqsk_md5_lookup, 1270 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1253 .calc_md5_hash = tcp_v4_md5_hash_skb, 1271 .calc_md5_hash = tcp_v4_md5_hash_skb,
1254};
1255#endif 1272#endif
1273 .init_req = tcp_v4_init_req,
1274#ifdef CONFIG_SYN_COOKIES
1275 .cookie_init_seq = cookie_v4_init_sequence,
1276#endif
1277 .route_req = tcp_v4_route_req,
1278 .init_seq = tcp_v4_init_sequence,
1279 .send_synack = tcp_v4_send_synack,
1280 .queue_hash_add = inet_csk_reqsk_queue_hash_add,
1281};
1256 1282
1257int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1283int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1258{ 1284{
1259 struct tcp_options_received tmp_opt;
1260 struct request_sock *req;
1261 struct inet_request_sock *ireq;
1262 struct tcp_sock *tp = tcp_sk(sk);
1263 struct dst_entry *dst = NULL;
1264 __be32 saddr = ip_hdr(skb)->saddr;
1265 __be32 daddr = ip_hdr(skb)->daddr;
1266 __u32 isn = TCP_SKB_CB(skb)->when;
1267 bool want_cookie = false, fastopen;
1268 struct flowi4 fl4;
1269 struct tcp_fastopen_cookie foc = { .len = -1 };
1270 int err;
1271
1272 /* Never answer to SYNs send to broadcast or multicast */ 1285 /* Never answer to SYNs send to broadcast or multicast */
1273 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1286 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1274 goto drop; 1287 goto drop;
1275 1288
1276 /* TW buckets are converted to open requests without 1289 return tcp_conn_request(&tcp_request_sock_ops,
1277 * limitations, they conserve resources and peer is 1290 &tcp_request_sock_ipv4_ops, sk, skb);
1278 * evidently real one.
1279 */
1280 if ((sysctl_tcp_syncookies == 2 ||
1281 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
1282 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1283 if (!want_cookie)
1284 goto drop;
1285 }
1286
1287 /* Accept backlog is full. If we have already queued enough
1288 * of warm entries in syn queue, drop request. It is better than
1289 * clogging syn queue with openreqs with exponentially increasing
1290 * timeout.
1291 */
1292 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1293 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1294 goto drop;
1295 }
1296
1297 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1298 if (!req)
1299 goto drop;
1300
1301#ifdef CONFIG_TCP_MD5SIG
1302 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1303#endif
1304
1305 tcp_clear_options(&tmp_opt);
1306 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1307 tmp_opt.user_mss = tp->rx_opt.user_mss;
1308 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1309
1310 if (want_cookie && !tmp_opt.saw_tstamp)
1311 tcp_clear_options(&tmp_opt);
1312
1313 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1314 tcp_openreq_init(req, &tmp_opt, skb, sk);
1315
1316 ireq = inet_rsk(req);
1317 ireq->ir_loc_addr = daddr;
1318 ireq->ir_rmt_addr = saddr;
1319 ireq->no_srccheck = inet_sk(sk)->transparent;
1320 ireq->opt = tcp_v4_save_options(skb);
1321 1291
1322 if (security_inet_conn_request(sk, skb, req))
1323 goto drop_and_free;
1324
1325 if (!want_cookie || tmp_opt.tstamp_ok)
1326 TCP_ECN_create_request(req, skb, sock_net(sk));
1327
1328 if (want_cookie) {
1329 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1330 req->cookie_ts = tmp_opt.tstamp_ok;
1331 } else if (!isn) {
1332 /* VJ's idea. We save last timestamp seen
1333 * from the destination in peer table, when entering
1334 * state TIME-WAIT, and check against it before
1335 * accepting new connection request.
1336 *
1337 * If "isn" is not zero, this request hit alive
1338 * timewait bucket, so that all the necessary checks
1339 * are made in the function processing timewait state.
1340 */
1341 if (tmp_opt.saw_tstamp &&
1342 tcp_death_row.sysctl_tw_recycle &&
1343 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1344 fl4.daddr == saddr) {
1345 if (!tcp_peer_is_proven(req, dst, true)) {
1346 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1347 goto drop_and_release;
1348 }
1349 }
1350 /* Kill the following clause, if you dislike this way. */
1351 else if (!sysctl_tcp_syncookies &&
1352 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1353 (sysctl_max_syn_backlog >> 2)) &&
1354 !tcp_peer_is_proven(req, dst, false)) {
1355 /* Without syncookies last quarter of
1356 * backlog is filled with destinations,
1357 * proven to be alive.
1358 * It means that we continue to communicate
1359 * to destinations, already remembered
1360 * to the moment of synflood.
1361 */
1362 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1363 &saddr, ntohs(tcp_hdr(skb)->source));
1364 goto drop_and_release;
1365 }
1366
1367 isn = tcp_v4_init_sequence(skb);
1368 }
1369 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1370 goto drop_and_free;
1371
1372 tcp_rsk(req)->snt_isn = isn;
1373 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1374 tcp_openreq_init_rwin(req, sk, dst);
1375 fastopen = !want_cookie &&
1376 tcp_try_fastopen(sk, skb, req, &foc, dst);
1377 err = tcp_v4_send_synack(sk, dst, req,
1378 skb_get_queue_mapping(skb), &foc);
1379 if (!fastopen) {
1380 if (err || want_cookie)
1381 goto drop_and_free;
1382
1383 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1384 tcp_rsk(req)->listener = NULL;
1385 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1386 }
1387
1388 return 0;
1389
1390drop_and_release:
1391 dst_release(dst);
1392drop_and_free:
1393 reqsk_free(req);
1394drop: 1292drop:
1395 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1293 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1396 return 0; 1294 return 0;
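The block removed above (the open-coded body of tcp_v4_conn_request) carries the listener's admission-control logic: drop when the accept backlog is full and the SYN queue already holds enough young entries, fall back to syncookies under SYN-flood pressure, and, without syncookies, reserve the last quarter of the SYN backlog for destinations already proven alive. A minimal userland C sketch of that last heuristic, with illustrative names standing in for sysctl_max_syn_backlog, inet_csk_reqsk_queue_len(), sysctl_tcp_syncookies and tcp_peer_is_proven():

#include <stdbool.h>
#include <stdio.h>

/* Mirror of the "last quarter of the backlog" check in the removed block. */
static bool drop_unproven_peer(int max_syn_backlog, int queue_len,
			       bool syncookies, bool peer_proven)
{
	return !syncookies &&
	       max_syn_backlog - queue_len < (max_syn_backlog >> 2) &&
	       !peer_proven;
}

int main(void)
{
	/* 200 of 256 slots used: only 56 remain (< 64), so an unproven
	 * destination is dropped; at 100 used it is still accepted. */
	printf("%d\n", drop_unproven_peer(256, 200, false, false)); /* 1 */
	printf("%d\n", drop_unproven_peer(256, 100, false, false)); /* 0 */
	return 0;
}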
@@ -1438,6 +1336,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1438 newinet->mc_ttl = ip_hdr(skb)->ttl; 1336 newinet->mc_ttl = ip_hdr(skb)->ttl;
1439 newinet->rcv_tos = ip_hdr(skb)->tos; 1337 newinet->rcv_tos = ip_hdr(skb)->tos;
1440 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1338 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1339 inet_set_txhash(newsk);
1441 if (inet_opt) 1340 if (inet_opt)
1442 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1341 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1443 newinet->inet_id = newtp->write_seq ^ jiffies; 1342 newinet->inet_id = newtp->write_seq ^ jiffies;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e68e0d4af6c9..1649988bd1b6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -298,7 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
298 tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; 298 tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
299 tw->tw_tclass = np->tclass; 299 tw->tw_tclass = np->tclass;
300 tw->tw_flowlabel = np->flow_label >> 12; 300 tw->tw_flowlabel = np->flow_label >> 12;
301 tw->tw_ipv6only = np->ipv6only; 301 tw->tw_ipv6only = sk->sk_ipv6only;
302 } 302 }
303#endif 303#endif
304 304
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 4e86c59ec7f7..55046ecd083e 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -309,7 +309,7 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
309 309
310 th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, 310 th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
311 iph->daddr, 0); 311 iph->daddr, 0);
312 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; 312 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
313 313
314 return tcp_gro_complete(skb); 314 return tcp_gro_complete(skb);
315} 315}
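The tcp_offload.c change turns the gso_type assignment into an OR: when the TCP segment was reassembled through an encapsulation path, earlier layers have already set tunnel GSO bits, and a plain '=' would wipe them out. A trivial, self-contained illustration of the difference (the flag values below are placeholders, not the kernel's):

#include <assert.h>

#define GSO_TCPV4      0x01	/* placeholder values for illustration */
#define GSO_UDP_TUNNEL 0x20

int main(void)
{
	unsigned int gso_type = GSO_UDP_TUNNEL;	/* set by the encap GRO layer */

	gso_type |= GSO_TCPV4;			/* '=' here would drop the tunnel bit */

	assert(gso_type & GSO_UDP_TUNNEL);
	assert(gso_type & GSO_TCPV4);
	return 0;
}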
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d92bce0ea24e..8fcfc91964ec 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -916,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
916 skb_orphan(skb); 916 skb_orphan(skb);
917 skb->sk = sk; 917 skb->sk = sk;
918 skb->destructor = tcp_wfree; 918 skb->destructor = tcp_wfree;
919 skb_set_hash_from_sk(skb, sk);
919 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 920 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
920 921
921 /* Build TCP header and checksum it. */ 922 /* Build TCP header and checksum it. */
@@ -978,7 +979,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
978 if (likely(err <= 0)) 979 if (likely(err <= 0))
979 return err; 980 return err;
980 981
981 tcp_enter_cwr(sk, 1); 982 tcp_enter_cwr(sk);
982 983
983 return net_xmit_eval(err); 984 return net_xmit_eval(err);
984} 985}
@@ -2525,8 +2526,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2525 if (!tp->retrans_stamp) 2526 if (!tp->retrans_stamp)
2526 tp->retrans_stamp = TCP_SKB_CB(skb)->when; 2527 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
2527 2528
2528 tp->undo_retrans += tcp_skb_pcount(skb);
2529
2530 /* snd_nxt is stored to detect loss of retransmitted segment, 2529 /* snd_nxt is stored to detect loss of retransmitted segment,
2531 * see tcp_input.c tcp_sacktag_write_queue(). 2530 * see tcp_input.c tcp_sacktag_write_queue().
2532 */ 2531 */
@@ -2534,6 +2533,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2534 } else if (err != -EBUSY) { 2533 } else if (err != -EBUSY) {
2535 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); 2534 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2536 } 2535 }
2536
2537 if (tp->undo_retrans < 0)
2538 tp->undo_retrans = 0;
2539 tp->undo_retrans += tcp_skb_pcount(skb);
2537 return err; 2540 return err;
2538} 2541}
2539 2542
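The tcp_output.c hunk above also moves the point where a retransmission is charged to undo_retrans: the counter is bumped only after the segment was actually queued, and is clamped back to zero first if earlier processing (e.g. DSACKs) drove it negative. A small standalone sketch of that accounting, independent of the kernel structures:

#include <assert.h>

/* Hypothetical helper mirroring the reordered accounting above: clamp a
 * gone-negative undo counter before charging the new retransmission. */
static void charge_retrans(int *undo_retrans, int pcount)
{
	if (*undo_retrans < 0)
		*undo_retrans = 0;
	*undo_retrans += pcount;
}

int main(void)
{
	int undo_retrans = -2;	/* e.g. after DSACKs decremented it */

	charge_retrans(&undo_retrans, 1);
	assert(undo_retrans == 1);
	return 0;
}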
@@ -3299,3 +3302,18 @@ void tcp_send_probe0(struct sock *sk)
3299 TCP_RTO_MAX); 3302 TCP_RTO_MAX);
3300 } 3303 }
3301} 3304}
3305
3306int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
3307{
3308 const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
3309 struct flowi fl;
3310 int res;
3311
3312 res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
3313 if (!res) {
3314 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
3315 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3316 }
3317 return res;
3318}
3319EXPORT_SYMBOL(tcp_rtx_synack);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d92f94b7e402..f57c0e4c2326 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,27 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
594 return true; 594 return true;
595} 595}
596 596
597static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
598 __be16 loc_port, __be32 loc_addr,
599 __be16 rmt_port, __be32 rmt_addr,
600 int dif)
601{
602 struct hlist_nulls_node *node;
603 struct sock *s = sk;
604 unsigned short hnum = ntohs(loc_port);
605
606 sk_nulls_for_each_from(s, node) {
607 if (__udp_is_mcast_sock(net, s,
608 loc_port, loc_addr,
609 rmt_port, rmt_addr,
610 dif, hnum))
611 goto found;
612 }
613 s = NULL;
614found:
615 return s;
616}
617
618/* 597/*
619 * This routine is called by the ICMP module when it gets some 598 * This routine is called by the ICMP module when it gets some
620 * sort of error condition. If err < 0 then the socket should 599 * sort of error condition. If err < 0 then the socket should
@@ -1588,8 +1567,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1588 goto csum_error; 1567 goto csum_error;
1589 1568
1590 1569
1591 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) 1570 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
1571 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1572 is_udplite);
1592 goto drop; 1573 goto drop;
1574 }
1593 1575
1594 rc = 0; 1576 rc = 0;
1595 1577
@@ -1637,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
1637 1619
1638 if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) 1620 if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
1639 skb1 = NULL; 1621 skb1 = NULL;
1622
1623 sock_put(sk);
1640 } 1624 }
1641 if (unlikely(skb1)) 1625 if (unlikely(skb1))
1642 kfree_skb(skb1); 1626 kfree_skb(skb1);
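flush_stack() now drops a reference on every socket it delivers to; the multicast receive path (next hunk) takes that reference with sock_hold() while the hash-slot lock is held, so the actual queueing work can run with no lock. A userland C sketch of this take-references-under-the-lock, deliver-unlocked pattern (types and names are illustrative only, not kernel APIs):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int refcnt;
	int id;
};

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj bucket[4] = { {1, 0}, {1, 1}, {1, 2}, {1, 3} };

int main(void)
{
	struct obj *stack[4];
	unsigned int count = 0, i;

	/* Walk the bucket under the lock, only taking references. */
	pthread_mutex_lock(&bucket_lock);
	for (i = 0; i < 4; i++) {
		atomic_fetch_add(&bucket[i].refcnt, 1);	/* plays the role of sock_hold() */
		stack[count++] = &bucket[i];
	}
	pthread_mutex_unlock(&bucket_lock);

	/* Do the slow per-object work with the lock released. */
	for (i = 0; i < count; i++) {
		printf("deliver to obj %d\n", stack[i]->id);
		atomic_fetch_sub(&stack[i]->refcnt, 1);	/* plays the role of sock_put() */
	}
	return 0;
}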
@@ -1665,41 +1649,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1665 struct udp_table *udptable) 1649 struct udp_table *udptable)
1666{ 1650{
1667 struct sock *sk, *stack[256 / sizeof(struct sock *)]; 1651 struct sock *sk, *stack[256 / sizeof(struct sock *)];
1668 struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); 1652 struct hlist_nulls_node *node;
1669 int dif; 1653 unsigned short hnum = ntohs(uh->dest);
1670 unsigned int i, count = 0; 1654 struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
1655 int dif = skb->dev->ifindex;
1656 unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
1657 unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
1658
1659 if (use_hash2) {
1660 hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
1661 udp_table.mask;
1662 hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
1663start_lookup:
1664 hslot = &udp_table.hash2[hash2];
1665 offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
1666 }
1671 1667
1672 spin_lock(&hslot->lock); 1668 spin_lock(&hslot->lock);
1673 sk = sk_nulls_head(&hslot->head); 1669 sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
1674 dif = skb->dev->ifindex; 1670 if (__udp_is_mcast_sock(net, sk,
1675 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); 1671 uh->dest, daddr,
1676 while (sk) { 1672 uh->source, saddr,
1677 stack[count++] = sk; 1673 dif, hnum)) {
1678 sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, 1674 if (unlikely(count == ARRAY_SIZE(stack))) {
1679 daddr, uh->source, saddr, dif); 1675 flush_stack(stack, count, skb, ~0);
1680 if (unlikely(count == ARRAY_SIZE(stack))) { 1676 count = 0;
1681 if (!sk) 1677 }
1682 break; 1678 stack[count++] = sk;
1683 flush_stack(stack, count, skb, ~0); 1679 sock_hold(sk);
1684 count = 0;
1685 } 1680 }
1686 } 1681 }
1687 /*
1688 * before releasing chain lock, we must take a reference on sockets
1689 */
1690 for (i = 0; i < count; i++)
1691 sock_hold(stack[i]);
1692 1682
1693 spin_unlock(&hslot->lock); 1683 spin_unlock(&hslot->lock);
1694 1684
1685 /* Also lookup *:port if we are using hash2 and haven't done so yet. */
1686 if (use_hash2 && hash2 != hash2_any) {
1687 hash2 = hash2_any;
1688 goto start_lookup;
1689 }
1690
1695 /* 1691 /*
1696 * do the slow work with no lock held 1692 * do the slow work with no lock held
1697 */ 1693 */
1698 if (count) { 1694 if (count) {
1699 flush_stack(stack, count, skb, count - 1); 1695 flush_stack(stack, count, skb, count - 1);
1700
1701 for (i = 0; i < count; i++)
1702 sock_put(stack[i]);
1703 } else { 1696 } else {
1704 kfree_skb(skb); 1697 kfree_skb(skb);
1705 } 1698 }
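The rewritten __udp4_lib_mcast_deliver() above switches to the secondary (address, port) hash table when a primary slot holds more than ten sockets: it first scans the bucket keyed by the packet's destination address, then jumps back through start_lookup to scan the INADDR_ANY bucket, so wildcard binds are still found. A toy C sketch of that two-bucket walk (the hash and table layout are simplified placeholders, not the kernel's udp4_portaddr_hash()):

#include <stdio.h>

#define TABLE_MASK 0xffu

/* Toy address+port hash standing in for udp4_portaddr_hash(). */
static unsigned int addr_port_hash(unsigned int addr, unsigned short port)
{
	return (addr * 2654435761u) ^ port;
}

int main(void)
{
	unsigned int daddr = 0xe0000001u;	/* 224.0.0.1 */
	unsigned short hnum = 5353;

	unsigned int hash2     = addr_port_hash(daddr, hnum) & TABLE_MASK;
	unsigned int hash2_any = addr_port_hash(0, hnum) & TABLE_MASK; /* INADDR_ANY */

	printf("pass 1: scan bucket %u for sockets bound to the group address\n",
	       hash2);
	if (hash2 != hash2_any)
		printf("pass 2: scan bucket %u for wildcard (0.0.0.0) binds\n",
		       hash2_any);
	return 0;
}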
@@ -2523,79 +2516,3 @@ void __init udp_init(void)
2523 sysctl_udp_rmem_min = SK_MEM_QUANTUM; 2516 sysctl_udp_rmem_min = SK_MEM_QUANTUM;
2524 sysctl_udp_wmem_min = SK_MEM_QUANTUM; 2517 sysctl_udp_wmem_min = SK_MEM_QUANTUM;
2525} 2518}
2526
2527struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2528 netdev_features_t features)
2529{
2530 struct sk_buff *segs = ERR_PTR(-EINVAL);
2531 u16 mac_offset = skb->mac_header;
2532 int mac_len = skb->mac_len;
2533 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
2534 __be16 protocol = skb->protocol;
2535 netdev_features_t enc_features;
2536 int udp_offset, outer_hlen;
2537 unsigned int oldlen;
2538 bool need_csum;
2539
2540 oldlen = (u16)~skb->len;
2541
2542 if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
2543 goto out;
2544
2545 skb->encapsulation = 0;
2546 __skb_pull(skb, tnl_hlen);
2547 skb_reset_mac_header(skb);
2548 skb_set_network_header(skb, skb_inner_network_offset(skb));
2549 skb->mac_len = skb_inner_network_offset(skb);
2550 skb->protocol = htons(ETH_P_TEB);
2551
2552 need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
2553 if (need_csum)
2554 skb->encap_hdr_csum = 1;
2555
2556 /* segment inner packet. */
2557 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
2558 segs = skb_mac_gso_segment(skb, enc_features);
2559 if (!segs || IS_ERR(segs)) {
2560 skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
2561 mac_len);
2562 goto out;
2563 }
2564
2565 outer_hlen = skb_tnl_header_len(skb);
2566 udp_offset = outer_hlen - tnl_hlen;
2567 skb = segs;
2568 do {
2569 struct udphdr *uh;
2570 int len;
2571
2572 skb_reset_inner_headers(skb);
2573 skb->encapsulation = 1;
2574
2575 skb->mac_len = mac_len;
2576
2577 skb_push(skb, outer_hlen);
2578 skb_reset_mac_header(skb);
2579 skb_set_network_header(skb, mac_len);
2580 skb_set_transport_header(skb, udp_offset);
2581 len = skb->len - udp_offset;
2582 uh = udp_hdr(skb);
2583 uh->len = htons(len);
2584
2585 if (need_csum) {
2586 __be32 delta = htonl(oldlen + len);
2587
2588 uh->check = ~csum_fold((__force __wsum)
2589 ((__force u32)uh->check +
2590 (__force u32)delta));
2591 uh->check = gso_make_checksum(skb, ~uh->check);
2592
2593 if (uh->check == 0)
2594 uh->check = CSUM_MANGLED_0;
2595 }
2596
2597 skb->protocol = protocol;
2598 } while ((skb = skb->next));
2599out:
2600 return segs;
2601}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 546d2d439dda..59035bc3008d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -47,6 +47,82 @@ static int udp4_ufo_send_check(struct sk_buff *skb)
47 return 0; 47 return 0;
48} 48}
49 49
50struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
51 netdev_features_t features)
52{
53 struct sk_buff *segs = ERR_PTR(-EINVAL);
54 u16 mac_offset = skb->mac_header;
55 int mac_len = skb->mac_len;
56 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
57 __be16 protocol = skb->protocol;
58 netdev_features_t enc_features;
59 int udp_offset, outer_hlen;
60 unsigned int oldlen;
61 bool need_csum;
62
63 oldlen = (u16)~skb->len;
64
65 if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
66 goto out;
67
68 skb->encapsulation = 0;
69 __skb_pull(skb, tnl_hlen);
70 skb_reset_mac_header(skb);
71 skb_set_network_header(skb, skb_inner_network_offset(skb));
72 skb->mac_len = skb_inner_network_offset(skb);
73 skb->protocol = htons(ETH_P_TEB);
74
75 need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
76 if (need_csum)
77 skb->encap_hdr_csum = 1;
78
79 /* segment inner packet. */
80 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
81 segs = skb_mac_gso_segment(skb, enc_features);
82 if (IS_ERR_OR_NULL(segs)) {
83 skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
84 mac_len);
85 goto out;
86 }
87
88 outer_hlen = skb_tnl_header_len(skb);
89 udp_offset = outer_hlen - tnl_hlen;
90 skb = segs;
91 do {
92 struct udphdr *uh;
93 int len;
94
95 skb_reset_inner_headers(skb);
96 skb->encapsulation = 1;
97
98 skb->mac_len = mac_len;
99
100 skb_push(skb, outer_hlen);
101 skb_reset_mac_header(skb);
102 skb_set_network_header(skb, mac_len);
103 skb_set_transport_header(skb, udp_offset);
104 len = skb->len - udp_offset;
105 uh = udp_hdr(skb);
106 uh->len = htons(len);
107
108 if (need_csum) {
109 __be32 delta = htonl(oldlen + len);
110
111 uh->check = ~csum_fold((__force __wsum)
112 ((__force u32)uh->check +
113 (__force u32)delta));
114 uh->check = gso_make_checksum(skb, ~uh->check);
115
116 if (uh->check == 0)
117 uh->check = CSUM_MANGLED_0;
118 }
119
120 skb->protocol = protocol;
121 } while ((skb = skb->next));
122out:
123 return segs;
124}
125
50static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, 126static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
51 netdev_features_t features) 127 netdev_features_t features)
52{ 128{
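skb_udp_tunnel_segment(), now living in udp_offload.c, fixes up each segment's outer UDP checksum incrementally: it folds the difference between the old and new length into uh->check rather than recomputing the sum over the whole payload. The same one's-complement update (RFC 1624 style) can be demonstrated with a small standalone program; the helpers below are simplified stand-ins for csum_fold() and friends, not the kernel routines:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit accumulator into a 16-bit one's-complement sum. */
static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Full checksum over 16-bit words (checksum field excluded by the caller). */
static uint16_t csum(const uint16_t *w, int nwords)
{
	uint32_t sum = 0;

	while (nwords--)
		sum += *w++;
	return (uint16_t)~fold(sum);
}

/* Incremental update when one 16-bit field changes from old to new_val. */
static uint16_t csum_update(uint16_t check, uint16_t old, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old;
	sum += new_val;
	return (uint16_t)~fold(sum);
}

int main(void)
{
	/* Toy "UDP header" words: source port, dest port, length. */
	uint16_t words[3] = { 0x1234, 0x0035, 0x0020 };
	uint16_t check = csum(words, 3);
	uint16_t new_len = 0x0058;

	uint16_t incremental = csum_update(check, words[2], new_len);

	words[2] = new_len;
	assert(incremental == csum(words, 3));
	printf("incremental 0x%04x matches full recompute\n", incremental);
	return 0;
}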
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
new file mode 100644
index 000000000000..61ec1a65207e
--- /dev/null
+++ b/net/ipv4/udp_tunnel.c
@@ -0,0 +1,100 @@
1#include <linux/module.h>
2#include <linux/errno.h>
3#include <linux/socket.h>
4#include <linux/udp.h>
5#include <linux/types.h>
6#include <linux/kernel.h>
7#include <net/udp.h>
8#include <net/udp_tunnel.h>
9#include <net/net_namespace.h>
10
11int udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
12 struct socket **sockp)
13{
14 int err = -EINVAL;
15 struct socket *sock = NULL;
16
17#if IS_ENABLED(CONFIG_IPV6)
18 if (cfg->family == AF_INET6) {
19 struct sockaddr_in6 udp6_addr;
20
21 err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock);
22 if (err < 0)
23 goto error;
24
25 sk_change_net(sock->sk, net);
26
27 udp6_addr.sin6_family = AF_INET6;
28 memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
29 sizeof(udp6_addr.sin6_addr));
30 udp6_addr.sin6_port = cfg->local_udp_port;
31 err = kernel_bind(sock, (struct sockaddr *)&udp6_addr,
32 sizeof(udp6_addr));
33 if (err < 0)
34 goto error;
35
36 if (cfg->peer_udp_port) {
37 udp6_addr.sin6_family = AF_INET6;
38 memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
39 sizeof(udp6_addr.sin6_addr));
40 udp6_addr.sin6_port = cfg->peer_udp_port;
41 err = kernel_connect(sock,
42 (struct sockaddr *)&udp6_addr,
43 sizeof(udp6_addr), 0);
44 }
45 if (err < 0)
46 goto error;
47
48 udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums);
49 udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums);
50 } else
51#endif
52 if (cfg->family == AF_INET) {
53 struct sockaddr_in udp_addr;
54
55 err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
56 if (err < 0)
57 goto error;
58
59 sk_change_net(sock->sk, net);
60
61 udp_addr.sin_family = AF_INET;
62 udp_addr.sin_addr = cfg->local_ip;
63 udp_addr.sin_port = cfg->local_udp_port;
64 err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
65 sizeof(udp_addr));
66 if (err < 0)
67 goto error;
68
69 if (cfg->peer_udp_port) {
70 udp_addr.sin_family = AF_INET;
71 udp_addr.sin_addr = cfg->peer_ip;
72 udp_addr.sin_port = cfg->peer_udp_port;
73 err = kernel_connect(sock,
74 (struct sockaddr *)&udp_addr,
75 sizeof(udp_addr), 0);
76 if (err < 0)
77 goto error;
78 }
79
80 sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
81 } else {
82 return -EPFNOSUPPORT;
83 }
84
85
86 *sockp = sock;
87
88 return 0;
89
90error:
91 if (sock) {
92 kernel_sock_shutdown(sock, SHUT_RDWR);
93 sk_release_kernel(sock->sk);
94 }
95 *sockp = NULL;
96 return err;
97}
98EXPORT_SYMBOL(udp_sock_create);
99
100MODULE_LICENSE("GPL");
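The new udp_tunnel.c exports udp_sock_create() so encapsulation code can open, bind and optionally connect a kernel UDP socket from a single struct udp_port_cfg. A hypothetical caller might look like the kernel-context sketch below; only the udp_sock_create() signature and the cfg fields come from the code above, while the wrapper name and port number are illustrative:

#include <linux/err.h>
#include <linux/in.h>
#include <net/udp_tunnel.h>

/* Hypothetical helper: open an IPv4 kernel UDP socket for a tunnel,
 * bound to any local address on the caller's port. */
static struct socket *tunnel_sock_open(struct net *net, __be16 port)
{
	struct udp_port_cfg cfg = {
		.family            = AF_INET,
		.local_udp_port    = port,
		.use_udp_checksums = true,
	};
	struct socket *sock;
	int err;

	cfg.local_ip.s_addr = htonl(INADDR_ANY);

	err = udp_sock_create(net, &cfg, &sock);
	if (err < 0)
		return ERR_PTR(err);

	/* The caller releases it later, presumably the same way udp_sock_create()
	 * does on error: kernel_sock_shutdown() plus sk_release_kernel(). */
	return sock;
}

Centralizing this setup in one helper lets the various UDP encapsulations share the dual-family socket creation instead of each driver open-coding it.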