aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Eric Dumazet <edumazet@google.com> 2012-06-26 19:14:15 -0400
committer: David S. Miller <davem@davemloft.net> 2012-06-27 18:34:24 -0400
commit: c074da2810c118b3812f32d6754bd9ead2f169e7 (patch)
tree: 772c7fbb9da464f9afd6d56e9e610157ed665e8f
parent: 93040ae5cc8dcc893eca4a4366dc8415af278edf (diff)
ipv4: tcp: dont cache unconfirmed input dst
DDoS synflood attacks hit the IP route cache badly. On typical machines, this cache is allowed to hold up to 8 million dst entries, 256 bytes each, for a total of 2GB of memory. rt_garbage_collect() triggers and tries to clean things up. Eventually the route cache is disabled, but the machine is under fire and might OOM and crash. This patch exploits the new TCP early demux to set a nocache boolean in case the incoming TCP frame is for a socket that is not yet ESTABLISHED or TIMEWAIT. This 'nocache' boolean is then used, in case the dst entry is not found in the route cache, to create an unhashed dst entry (DST_NOCACHE). SYN-cookie ACKs sent use a similar mechanism (ipv4: tcp: dont cache output dst for syncookies), so after this patch, a machine is able to absorb a DDoS synflood attack without polluting its IP route cache. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Hans Schillstrom <hans.schillstrom@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/protocol.h2
-rw-r--r--include/net/route.h8
-rw-r--r--include/net/tcp.h2
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/ip_fragment.c2
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/route.c8
-rw-r--r--net/ipv4/tcp_ipv4.c4
-rw-r--r--net/ipv4/xfrm4_input.c2
9 files changed, 20 insertions, 15 deletions
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 967b926cbfb1..7cfc8f76914d 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,7 +37,7 @@
37 37
38/* This is used to register protocols. */ 38/* This is used to register protocols. */
39struct net_protocol { 39struct net_protocol {
40 int (*early_demux)(struct sk_buff *skb); 40 int (*early_demux)(struct sk_buff *skb, bool *nocache);
41 int (*handler)(struct sk_buff *skb); 41 int (*handler)(struct sk_buff *skb);
42 void (*err_handler)(struct sk_buff *skb, u32 info); 42 void (*err_handler)(struct sk_buff *skb, u32 info);
43 int (*gso_send_check)(struct sk_buff *skb); 43 int (*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/route.h b/include/net/route.h
index 47eb25ac1f7f..6361f9335774 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -201,18 +201,18 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
201} 201}
202 202
203extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, 203extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
204 u8 tos, struct net_device *devin, bool noref); 204 u8 tos, struct net_device *devin, bool noref, bool nocache);
205 205
206static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, 206static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
207 u8 tos, struct net_device *devin) 207 u8 tos, struct net_device *devin)
208{ 208{
209 return ip_route_input_common(skb, dst, src, tos, devin, false); 209 return ip_route_input_common(skb, dst, src, tos, devin, false, false);
210} 210}
211 211
212static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, 212static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
213 u8 tos, struct net_device *devin) 213 u8 tos, struct net_device *devin, bool nocache)
214{ 214{
215 return ip_route_input_common(skb, dst, src, tos, devin, true); 215 return ip_route_input_common(skb, dst, src, tos, devin, true, nocache);
216} 216}
217 217
218extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 218extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6660ffc4963d..917ed2e55e8c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
325 325
326extern void tcp_shutdown (struct sock *sk, int how); 326extern void tcp_shutdown (struct sock *sk, int how);
327 327
328extern int tcp_v4_early_demux(struct sk_buff *skb); 328extern int tcp_v4_early_demux(struct sk_buff *skb, bool *nocache);
329extern int tcp_v4_rcv(struct sk_buff *skb); 329extern int tcp_v4_rcv(struct sk_buff *skb);
330 330
331extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); 331extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0c757d..6a9795944369 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
828 } 828 }
829 829
830 if (arp->ar_op == htons(ARPOP_REQUEST) && 830 if (arp->ar_op == htons(ARPOP_REQUEST) &&
831 ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { 831 ip_route_input_noref(skb, tip, sip, 0, dev, false) == 0) {
832 832
833 rt = skb_rtable(skb); 833 rt = skb_rtable(skb);
834 addr_type = rt->rt_type; 834 addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409c..978d55f256ea 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -259,7 +259,7 @@ static void ip_expire(unsigned long arg)
259 skb_dst_drop(head); 259 skb_dst_drop(head);
260 iph = ip_hdr(head); 260 iph = ip_hdr(head);
261 err = ip_route_input_noref(head, iph->daddr, iph->saddr, 261 err = ip_route_input_noref(head, iph->daddr, iph->saddr,
262 iph->tos, head->dev); 262 iph->tos, head->dev, false);
263 if (err) 263 if (err)
264 goto out_rcu_unlock; 264 goto out_rcu_unlock;
265 265
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2a39204de5bc..7be54c8dcbe2 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -326,6 +326,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
326 */ 326 */
327 if (skb_dst(skb) == NULL) { 327 if (skb_dst(skb) == NULL) {
328 int err = -ENOENT; 328 int err = -ENOENT;
329 bool nocache = false;
329 330
330 if (sysctl_ip_early_demux) { 331 if (sysctl_ip_early_demux) {
331 const struct net_protocol *ipprot; 332 const struct net_protocol *ipprot;
@@ -334,13 +335,13 @@ static int ip_rcv_finish(struct sk_buff *skb)
334 rcu_read_lock(); 335 rcu_read_lock();
335 ipprot = rcu_dereference(inet_protos[protocol]); 336 ipprot = rcu_dereference(inet_protos[protocol]);
336 if (ipprot && ipprot->early_demux) 337 if (ipprot && ipprot->early_demux)
337 err = ipprot->early_demux(skb); 338 err = ipprot->early_demux(skb, &nocache);
338 rcu_read_unlock(); 339 rcu_read_unlock();
339 } 340 }
340 341
341 if (err) { 342 if (err) {
342 err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 343 err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
343 iph->tos, skb->dev); 344 iph->tos, skb->dev, nocache);
344 if (unlikely(err)) { 345 if (unlikely(err)) {
345 if (err == -EXDEV) 346 if (err == -EXDEV)
346 NET_INC_STATS_BH(dev_net(skb->dev), 347 NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 81533e3a23d1..fdc7900f9d7a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2214,7 +2214,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2214 */ 2214 */
2215 2215
2216static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2216static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2217 u8 tos, struct net_device *dev) 2217 u8 tos, struct net_device *dev, bool nocache)
2218{ 2218{
2219 struct fib_result res; 2219 struct fib_result res;
2220 struct in_device *in_dev = __in_dev_get_rcu(dev); 2220 struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -2353,6 +2353,8 @@ local_input:
2353 rth->dst.error= -err; 2353 rth->dst.error= -err;
2354 rth->rt_flags &= ~RTCF_LOCAL; 2354 rth->rt_flags &= ~RTCF_LOCAL;
2355 } 2355 }
2356 if (nocache)
2357 rth->dst.flags |= DST_NOCACHE;
2356 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); 2358 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2357 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); 2359 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2358 err = 0; 2360 err = 0;
@@ -2395,7 +2397,7 @@ martian_source_keep_err:
2395} 2397}
2396 2398
2397int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2399int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2398 u8 tos, struct net_device *dev, bool noref) 2400 u8 tos, struct net_device *dev, bool noref, bool nocache)
2399{ 2401{
2400 struct rtable *rth; 2402 struct rtable *rth;
2401 unsigned int hash; 2403 unsigned int hash;
@@ -2471,7 +2473,7 @@ skip_cache:
2471 rcu_read_unlock(); 2473 rcu_read_unlock();
2472 return -EINVAL; 2474 return -EINVAL;
2473 } 2475 }
2474 res = ip_route_input_slow(skb, daddr, saddr, tos, dev); 2476 res = ip_route_input_slow(skb, daddr, saddr, tos, dev, nocache);
2475 rcu_read_unlock(); 2477 rcu_read_unlock();
2476 return res; 2478 return res;
2477} 2479}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1781dc650b9d..33aabd4fc20f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1673,7 +1673,7 @@ csum_err:
1673} 1673}
1674EXPORT_SYMBOL(tcp_v4_do_rcv); 1674EXPORT_SYMBOL(tcp_v4_do_rcv);
1675 1675
1676int tcp_v4_early_demux(struct sk_buff *skb) 1676int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache)
1677{ 1677{
1678 struct net *net = dev_net(skb->dev); 1678 struct net *net = dev_net(skb->dev);
1679 const struct iphdr *iph; 1679 const struct iphdr *iph;
@@ -1719,6 +1719,8 @@ int tcp_v4_early_demux(struct sk_buff *skb)
1719 } 1719 }
1720 } 1720 }
1721 } 1721 }
1722 } else {
1723 *no_dst_cache = true;
1722 } 1724 }
1723 1725
1724out_err: 1726out_err:
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216dc..eee636b191b9 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
28 const struct iphdr *iph = ip_hdr(skb); 28 const struct iphdr *iph = ip_hdr(skb);
29 29
30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr, 30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
31 iph->tos, skb->dev)) 31 iph->tos, skb->dev, false))
32 goto drop; 32 goto drop;
33 } 33 }
34 return dst_input(skb); 34 return dst_input(skb);