aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.txt23
-rw-r--r--include/linux/sysctl.h1
-rw-r--r--include/net/inetpeer.h1
-rw-r--r--include/net/ip.h2
-rw-r--r--net/ipv4/inetpeer.c1
-rw-r--r--net/ipv4/ip_fragment.c68
-rw-r--r--net/ipv4/ip_output.c1
-rw-r--r--net/ipv4/sysctl_net_ipv4.c10
8 files changed, 106 insertions, 1 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebc09a159f62..2b7cf19a06ad 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER
46 for the hash secret) for IP fragments. 46 for the hash secret) for IP fragments.
47 Default: 600 47 Default: 600
48 48
49ipfrag_max_dist - INTEGER
50 ipfrag_max_dist is a non-negative integer value which defines the
51 maximum "disorder" which is allowed among fragments which share a
52 common IP source address. Note that reordering of packets is
53 not unusual, but if a large number of fragments arrive from a source
54 IP address while a particular fragment queue remains incomplete, it
55 probably indicates that one or more fragments belonging to that queue
56 have been lost. When ipfrag_max_dist is positive, an additional check
57 is done on fragments before they are added to a reassembly queue - if
58 ipfrag_max_dist (or more) fragments have arrived from a particular IP
59 address between additions to any IP fragment queue using that source
60 address, it's presumed that one or more fragments in the queue are
61 lost. The existing fragment queue will be dropped, and a new one
62 started. An ipfrag_max_dist value of zero disables this check.
63
64 Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
65 result in unnecessarily dropping fragment queues when normal
66 reordering of packets occurs, which could lead to poor application
67 performance. Using a very large value, e.g. 50000, increases the
68 likelihood of incorrectly reassembling IP fragments that originate
69 from different IP datagrams, which could result in data corruption.
70 Default: 64
71
49INET peer storage: 72INET peer storage:
50 73
51inet_peer_threshold - INTEGER 74inet_peer_threshold - INTEGER
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4be34ef8c2f7..93fa765e47d3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -390,6 +390,7 @@ enum
390 NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, 390 NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
391 NET_TCP_CONG_CONTROL=110, 391 NET_TCP_CONG_CONTROL=110,
392 NET_TCP_ABC=111, 392 NET_TCP_ABC=111,
393 NET_IPV4_IPFRAG_MAX_DIST=112,
393}; 394};
394 395
395enum { 396enum {
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 7fda471002b6..0965515f40cf 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -25,6 +25,7 @@ struct inet_peer
25 __u32 v4daddr; /* peer's address */ 25 __u32 v4daddr; /* peer's address */
26 __u16 avl_height; 26 __u16 avl_height;
27 __u16 ip_id_count; /* IP ID for the next packet */ 27 __u16 ip_id_count; /* IP ID for the next packet */
28 atomic_t rid; /* Frag reception counter */
28 __u32 tcp_ts; 29 __u32 tcp_ts;
29 unsigned long tcp_ts_stamp; 30 unsigned long tcp_ts_stamp;
30}; 31};
diff --git a/include/net/ip.h b/include/net/ip.h
index e4563bbee6ea..4d6294ba038e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -45,6 +45,7 @@ struct inet_skb_parm
45#define IPSKB_TRANSLATED 2 45#define IPSKB_TRANSLATED 2
46#define IPSKB_FORWARDED 4 46#define IPSKB_FORWARDED 4
47#define IPSKB_XFRM_TUNNEL_SIZE 8 47#define IPSKB_XFRM_TUNNEL_SIZE 8
48#define IPSKB_FRAG_COMPLETE 16
48}; 49};
49 50
50struct ipcm_cookie 51struct ipcm_cookie
@@ -168,6 +169,7 @@ extern int sysctl_ipfrag_high_thresh;
168extern int sysctl_ipfrag_low_thresh; 169extern int sysctl_ipfrag_low_thresh;
169extern int sysctl_ipfrag_time; 170extern int sysctl_ipfrag_time;
170extern int sysctl_ipfrag_secret_interval; 171extern int sysctl_ipfrag_secret_interval;
172extern int sysctl_ipfrag_max_dist;
171 173
172/* From inetpeer.c */ 174/* From inetpeer.c */
173extern int inet_peer_threshold; 175extern int inet_peer_threshold;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924f..ce5fe3f74a3d 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
401 return NULL; 401 return NULL;
402 n->v4daddr = daddr; 402 n->v4daddr = daddr;
403 atomic_set(&n->refcnt, 1); 403 atomic_set(&n->refcnt, 1);
404 atomic_set(&n->rid, 0);
404 n->ip_id_count = secure_ip_id(daddr); 405 n->ip_id_count = secure_ip_id(daddr);
405 n->tcp_ts_stamp = 0; 406 n->tcp_ts_stamp = 0;
406 407
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48e..ce2b70ce4018 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
22 * Patrick McHardy : LRU queue of frag heads for evictor. 22 * Patrick McHardy : LRU queue of frag heads for evictor.
23 */ 23 */
24 24
25#include <linux/compiler.h>
25#include <linux/config.h> 26#include <linux/config.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/types.h> 28#include <linux/types.h>
@@ -38,6 +39,7 @@
38#include <net/ip.h> 39#include <net/ip.h>
39#include <net/icmp.h> 40#include <net/icmp.h>
40#include <net/checksum.h> 41#include <net/checksum.h>
42#include <net/inetpeer.h>
41#include <linux/tcp.h> 43#include <linux/tcp.h>
42#include <linux/udp.h> 44#include <linux/udp.h>
43#include <linux/inet.h> 45#include <linux/inet.h>
@@ -56,6 +58,8 @@
56int sysctl_ipfrag_high_thresh = 256*1024; 58int sysctl_ipfrag_high_thresh = 256*1024;
57int sysctl_ipfrag_low_thresh = 192*1024; 59int sysctl_ipfrag_low_thresh = 192*1024;
58 60
61int sysctl_ipfrag_max_dist = 64;
62
59/* Important NOTE! Fragment queue must be destroyed before MSL expires. 63/* Important NOTE! Fragment queue must be destroyed before MSL expires.
60 * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. 64 * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
61 */ 65 */
@@ -89,8 +93,10 @@ struct ipq {
89 spinlock_t lock; 93 spinlock_t lock;
90 atomic_t refcnt; 94 atomic_t refcnt;
91 struct timer_list timer; /* when will this queue expire? */ 95 struct timer_list timer; /* when will this queue expire? */
92 int iif;
93 struct timeval stamp; 96 struct timeval stamp;
97 int iif;
98 unsigned int rid;
99 struct inet_peer *peer;
94}; 100};
95 101
96/* Hash table. */ 102/* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
195 BUG_TRAP(qp->last_in&COMPLETE); 201 BUG_TRAP(qp->last_in&COMPLETE);
196 BUG_TRAP(del_timer(&qp->timer) == 0); 202 BUG_TRAP(del_timer(&qp->timer) == 0);
197 203
204 if (qp->peer)
205 inet_putpeer(qp->peer);
206
198 /* Release all fragment data. */ 207 /* Release all fragment data. */
199 fp = qp->fragments; 208 fp = qp->fragments;
200 while (fp) { 209 while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
353 qp->meat = 0; 362 qp->meat = 0;
354 qp->fragments = NULL; 363 qp->fragments = NULL;
355 qp->iif = 0; 364 qp->iif = 0;
365 qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
356 366
357 /* Initialize a timer for this entry. */ 367 /* Initialize a timer for this entry. */
358 init_timer(&qp->timer); 368 init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
398 return ip_frag_create(hash, iph, user); 408 return ip_frag_create(hash, iph, user);
399} 409}
400 410
411/* Is the fragment too far ahead to be part of ipq? */
412static inline int ip_frag_too_far(struct ipq *qp)
413{
414 struct inet_peer *peer = qp->peer;
415 unsigned int max = sysctl_ipfrag_max_dist;
416 unsigned int start, end;
417
418 int rc;
419
420 if (!peer || !max)
421 return 0;
422
423 start = qp->rid;
424 end = atomic_inc_return(&peer->rid);
425 qp->rid = end;
426
427 rc = qp->fragments && (end - start) > max;
428
429 if (rc) {
430 IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
431 }
432
433 return rc;
434}
435
436static int ip_frag_reinit(struct ipq *qp)
437{
438 struct sk_buff *fp;
439
440 if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
441 atomic_inc(&qp->refcnt);
442 return -ETIMEDOUT;
443 }
444
445 fp = qp->fragments;
446 do {
447 struct sk_buff *xp = fp->next;
448 frag_kfree_skb(fp, NULL);
449 fp = xp;
450 } while (fp);
451
452 qp->last_in = 0;
453 qp->len = 0;
454 qp->meat = 0;
455 qp->fragments = NULL;
456 qp->iif = 0;
457
458 return 0;
459}
460
401/* Add new segment to existing queue. */ 461/* Add new segment to existing queue. */
402static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) 462static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
403{ 463{
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
408 if (qp->last_in & COMPLETE) 468 if (qp->last_in & COMPLETE)
409 goto err; 469 goto err;
410 470
471 if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
472 unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
473 ipq_kill(qp);
474 goto err;
475 }
476
411 offset = ntohs(skb->nh.iph->frag_off); 477 offset = ntohs(skb->nh.iph->frag_off);
412 flags = offset & ~IP_OFFSET; 478 flags = offset & ~IP_OFFSET;
413 offset &= IP_OFFSET; 479 offset &= IP_OFFSET;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd397..2a830de3a699 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
445 445
446 hlen = iph->ihl * 4; 446 hlen = iph->ihl * 4;
447 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ 447 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
448 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
448 449
449 /* When frag_list is given, use it. First, check its validity: 450 /* When frag_list is given, use it. First, check its validity:
450 * some transformers could create wrong frag_list or break existing 451 * some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48b..dbf82955aabe 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -22,6 +22,7 @@
22extern int sysctl_ip_nonlocal_bind; 22extern int sysctl_ip_nonlocal_bind;
23 23
24#ifdef CONFIG_SYSCTL 24#ifdef CONFIG_SYSCTL
25static int zero;
25static int tcp_retr1_max = 255; 26static int tcp_retr1_max = 255;
26static int ip_local_port_range_min[] = { 1, 1 }; 27static int ip_local_port_range_min[] = { 1, 1 };
27static int ip_local_port_range_max[] = { 65535, 65535 }; 28static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -614,6 +615,15 @@ ctl_table ipv4_table[] = {
614 .strategy = &sysctl_jiffies 615 .strategy = &sysctl_jiffies
615 }, 616 },
616 { 617 {
618 .ctl_name = NET_IPV4_IPFRAG_MAX_DIST,
619 .procname = "ipfrag_max_dist",
620 .data = &sysctl_ipfrag_max_dist,
621 .maxlen = sizeof(int),
622 .mode = 0644,
623 .proc_handler = &proc_dointvec_minmax,
624 .extra1 = &zero
625 },
626 {
617 .ctl_name = NET_TCP_NO_METRICS_SAVE, 627 .ctl_name = NET_TCP_NO_METRICS_SAVE,
618 .procname = "tcp_no_metrics_save", 628 .procname = "tcp_no_metrics_save",
619 .data = &sysctl_tcp_nometrics_save, 629 .data = &sysctl_tcp_nometrics_save,