path: root/net/sched/sch_fq.c
author	Eric Dumazet <edumazet@google.com>	2015-02-05 00:30:40 -0500
committer	David S. Miller <davem@davemloft.net>	2015-02-05 01:15:45 -0500
commit	06eb395fa9856b5a87cf7d80baee2a0ed3cdb9d7 (patch)
tree	b17f7ee51599f992622f924d6a39fcc8e06292f5 /net/sched/sch_fq.c
parent	f2683b743f2334ef49a5361bf596dd1fbd2c9be4 (diff)
pkt_sched: fq: better control of DDOS traffic
FQ has a fast path for skbs attached to a socket, as it does not have to compute a flow hash. But for other packets, FQ being non-stochastic means that hosts exposed to random Internet traffic can allocate millions of flow structures (104 bytes each) pretty easily. Not only can the host OOM, but lookups in the RB trees can consume too much CPU and memory.

This patch adds a new attribute, orphan_mask, which adds the possibility of using a stochastic hash for orphaned skbs. Its default value is 1024 slots, to mimic SFQ behavior.

Note: this does not apply to locally generated TCP traffic, and no locally generated traffic will share a flow structure with another perfect or stochastic flow.

This patch also handles the specific case of SYNACK messages: they are attached to the listener socket, and therefore all map to a single hash bucket. If the listener has set SO_MAX_PACING_RATE, hoping to have newly accepted sockets inherit this rate, SYNACKs might be paced and even dropped. This is very similar to an internal patch Google has used for more than a year.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
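To make the bucketing concrete, below is a minimal userspace C sketch (not kernel code) of the idea: an orphaned packet's flow hash is masked down to one of orphan_mask + 1 stochastic slots, and the resulting key is shifted left with its low bit set so it can never collide with a word-aligned socket pointer used by non-orphaned traffic. The toy hash function and the addresses/ports are illustrative assumptions; only the masking and key construction mirror the patch. With a recent enough iproute2, the mask can presumably also be tuned from the command line, e.g. "tc qdisc replace dev eth0 root fq orphan_mask 1023" (assuming the tc fq frontend exposes the attribute).

/* Userspace sketch of the orphan bucketing used by fq_classify() */
#include <stdint.h>
#include <stdio.h>

#define ORPHAN_MASK	(1024 - 1)	/* default in this patch: 1024 slots */

/* stand-in for skb_get_hash(); any 32-bit flow hash works for the sketch */
static uint32_t toy_flow_hash(uint32_t saddr, uint32_t daddr,
			      uint16_t sport, uint16_t dport)
{
	uint32_t h = saddr * 2654435761u;

	h ^= daddr * 2246822519u;
	h ^= (((uint32_t)sport << 16) | dport) * 3266489917u;
	return h;
}

/* build the lookup key for an orphaned (or listener-attached) packet */
static unsigned long orphan_key(uint32_t flow_hash)
{
	unsigned long hash = flow_hash & ORPHAN_MASK;

	/* low bit forced to 1 so the key can never equal a word-aligned
	 * socket pointer used for non-orphaned packets
	 */
	return (hash << 1) | 1UL;
}

int main(void)
{
	uint32_t h = toy_flow_hash(0x0a000001, 0x0a000002, 12345, 80);

	printf("bucket key: %#lx (one of %d stochastic slots)\n",
	       orphan_key(h), ORPHAN_MASK + 1);
	return 0;
}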
Diffstat (limited to 'net/sched/sch_fq.c')
-rw-r--r--	net/sched/sch_fq.c	19
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 69a3dbf55c60..a00c43043001 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -93,6 +93,7 @@ struct fq_sched_data {
 	u32		flow_refill_delay;
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
+	u32		orphan_mask;	/* mask for orphaned skb */
 	struct rb_root	*fq_root;
 	u8		rate_enable;
 	u8		fq_trees_log;
@@ -223,11 +224,20 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
 		return &q->internal;
 
-	if (unlikely(!sk)) {
+	/* SYNACK messages are attached to a listener socket.
+	 * 1) They are not part of a 'flow' yet
+	 * 2) We do not want to rate limit them (eg SYNFLOOD attack),
+	 *    especially if the listener set SO_MAX_PACING_RATE
+	 * 3) We pretend they are orphaned
+	 */
+	if (!sk || sk->sk_state == TCP_LISTEN) {
+		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+
 		/* By forcing low order bit to 1, we make sure to not
 		 * collide with a local flow (socket pointers are word aligned)
 		 */
-		sk = (struct sock *)(skb_get_hash(skb) | 1L);
+		sk = (struct sock *)((hash << 1) | 1UL);
+		skb_orphan(skb);
 	}
 
 	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
@@ -704,6 +714,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
 	}
 
+	if (tb[TCA_FQ_ORPHAN_MASK])
+		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+
 	if (!err) {
 		sch_tree_unlock(sch);
 		err = fq_resize(sch, fq_log);
@@ -749,6 +762,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->delayed		= RB_ROOT;
 	q->fq_root		= NULL;
 	q->fq_trees_log		= ilog2(1024);
+	q->orphan_mask		= 1024 - 1;
 	qdisc_watchdog_init(&q->watchdog, sch);
 
 	if (opt)
@@ -778,6 +792,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
 			jiffies_to_usecs(q->flow_refill_delay)) ||
+	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 