author	Terry Lam <vtlam@google.com>	2013-12-15 03:30:21 -0500
committer	David S. Miller <davem@davemloft.net>	2013-12-19 14:48:42 -0500
commit	10239edf86f137ce4c39b62ea9575e8053c549a0 (patch)
tree	f7f81307559556f9d435c1d2a91ea9e497a13a80 /net
parent	2a2529ef2bb477dbf11311710c6de46298192316 (diff)
net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc
This patch implements the first size-based qdisc that attempts to differentiate between small flows and heavy-hitters. The goal is to catch the heavy-hitters and move them to a separate queue with less priority so that bulk traffic does not affect the latency of critical traffic. Currently "less priority" means less weight (2:1 in particular) in a Weighted Deficit Round Robin (WDRR) scheduler.

In essence, this patch addresses the "delay-bloat" problem due to bloated buffers. In some systems, large queues may be necessary for obtaining CPU efficiency, or due to the presence of unresponsive traffic like UDP, or just a large number of connections with each having a small amount of outstanding traffic. In these circumstances, HHF aims to reduce the HoL blocking for latency sensitive traffic, while not impacting the queues built up by bulk traffic. HHF can also be used in conjunction with other AQM mechanisms such as CoDel.

To capture heavy-hitters, we implement the "multi-stage filter" design in the following paper:
C. Estan and G. Varghese, "New Directions in Traffic Measurement and Accounting", in ACM SIGCOMM, 2002.

Some configurable qdisc settings through 'tc':
- hhf_reset_timeout: period to reset counter values in the multi-stage filter (default 40ms)
- hhf_admit_bytes: threshold to classify heavy-hitters (default 128KB)
- hhf_evict_timeout: threshold to evict idle heavy-hitters (default 1s)
- hhf_non_hh_weight: Weighted Deficit Round Robin (WDRR) weight for non-heavy-hitters (default 2)
- hh_flows_limit: max number of heavy-hitter flow entries (default 2048)

Note that the ratio between hhf_admit_bytes and hhf_reset_timeout reflects the bandwidth of heavy-hitters that we attempt to capture (25Mbps with the above default settings).

The false negative rate (heavy-hitter flows getting away unclassified) is zero by the design of the multi-stage filter algorithm. With 100 heavy-hitter flows, using four hashes and 4000 counters yields a false positive rate (non-heavy-hitters mistakenly classified as heavy-hitters) of less than 1e-4.

Signed-off-by: Terry Lam <vtlam@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
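For illustration only, below is a minimal user-space sketch of the admit test performed by the multi-stage filter described above; it is not part of the patch, and STAGES, STAGE_LEN, ADMIT_BYTES and filter_admit() are hypothetical names chosen to mirror the qdisc defaults (hashing, locking and the periodic counter reset are omitted). With the defaults, admitting after 128KB within a 40ms reset period corresponds to just over 25 Mbit/s, the figure quoted above.

/* Sketch only: per-stage byte counters indexed by precomputed hash positions.
 * A flow is classified as a heavy-hitter iff *all* stage counters would
 * exceed the admit threshold after adding the packet size.
 */
#include <stdint.h>

#define STAGES      4             /* number of counter arrays (hashes) */
#define STAGE_LEN   1024          /* counters per array */
#define ADMIT_BYTES (128 * 1024)  /* heavy-hitter admit threshold */

static uint32_t stage[STAGES][STAGE_LEN];

/* pos[i] is the flow's index into stage i; returns 1 for a heavy-hitter. */
static int filter_admit(const uint32_t pos[STAGES], uint32_t pkt_len)
{
	uint32_t min_val = UINT32_MAX;
	int i;

	for (i = 0; i < STAGES; i++) {
		uint32_t val = stage[i][pos[i]] + pkt_len;

		if (val < min_val)
			min_val = val;
	}
	if (min_val > ADMIT_BYTES)
		return 1;	/* heavy-hitter: counters left untouched ("shielding") */

	/* Conservative update: counter = max(counter, smallest counter + bytes) */
	for (i = 0; i < STAGES; i++)
		if (stage[i][pos[i]] < min_val)
			stage[i][pos[i]] = min_val;
	return 0;
}

The kernel-side implementation of the same logic, including the shielding and conservative-update optimizations from the Estan/Varghese paper, is in hhf_classify() in net/sched/sch_hhf.c below. With a recent enough iproute2, the qdisc itself is attached with something like 'tc qdisc add dev eth0 root hhf', which exposes the tunables listed above.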
Diffstat (limited to 'net')
-rw-r--r--	net/sched/Kconfig	9
-rw-r--r--	net/sched/Makefile	1
-rw-r--r--	net/sched/sch_hhf.c	746
3 files changed, 756 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ad1f1d819203..919847beec39 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -286,6 +286,15 @@ config NET_SCH_FQ
 
 	  If unsure, say N.
 
+config NET_SCH_HHF
+	tristate "Heavy-Hitter Filter (HHF)"
+	help
+	  Say Y here if you want to use the Heavy-Hitter Filter (HHF)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_hhf.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 35fa47a494ab..3442e5fbc4d7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
 obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
+obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
new file mode 100644
index 000000000000..97aa33dbb90f
--- /dev/null
+++ b/net/sched/sch_hhf.c
@@ -0,0 +1,746 @@
1/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF)
2 *
3 * Copyright (C) 2013 Terry Lam <vtlam@google.com>
4 * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
5 */
6
7#include <linux/jhash.h>
8#include <linux/jiffies.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/vmalloc.h>
12#include <net/flow_keys.h>
13#include <net/pkt_sched.h>
14#include <net/sock.h>
15
16/* Heavy-Hitter Filter (HHF)
17 *
18 * Principles :
19 * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
20 * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
21 * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
22 * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
23 * in which the heavy-hitter bucket is served with less weight.
24 * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
25 * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have
26 * higher share of bandwidth.
27 *
28 * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
29 * following paper:
30 * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
31 * Accounting", in ACM SIGCOMM, 2002.
32 *
33 * Conceptually, a multi-stage filter comprises k independent hash functions
34 * and k counter arrays. Packets are indexed into k counter arrays by k hash
35 * functions, respectively. The counters are then increased by the packet sizes.
36 * Therefore,
37 * - For a heavy-hitter flow: *all* of its k array counters must be large.
38 * - For a non-heavy-hitter flow: some of its k array counters can be large
39 * due to hash collision with other small flows; however, with high
40 * probability, not *all* k counters are large.
41 *
42 * By the design of the multi-stage filter algorithm, the false negative rate
43 * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
44 * susceptible to false positives (non-heavy-hitters mistakenly classified as
45 * heavy-hitters).
46 * Therefore, we also implement the following optimizations to reduce false
47 * positives by avoiding unnecessary increment of the counter values:
48 * - Optimization O1: once a heavy-hitter is identified, its bytes are not
49 * accounted in the array counters. This technique is called "shielding"
50 * in Section 3.3.1 of [EV02].
51 * - Optimization O2: conservative update of counters
52 * (Section 3.3.2 of [EV02]),
53 * New counter value = max {old counter value,
54 * smallest counter value + packet bytes}
55 *
56 * Finally, we refresh the counters periodically since otherwise the counter
57 * values will keep accumulating.
58 *
59 * Once a flow is classified as heavy-hitter, we also save its per-flow state
60 * in an exact-matching flow table so that its subsequent packets can be
61 * dispatched to the heavy-hitter bucket accordingly.
62 *
63 *
64 * At a high level, this qdisc works as follows:
65 * Given a packet p:
66 * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
67 * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
68 * bucket.
69 * - Otherwise, forward p to the multi-stage filter, denoted filter F
70 * + If F decides that p belongs to a non-heavy-hitter flow, then send p
71 * to the non-heavy-hitter bucket.
72 * + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
73 * then set up a new flow entry for the flow-id of p in the table T and
74 * send p to the heavy-hitter bucket.
75 *
76 * In this implementation:
77 * - T is a fixed-size hash-table with 1024 entries. Hash collision is
78 * resolved by linked-list chaining.
79 * - F has four counter arrays, each array containing 1024 32-bit counters.
80 * That means 4 * 1024 * 32 bits = 16KB of memory.
81 * - Since each array in F contains 1024 counters, 10 bits are sufficient to
82 * index into each array.
83 * Hence, instead of having four hash functions, we chop the 32-bit
84 * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
85 * computed as XOR sum of those three chunks.
86 * - We need to clear the counter arrays periodically; however, directly
87 * memsetting 16KB of memory can lead to cache eviction and unwanted delay.
88 * So, by representing each counter with a valid bit, we only need to reset
89 * 4K bits (i.e. 512 bytes) instead of 16KB of memory.
90 * - The Deficit Round Robin engine is taken from fq_codel implementation
91 * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
92 * fq_codel_flow in fq_codel implementation.
93 *
94 */
95
96/* Non-configurable parameters */
97#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */
98#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */
99#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */
100#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */
101#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */
102
103#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */
104enum wdrr_bucket_idx {
105 WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */
106 WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */
107};
108
109#define hhf_time_before(a, b) \
110 (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
111
112/* Heavy-hitter per-flow state */
113struct hh_flow_state {
114 u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */
115 u32 hit_timestamp; /* last time heavy-hitter was seen */
116 struct list_head flowchain; /* chaining under hash collision */
117};
118
119/* Weighted Deficit Round Robin (WDRR) scheduler */
120struct wdrr_bucket {
121 struct sk_buff *head;
122 struct sk_buff *tail;
123 struct list_head bucketchain;
124 int deficit;
125};
126
127struct hhf_sched_data {
128 struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
129 u32 perturbation; /* hash perturbation */
130 u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
131 u32 drop_overlimit; /* number of times max qdisc packet
132 * limit was hit
133 */
134 struct list_head *hh_flows; /* table T (currently active HHs) */
135 u32 hh_flows_limit; /* max active HH allocs */
136 u32 hh_flows_overlimit; /* num of disallowed HH allocs */
137 u32 hh_flows_total_cnt; /* total admitted HHs */
138 u32 hh_flows_current_cnt; /* total current HHs */
139 u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
140 u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays
141 * was reset
142 */
143 unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
144 * of hhf_arrays
145 */
146 /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
147 struct list_head new_buckets; /* list of new buckets */
148 struct list_head old_buckets; /* list of old buckets */
149
150 /* Configurable HHF parameters */
151 u32 hhf_reset_timeout; /* interval to reset counter
152 * arrays in filter F
153 * (default 40ms)
154 */
155 u32 hhf_admit_bytes; /* counter thresh to classify as
156 * HH (default 128KB).
157 * With these default values,
158 * 128KB / 40ms = 25 Mbps
159 * i.e., we expect to capture HHs
160 * sending > 25 Mbps.
161 */
162 u32 hhf_evict_timeout; /* aging threshold to evict idle
163 * HHs out of table T. This should
164 * be large enough to avoid
165 * reordering during HH eviction.
166 * (default 1s)
167 */
168 u32 hhf_non_hh_weight; /* WDRR weight for non-HHs
169 * (default 2,
170 * i.e., non-HH : HH = 2 : 1)
171 */
172};
173
174static u32 hhf_time_stamp(void)
175{
176 return jiffies;
177}
178
179static unsigned int skb_hash(const struct hhf_sched_data *q,
180 const struct sk_buff *skb)
181{
182 struct flow_keys keys;
183 unsigned int hash;
184
185 if (skb->sk && skb->sk->sk_hash)
186 return skb->sk->sk_hash;
187
188 skb_flow_dissect(skb, &keys);
189 hash = jhash_3words((__force u32)keys.dst,
190 (__force u32)keys.src ^ keys.ip_proto,
191 (__force u32)keys.ports, q->perturbation);
192 return hash;
193}
194
195/* Looks up a heavy-hitter flow in a chaining list of table T. */
196static struct hh_flow_state *seek_list(const u32 hash,
197 struct list_head *head,
198 struct hhf_sched_data *q)
199{
200 struct hh_flow_state *flow, *next;
201 u32 now = hhf_time_stamp();
202
203 if (list_empty(head))
204 return NULL;
205
206 list_for_each_entry_safe(flow, next, head, flowchain) {
207 u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
208
209 if (hhf_time_before(prev, now)) {
210 /* Delete expired heavy-hitters, but preserve one entry
211 * to avoid kzalloc() when next time this slot is hit.
212 */
213 if (list_is_last(&flow->flowchain, head))
214 return NULL;
215 list_del(&flow->flowchain);
216 kfree(flow);
217 q->hh_flows_current_cnt--;
218 } else if (flow->hash_id == hash) {
219 return flow;
220 }
221 }
222 return NULL;
223}
224
225/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired
226 * entry or dynamically allocates a new one.
227 */
228static struct hh_flow_state *alloc_new_hh(struct list_head *head,
229 struct hhf_sched_data *q)
230{
231 struct hh_flow_state *flow;
232 u32 now = hhf_time_stamp();
233
234 if (!list_empty(head)) {
235 /* Find an expired heavy-hitter flow entry. */
236 list_for_each_entry(flow, head, flowchain) {
237 u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
238
239 if (hhf_time_before(prev, now))
240 return flow;
241 }
242 }
243
244 if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
245 q->hh_flows_overlimit++;
246 return NULL;
247 }
248 /* Create new entry. */
249 flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
250 if (!flow)
251 return NULL;
252
253 q->hh_flows_current_cnt++;
254 INIT_LIST_HEAD(&flow->flowchain);
255 list_add_tail(&flow->flowchain, head);
256
257 return flow;
258}
259
260/* Assigns packets to WDRR buckets. Implements a multi-stage filter to
261 * classify heavy-hitters.
262 */
263static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
264{
265 struct hhf_sched_data *q = qdisc_priv(sch);
266 u32 tmp_hash, hash;
267 u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
268 struct hh_flow_state *flow;
269 u32 pkt_len, min_hhf_val;
270 int i;
271 u32 prev;
272 u32 now = hhf_time_stamp();
273
274 /* Reset the HHF counter arrays if this is the right time. */
275 prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
276 if (hhf_time_before(prev, now)) {
277 for (i = 0; i < HHF_ARRAYS_CNT; i++)
278 bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
279 q->hhf_arrays_reset_timestamp = now;
280 }
281
282 /* Get hashed flow-id of the skb. */
283 hash = skb_hash(q, skb);
284
285 /* Check if this packet belongs to an already established HH flow. */
286 flow_pos = hash & HHF_BIT_MASK;
287 flow = seek_list(hash, &q->hh_flows[flow_pos], q);
288 if (flow) { /* found its HH flow */
289 flow->hit_timestamp = now;
290 return WDRR_BUCKET_FOR_HH;
291 }
292
293 /* Now pass the packet through the multi-stage filter. */
294 tmp_hash = hash;
295 xorsum = 0;
296 for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
297 /* Split the skb_hash into three 10-bit chunks. */
298 filter_pos[i] = tmp_hash & HHF_BIT_MASK;
299 xorsum ^= filter_pos[i];
300 tmp_hash >>= HHF_BIT_MASK_LEN;
301 }
302 /* The last chunk is computed as XOR sum of other chunks. */
303 filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;
304
305 pkt_len = qdisc_pkt_len(skb);
306 min_hhf_val = ~0U;
307 for (i = 0; i < HHF_ARRAYS_CNT; i++) {
308 u32 val;
309
310 if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
311 q->hhf_arrays[i][filter_pos[i]] = 0;
312 __set_bit(filter_pos[i], q->hhf_valid_bits[i]);
313 }
314
315 val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
316 if (min_hhf_val > val)
317 min_hhf_val = val;
318 }
319
320 /* Found a new HH iff all counter values > HH admit threshold. */
321 if (min_hhf_val > q->hhf_admit_bytes) {
322 /* Just captured a new heavy-hitter. */
323 flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
324 if (!flow) /* memory alloc problem */
325 return WDRR_BUCKET_FOR_NON_HH;
326 flow->hash_id = hash;
327 flow->hit_timestamp = now;
328 q->hh_flows_total_cnt++;
329
330 /* By returning without updating counters in q->hhf_arrays,
331 * we implicitly implement "shielding" (see Optimization O1).
332 */
333 return WDRR_BUCKET_FOR_HH;
334 }
335
336 /* Conservative update of HHF arrays (see Optimization O2). */
337 for (i = 0; i < HHF_ARRAYS_CNT; i++) {
338 if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
339 q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
340 }
341 return WDRR_BUCKET_FOR_NON_HH;
342}
343
344/* Removes one skb from head of bucket. */
345static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
346{
347 struct sk_buff *skb = bucket->head;
348
349 bucket->head = skb->next;
350 skb->next = NULL;
351 return skb;
352}
353
354/* Tail-adds skb to bucket. */
355static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
356{
357 if (bucket->head == NULL)
358 bucket->head = skb;
359 else
360 bucket->tail->next = skb;
361 bucket->tail = skb;
362 skb->next = NULL;
363}
364
365static unsigned int hhf_drop(struct Qdisc *sch)
366{
367 struct hhf_sched_data *q = qdisc_priv(sch);
368 struct wdrr_bucket *bucket;
369
370 /* Always try to drop from heavy-hitters first. */
371 bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
372 if (!bucket->head)
373 bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];
374
375 if (bucket->head) {
376 struct sk_buff *skb = dequeue_head(bucket);
377
378 sch->q.qlen--;
379 sch->qstats.drops++;
380 sch->qstats.backlog -= qdisc_pkt_len(skb);
381 kfree_skb(skb);
382 }
383
384 /* Return id of the bucket from which the packet was dropped. */
385 return bucket - q->buckets;
386}
387
388static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
389{
390 struct hhf_sched_data *q = qdisc_priv(sch);
391 enum wdrr_bucket_idx idx;
392 struct wdrr_bucket *bucket;
393
394 idx = hhf_classify(skb, sch);
395
396 bucket = &q->buckets[idx];
397 bucket_add(bucket, skb);
398 sch->qstats.backlog += qdisc_pkt_len(skb);
399
400 if (list_empty(&bucket->bucketchain)) {
401 unsigned int weight;
402
403 /* The logic of new_buckets vs. old_buckets is the same as
404 * new_flows vs. old_flows in the implementation of fq_codel,
405 * i.e., short bursts of non-HHs should have strict priority.
406 */
407 if (idx == WDRR_BUCKET_FOR_HH) {
408 /* Always move heavy-hitters to old bucket. */
409 weight = 1;
410 list_add_tail(&bucket->bucketchain, &q->old_buckets);
411 } else {
412 weight = q->hhf_non_hh_weight;
413 list_add_tail(&bucket->bucketchain, &q->new_buckets);
414 }
415 bucket->deficit = weight * q->quantum;
416 }
417 if (++sch->q.qlen < sch->limit)
418 return NET_XMIT_SUCCESS;
419
420 q->drop_overlimit++;
421 /* Return Congestion Notification only if we dropped a packet from this
422 * bucket.
423 */
424 if (hhf_drop(sch) == idx)
425 return NET_XMIT_CN;
426
427 /* As we dropped a packet, better let upper stack know this. */
428 qdisc_tree_decrease_qlen(sch, 1);
429 return NET_XMIT_SUCCESS;
430}
431
432static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
433{
434 struct hhf_sched_data *q = qdisc_priv(sch);
435 struct sk_buff *skb = NULL;
436 struct wdrr_bucket *bucket;
437 struct list_head *head;
438
439begin:
440 head = &q->new_buckets;
441 if (list_empty(head)) {
442 head = &q->old_buckets;
443 if (list_empty(head))
444 return NULL;
445 }
446 bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);
447
448 if (bucket->deficit <= 0) {
449 int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
450 1 : q->hhf_non_hh_weight;
451
452 bucket->deficit += weight * q->quantum;
453 list_move_tail(&bucket->bucketchain, &q->old_buckets);
454 goto begin;
455 }
456
457 if (bucket->head) {
458 skb = dequeue_head(bucket);
459 sch->q.qlen--;
460 sch->qstats.backlog -= qdisc_pkt_len(skb);
461 }
462
463 if (!skb) {
464 /* Force a pass through old_buckets to prevent starvation. */
465 if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
466 list_move_tail(&bucket->bucketchain, &q->old_buckets);
467 else
468 list_del_init(&bucket->bucketchain);
469 goto begin;
470 }
471 qdisc_bstats_update(sch, skb);
472 bucket->deficit -= qdisc_pkt_len(skb);
473
474 return skb;
475}
476
477static void hhf_reset(struct Qdisc *sch)
478{
479 struct sk_buff *skb;
480
481 while ((skb = hhf_dequeue(sch)) != NULL)
482 kfree_skb(skb);
483}
484
485static void *hhf_zalloc(size_t sz)
486{
487 void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
488
489 if (!ptr)
490 ptr = vzalloc(sz);
491
492 return ptr;
493}
494
495static void hhf_free(void *addr)
496{
497 if (addr) {
498 if (is_vmalloc_addr(addr))
499 vfree(addr);
500 else
501 kfree(addr);
502 }
503}
504
505static void hhf_destroy(struct Qdisc *sch)
506{
507 int i;
508 struct hhf_sched_data *q = qdisc_priv(sch);
509
510 for (i = 0; i < HHF_ARRAYS_CNT; i++) {
511 hhf_free(q->hhf_arrays[i]);
512 hhf_free(q->hhf_valid_bits[i]);
513 }
514
515 for (i = 0; i < HH_FLOWS_CNT; i++) {
516 struct hh_flow_state *flow, *next;
517 struct list_head *head = &q->hh_flows[i];
518
519 if (list_empty(head))
520 continue;
521 list_for_each_entry_safe(flow, next, head, flowchain) {
522 list_del(&flow->flowchain);
523 kfree(flow);
524 }
525 }
526 hhf_free(q->hh_flows);
527}
528
529static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
530 [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 },
531 [TCA_HHF_QUANTUM] = { .type = NLA_U32 },
532 [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
533 [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 },
534 [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 },
535 [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 },
536 [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 },
537};
538
539static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
540{
541 struct hhf_sched_data *q = qdisc_priv(sch);
542 struct nlattr *tb[TCA_HHF_MAX + 1];
543 unsigned int qlen;
544 int err;
545 u64 non_hh_quantum;
546 u32 new_quantum = q->quantum;
547 u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
548
549 if (!opt)
550 return -EINVAL;
551
552 err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy);
553 if (err < 0)
554 return err;
555
556 sch_tree_lock(sch);
557
558 if (tb[TCA_HHF_BACKLOG_LIMIT])
559 sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
560
561 if (tb[TCA_HHF_QUANTUM])
562 new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
563
564 if (tb[TCA_HHF_NON_HH_WEIGHT])
565 new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
566
567 non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
568 if (non_hh_quantum > INT_MAX)
569 return -EINVAL;
570 q->quantum = new_quantum;
571 q->hhf_non_hh_weight = new_hhf_non_hh_weight;
572
573 if (tb[TCA_HHF_HH_FLOWS_LIMIT])
574 q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
575
576 if (tb[TCA_HHF_RESET_TIMEOUT]) {
577 u32 ms = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
578
579 q->hhf_reset_timeout = msecs_to_jiffies(ms);
580 }
581
582 if (tb[TCA_HHF_ADMIT_BYTES])
583 q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
584
585 if (tb[TCA_HHF_EVICT_TIMEOUT]) {
586 u32 ms = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
587
588 q->hhf_evict_timeout = msecs_to_jiffies(ms);
589 }
590
591 qlen = sch->q.qlen;
592 while (sch->q.qlen > sch->limit) {
593 struct sk_buff *skb = hhf_dequeue(sch);
594
595 kfree_skb(skb);
596 }
597 qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
598
599 sch_tree_unlock(sch);
600 return 0;
601}
602
603static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
604{
605 struct hhf_sched_data *q = qdisc_priv(sch);
606 int i;
607
608 sch->limit = 1000;
609 q->quantum = psched_mtu(qdisc_dev(sch));
610 q->perturbation = net_random();
611 INIT_LIST_HEAD(&q->new_buckets);
612 INIT_LIST_HEAD(&q->old_buckets);
613
614 /* Configurable HHF parameters */
615 q->hhf_reset_timeout = HZ / 25; /* 40 ms */
616 q->hhf_admit_bytes = 131072; /* 128 KB */
617 q->hhf_evict_timeout = HZ; /* 1 sec */
618 q->hhf_non_hh_weight = 2;
619
620 if (opt) {
621 int err = hhf_change(sch, opt);
622
623 if (err)
624 return err;
625 }
626
627 if (!q->hh_flows) {
628 /* Initialize heavy-hitter flow table. */
629 q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
630 sizeof(struct list_head));
631 if (!q->hh_flows)
632 return -ENOMEM;
633 for (i = 0; i < HH_FLOWS_CNT; i++)
634 INIT_LIST_HEAD(&q->hh_flows[i]);
635
636 /* Cap max active HHs at twice len of hh_flows table. */
637 q->hh_flows_limit = 2 * HH_FLOWS_CNT;
638 q->hh_flows_overlimit = 0;
639 q->hh_flows_total_cnt = 0;
640 q->hh_flows_current_cnt = 0;
641
642 /* Initialize heavy-hitter filter arrays. */
643 for (i = 0; i < HHF_ARRAYS_CNT; i++) {
644 q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
645 sizeof(u32));
646 if (!q->hhf_arrays[i]) {
647 hhf_destroy(sch);
648 return -ENOMEM;
649 }
650 }
651 q->hhf_arrays_reset_timestamp = hhf_time_stamp();
652
653 /* Initialize valid bits of heavy-hitter filter arrays. */
654 for (i = 0; i < HHF_ARRAYS_CNT; i++) {
655 q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
656 BITS_PER_BYTE);
657 if (!q->hhf_valid_bits[i]) {
658 hhf_destroy(sch);
659 return -ENOMEM;
660 }
661 }
662
663 /* Initialize Weighted DRR buckets. */
664 for (i = 0; i < WDRR_BUCKET_CNT; i++) {
665 struct wdrr_bucket *bucket = q->buckets + i;
666
667 INIT_LIST_HEAD(&bucket->bucketchain);
668 }
669 }
670
671 return 0;
672}
673
674static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
675{
676 struct hhf_sched_data *q = qdisc_priv(sch);
677 struct nlattr *opts;
678
679 opts = nla_nest_start(skb, TCA_OPTIONS);
680 if (opts == NULL)
681 goto nla_put_failure;
682
683 if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
684 nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
685 nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
686 nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
687 jiffies_to_msecs(q->hhf_reset_timeout)) ||
688 nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
689 nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
690 jiffies_to_msecs(q->hhf_evict_timeout)) ||
691 nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
692 goto nla_put_failure;
693
694 nla_nest_end(skb, opts);
695 return skb->len;
696
697nla_put_failure:
698 return -1;
699}
700
701static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
702{
703 struct hhf_sched_data *q = qdisc_priv(sch);
704 struct tc_hhf_xstats st = {
705 .drop_overlimit = q->drop_overlimit,
706 .hh_overlimit = q->hh_flows_overlimit,
707 .hh_tot_count = q->hh_flows_total_cnt,
708 .hh_cur_count = q->hh_flows_current_cnt,
709 };
710
711 return gnet_stats_copy_app(d, &st, sizeof(st));
712}
713
714struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
715 .id = "hhf",
716 .priv_size = sizeof(struct hhf_sched_data),
717
718 .enqueue = hhf_enqueue,
719 .dequeue = hhf_dequeue,
720 .peek = qdisc_peek_dequeued,
721 .drop = hhf_drop,
722 .init = hhf_init,
723 .reset = hhf_reset,
724 .destroy = hhf_destroy,
725 .change = hhf_change,
726 .dump = hhf_dump,
727 .dump_stats = hhf_dump_stats,
728 .owner = THIS_MODULE,
729};
730EXPORT_SYMBOL(hhf_qdisc_ops);
731
732static int __init hhf_module_init(void)
733{
734 return register_qdisc(&hhf_qdisc_ops);
735}
736
737static void __exit hhf_module_exit(void)
738{
739 unregister_qdisc(&hhf_qdisc_ops);
740}
741
742module_init(hhf_module_init)
743module_exit(hhf_module_exit)
744MODULE_AUTHOR("Terry Lam");
745MODULE_AUTHOR("Nandita Dukkipati");
746MODULE_LICENSE("GPL");