aboutsummaryrefslogtreecommitdiffstats
path: root/include/net/codel.h
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2012-05-10 03:51:25 -0400
committerDavid S. Miller <davem@davemloft.net>2012-05-10 23:35:02 -0400
commit76e3cc126bb223013a6b9a0e2a51238d1ef2e409 (patch)
tree37d1c2a3c4f4ebf68e9849262c7d75115652313f /include/net/codel.h
parent2dd875ff31ac7ff42d6fc7d7f78ac6c0635439f5 (diff)
codel: Controlled Delay AQM
An implementation of CoDel AQM, from Kathleen Nichols and Van Jacobson. http://queue.acm.org/detail.cfm?id=2209336 This AQM's main input is no longer the queue size in bytes or packets, but the time packets stay in the (FIFO) queue. As we don't have infinite memory, we can still drop packets in enqueue() in case of massive load, but the main mechanism of CoDel is to drop packets in dequeue(), using a control law based on two simple parameters : target : target sojourn time (default 5ms) interval : width of moving time window (default 100ms) Based on initial work from Dave Taht. Refactored to help future codel inclusion as a plugin for other linux qdisc (FQ_CODEL, ...), like RED. include/net/codel.h contains the codel algorithm, kept as close as possible to Kathleen's reference. net/sched/sch_codel.c contains the linux qdisc specific glue. Separate structures permit a memory efficient implementation of fq_codel (to be sent as a separate work) : Each flow has its own struct codel_vars. Timestamps are taken at enqueue() time with 1024 ns precision, allowing a range of 2199 seconds in queue, and support for 100Gb links. iproute2 uses usec as base unit. Selected packets are dropped, unless ECN is enabled and packets can get ECN mark instead. Tested from 2Mb to 10Gb speeds with no particular problems, on ixgbe and tg3 drivers (BQL enabled). Usage: tc qdisc ... codel [ limit PACKETS ] [ target TIME ] [ interval TIME ] [ ecn ] qdisc codel 10: parent 1:1 limit 2000p target 3.0ms interval 60.0ms ecn Sent 13347099587 bytes 8815805 pkt (dropped 0, overlimits 0 requeues 0) rate 202365Kbit 16708pps backlog 113550b 75p requeues 0 count 116 lastcount 98 ldelay 4.3ms dropping drop_next 816us maxpacket 1514 ecn_mark 84399 drop_overlimit 0 CoDel must be seen as a base module, and should be used keeping in mind there is still a FIFO queue. So a typical setup will probably need a hierarchy of several qdiscs and packet classifiers to be able to meet whatever constraints a user might have. 
One possible example would be to use fq_codel, which combines Fair Queueing and CoDel, in replacement of sfq / sfq_red. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Dave Taht <dave.taht@bufferbloat.net> Cc: Kathleen Nichols <nichols@pollere.com> Cc: Van Jacobson <van@pollere.net> Cc: Tom Herbert <therbert@google.com> Cc: Matt Mathis <mattmathis@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Stephen Hemminger <shemminger@vyatta.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net/codel.h')
-rw-r--r--include/net/codel.h332
1 files changed, 332 insertions, 0 deletions
diff --git a/include/net/codel.h b/include/net/codel.h
new file mode 100644
index 000000000000..bce2cefa8c94
--- /dev/null
+++ b/include/net/codel.h
@@ -0,0 +1,332 @@
1#ifndef __NET_SCHED_CODEL_H
2#define __NET_SCHED_CODEL_H
3
4/*
5 * Codel - The Controlled-Delay Active Queue Management algorithm
6 *
7 * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
8 * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
9 * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
10 * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions, and the following disclaimer,
17 * without modification.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. The names of the authors may not be used to endorse or promote products
22 * derived from this software without specific prior written permission.
23 *
24 * Alternatively, provided that this notice is retained in full, this
25 * software may be distributed under the terms of the GNU General
26 * Public License ("GPL") version 2, in which case the provisions of the
27 * GPL apply INSTEAD OF those given above.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
30 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
32 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
33 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
34 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
35 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
36 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
37 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
38 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
39 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
40 * DAMAGE.
41 *
42 */
43
44#include <linux/types.h>
45#include <linux/ktime.h>
46#include <linux/skbuff.h>
47#include <net/pkt_sched.h>
48#include <net/inet_ecn.h>
49
50/* Controlling Queue Delay (CoDel) algorithm
51 * =========================================
52 * Source : Kathleen Nichols and Van Jacobson
53 * http://queue.acm.org/detail.cfm?id=2209336
54 *
55 * Implemented on linux by Dave Taht and Eric Dumazet
56 */
57
58
/* CoDel uses a 1024 nsec clock, encoded in u32.
 * One tick is (1 << CODEL_SHIFT) nanoseconds.  Because timestamps are
 * ordered with signed compares of their difference, only half of the
 * u32 space is usable: 2^31 ticks * 1024 ns gives a range of about
 * 2199 seconds.
 *
 * codel_time_t  - a point in time (or a duration) in CoDel ticks
 * codel_tdiff_t - signed counterpart of codel_time_t
 */
typedef u32 codel_time_t;
typedef s32 codel_tdiff_t;
/* Number of bits dropped from a nanosecond value to get CoDel ticks
 * (one tick is 1 << CODEL_SHIFT = 1024 ns).
 */
#define CODEL_SHIFT 10

/* Convert a duration in milliseconds to CoDel ticks.
 * The argument is parenthesized so that expressions such as
 * MS2TIME(base + extra) are converted as a whole.
 */
#define MS2TIME(a) (((a) * NSEC_PER_MSEC) >> CODEL_SHIFT)
66
67static inline codel_time_t codel_get_time(void)
68{
69 u64 ns = ktime_to_ns(ktime_get());
70
71 return ns >> CODEL_SHIFT;
72}
73
/* Wraparound-safe ordering of two CoDel timestamps.
 *
 * The difference is computed in u32, where arithmetic is defined modulo
 * 2^32, and only the result is reinterpreted as signed.  Subtracting two
 * values that were each cast to s32 first could overflow the signed
 * type, which is undefined behaviour in C.
 *
 * codel_time_after(a, b)     - true if a is later than b
 * codel_time_after_eq(a, b)  - true if a is later than or equal to b
 * codel_time_before(a, b)    - true if a is earlier than b
 * codel_time_before_eq(a, b) - true if a is earlier than or equal to b
 */
#define codel_time_after(a, b)		((s32)((u32)(a) - (u32)(b)) > 0)
#define codel_time_after_eq(a, b)	((s32)((u32)(a) - (u32)(b)) >= 0)
#define codel_time_before(a, b)		((s32)((u32)(a) - (u32)(b)) < 0)
#define codel_time_before_eq(a, b)	((s32)((u32)(a) - (u32)(b)) <= 0)
78
/* Qdiscs using codel plugin must use codel_skb_cb in their own cb[].
 * This is the per-packet state CoDel keeps inside the skb's qdisc
 * control block.
 */
struct codel_skb_cb {
	codel_time_t enqueue_time;	/* written by codel_set_enqueue_time(), in CoDel ticks */
};
83
84static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb)
85{
86 qdisc_cb_private_validate(skb, sizeof(struct codel_skb_cb));
87 return (struct codel_skb_cb *)qdisc_skb_cb(skb)->data;
88}
89
90static codel_time_t codel_get_enqueue_time(const struct sk_buff *skb)
91{
92 return get_codel_cb(skb)->enqueue_time;
93}
94
95static void codel_set_enqueue_time(struct sk_buff *skb)
96{
97 get_codel_cb(skb)->enqueue_time = codel_get_time();
98}
99
100static inline u32 codel_time_to_us(codel_time_t val)
101{
102 u64 valns = ((u64)val << CODEL_SHIFT);
103
104 do_div(valns, NSEC_PER_USEC);
105 return (u32)valns;
106}
107
/**
 * struct codel_params - contains codel parameters
 * @target:	target sojourn time of packets in the queue, in CoDel ticks
 *		(codel_params_init() sets MS2TIME(5))
 * @interval:	width of moving time window, in CoDel ticks
 *		(codel_params_init() sets MS2TIME(100))
 * @ecn:	is Explicit Congestion Notification enabled; when true,
 *		codel_dequeue() tries to mark packets before dropping them
 */
struct codel_params {
	codel_time_t target;
	codel_time_t interval;
	bool ecn;
};
119
/**
 * struct codel_vars - contains codel variables
 * @count:		how many drops we've done since the last time we
 *			entered dropping state
 * @lastcount:		count at entry to dropping state
 * @dropping:		set to true if in dropping state
 * @first_above_time:	when we went (or will go) continuously above target
 *			for interval; 0 means "not currently above target"
 * @drop_next:		time to drop next packet, or when we dropped last
 * @ldelay:		sojourn time of last dequeued packet
 *
 * All time fields are expressed in CoDel ticks (codel_time_t).
 */
struct codel_vars {
	u32 count;
	u32 lastcount;
	bool dropping;
	codel_time_t first_above_time;
	codel_time_t drop_next;
	codel_time_t ldelay;
};
139
/**
 * struct codel_stats - contains codel shared variables and stats
 * @maxpacket:	largest packet we've seen so far
 * @drop_count:	temp count of dropped packets in dequeue()
 * @ecn_mark:	number of packets we ECN marked instead of dropping
 */
struct codel_stats {
	u32 maxpacket;
	u32 drop_count;
	u32 ecn_mark;
};
151
152static void codel_params_init(struct codel_params *params)
153{
154 params->interval = MS2TIME(100);
155 params->target = MS2TIME(5);
156 params->ecn = false;
157}
158
159static void codel_vars_init(struct codel_vars *vars)
160{
161 vars->drop_next = 0;
162 vars->first_above_time = 0;
163 vars->dropping = false; /* exit dropping state */
164 vars->count = 0;
165 vars->lastcount = 0;
166}
167
168static void codel_stats_init(struct codel_stats *stats)
169{
170 stats->maxpacket = 256;
171}
172
173/* return interval/sqrt(x) with good precision
174 * relies on int_sqrt(unsigned long x) kernel implementation
175 */
176static u32 codel_inv_sqrt(u32 _interval, u32 _x)
177{
178 u64 interval = _interval;
179 unsigned long x = _x;
180
181 /* Scale operands for max precision */
182
183#if BITS_PER_LONG == 64
184 x <<= 32; /* On 64bit arches, we can prescale x by 32bits */
185 interval <<= 16;
186#endif
187
188 while (x < (1UL << (BITS_PER_LONG - 2))) {
189 x <<= 2;
190 interval <<= 1;
191 }
192 do_div(interval, int_sqrt(x));
193 return (u32)interval;
194}
195
196static codel_time_t codel_control_law(codel_time_t t,
197 codel_time_t interval,
198 u32 count)
199{
200 return t + codel_inv_sqrt(interval, count);
201}
202
203
204static bool codel_should_drop(struct sk_buff *skb,
205 unsigned int *backlog,
206 struct codel_vars *vars,
207 struct codel_params *params,
208 struct codel_stats *stats,
209 codel_time_t now)
210{
211 bool ok_to_drop;
212
213 if (!skb) {
214 vars->first_above_time = 0;
215 return false;
216 }
217
218 vars->ldelay = now - codel_get_enqueue_time(skb);
219 *backlog -= qdisc_pkt_len(skb);
220
221 if (unlikely(qdisc_pkt_len(skb) > stats->maxpacket))
222 stats->maxpacket = qdisc_pkt_len(skb);
223
224 if (codel_time_before(vars->ldelay, params->target) ||
225 *backlog <= stats->maxpacket) {
226 /* went below - stay below for at least interval */
227 vars->first_above_time = 0;
228 return false;
229 }
230 ok_to_drop = false;
231 if (vars->first_above_time == 0) {
232 /* just went above from below. If we stay above
233 * for at least interval we'll say it's ok to drop
234 */
235 vars->first_above_time = now + params->interval;
236 } else if (codel_time_after(now, vars->first_above_time)) {
237 ok_to_drop = true;
238 }
239 return ok_to_drop;
240}
241
/* Callback supplied by the qdisc to codel_dequeue() to pull the next
 * packet from its queue.  It receives the same @vars and @sch that were
 * passed to codel_dequeue(); a NULL return is treated as "queue empty".
 */
typedef struct sk_buff * (*codel_skb_dequeue_t)(struct codel_vars *vars,
						struct Qdisc *sch);
244
/* Dequeue one packet, applying the CoDel drop/mark state machine.
 *
 * @sch:          qdisc the packets belong to; passed to @dequeue_func
 *                and to qdisc_drop()
 * @params:       CoDel parameters (target, interval, ecn)
 * @vars:         CoDel state, updated in place
 * @stats:        shared stats; drop_count and ecn_mark are incremented
 * @dequeue_func: callback returning the next packet, or NULL when empty
 * @backlog:      queue backlog in bytes, reduced by codel_should_drop()
 *                for every packet taken from the queue
 *
 * Returns the packet to deliver, or NULL if the queue is (or became,
 * after dropping) empty.  Packets chosen by the control law are dropped
 * with qdisc_drop(), unless @params->ecn is set and INET_ECN_set_ce()
 * succeeds, in which case the packet is marked and delivered instead.
 */
static struct sk_buff *codel_dequeue(struct Qdisc *sch,
				     struct codel_params *params,
				     struct codel_vars *vars,
				     struct codel_stats *stats,
				     codel_skb_dequeue_t dequeue_func,
				     u32 *backlog)
{
	struct sk_buff *skb = dequeue_func(vars, sch);
	codel_time_t now;
	bool drop;

	/* Nothing queued: leave dropping state and report an empty queue */
	if (!skb) {
		vars->dropping = false;
		return skb;
	}
	now = codel_get_time();
	drop = codel_should_drop(skb, backlog, vars, params, stats, now);
	if (vars->dropping) {
		if (!drop) {
			/* sojourn time below target - leave dropping state */
			vars->dropping = false;
		} else if (codel_time_after_eq(now, vars->drop_next)) {
			/* It's time for the next drop. Drop the current
			 * packet and dequeue the next. The dequeue might
			 * take us out of dropping state.
			 * If not, schedule the next drop.
			 * A large backlog might result in drop rates so high
			 * that the next drop should happen now,
			 * hence the while loop.
			 */
			while (vars->dropping &&
			       codel_time_after_eq(now, vars->drop_next)) {
				if (++vars->count == 0) /* avoid zero divides */
					vars->count = ~0U;
				/* Marking succeeded: schedule the next drop
				 * and deliver this packet instead of
				 * dropping it.
				 */
				if (params->ecn && INET_ECN_set_ce(skb)) {
					stats->ecn_mark++;
					vars->drop_next =
						codel_control_law(vars->drop_next,
								  params->interval,
								  vars->count);
					goto end;
				}
				qdisc_drop(skb, sch);
				stats->drop_count++;
				/* skb may become NULL here; codel_should_drop()
				 * returns false for NULL, which ends the loop.
				 */
				skb = dequeue_func(vars, sch);
				if (!codel_should_drop(skb, backlog,
						       vars, params, stats, now)) {
					/* leave dropping state */
					vars->dropping = false;
				} else {
					/* and schedule the next drop */
					vars->drop_next =
						codel_control_law(vars->drop_next,
								  params->interval,
								  vars->count);
				}
			}
		}
	} else if (drop) {
		/* Not yet in dropping state but the packet qualifies:
		 * mark or drop it, then enter dropping state.
		 */
		if (params->ecn && INET_ECN_set_ce(skb)) {
			stats->ecn_mark++;
		} else {
			qdisc_drop(skb, sch);
			stats->drop_count++;

			skb = dequeue_func(vars, sch);
			/* Called for its updates to *backlog and vars; the
			 * value stored in drop is not read again below.
			 */
			drop = codel_should_drop(skb, backlog, vars, params,
						 stats, now);
		}
		vars->dropping = true;
		/* if min went above target close to when we last went below it
		 * assume that the drop rate that controlled the queue on the
		 * last cycle is a good starting point to control it now.
		 */
		if (codel_time_before(now - vars->drop_next,
				      16 * params->interval)) {
			/* "| 1" keeps the resumed count non-zero */
			vars->count = (vars->count - vars->lastcount) | 1;
		} else {
			vars->count = 1;
		}
		vars->lastcount = vars->count;
		vars->drop_next = codel_control_law(now, params->interval,
						    vars->count);
	}
end:
	return skb;
}
332#endif