author		Eric Dumazet <edumazet@google.com>	2012-05-10 03:51:25 -0400
committer	David S. Miller <davem@davemloft.net>	2012-05-10 23:35:02 -0400
commit		76e3cc126bb223013a6b9a0e2a51238d1ef2e409
tree		37d1c2a3c4f4ebf68e9849262c7d75115652313f /net/sched
parent		2dd875ff31ac7ff42d6fc7d7f78ac6c0635439f5
codel: Controlled Delay AQM
An implementation of CoDel AQM, from Kathleen Nichols and Van Jacobson.

http://queue.acm.org/detail.cfm?id=2209336

This AQM's main input is no longer the queue size in bytes or packets, but
the time packets spend in the (FIFO) queue.

As we don't have infinite memory, we still can drop packets in enqueue()
under massive load, but the point of CoDel is to drop packets in dequeue(),
using a control law based on two simple parameters:

target   : target sojourn time (default 5 ms)
interval : width of the moving time window (default 100 ms)

Based on initial work from Dave Taht.

Refactored to allow future inclusion of codel as a plugin for other Linux
qdiscs (FQ_CODEL, ...), much like RED.

include/net/codel.h contains the codel algorithm, kept as close as possible
to Kathleen's reference; net/sched/sch_codel.c contains the Linux
qdisc-specific glue.

Separate structures permit a memory-efficient implementation of fq_codel
(to be sent as separate work): each flow has its own struct codel_vars.

Timestamps are taken at enqueue() time with 1024 ns precision, allowing a
range of 2199 seconds in queue (2^31 * 1024 ns) and support for 100Gb
links. iproute2 uses usec as its base unit.

Selected packets are dropped, unless ECN is enabled, in which case they can
be ECN-marked instead.

Tested from 2Mb to 10Gb speeds with no particular problems, on ixgbe and
tg3 drivers (BQL enabled).

Usage:

 tc qdisc ... codel [ limit PACKETS ] [ target TIME ] [ interval TIME ] [ ecn ]

 qdisc codel 10: parent 1:1 limit 2000p target 3.0ms interval 60.0ms ecn
  Sent 13347099587 bytes 8815805 pkt (dropped 0, overlimits 0 requeues 0)
  rate 202365Kbit 16708pps backlog 113550b 75p requeues 0
   count 116 lastcount 98 ldelay 4.3ms dropping drop_next 816us
   maxpacket 1514 ecn_mark 84399 drop_overlimit 0

CoDel must be seen as a base module, and should be used keeping in mind
that there is still a FIFO queue underneath. A typical setup will therefore
probably need a hierarchy of several qdiscs and packet classifiers to meet
whatever constraints a user might have. One possible example would be to
use fq_codel, which combines Fair Queueing and CoDel, as a replacement for
sfq / sfq_red.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Dave Taht <dave.taht@bufferbloat.net>
Cc: Kathleen Nichols <nichols@pollere.com>
Cc: Van Jacobson <van@pollere.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
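The control law described above can be summarized in a short,
self-contained sketch. This is not the kernel code: the in-tree
implementation lives in include/net/codel.h, keeps timestamps as 32-bit
values in 1024 ns units (CODEL_SHIFT) and replaces the square root with a
fixed-point Newton iteration. The names below (cdl_*, CDL_*) are invented
for this illustration, and refinements such as reusing the drop count
between dropping episodes are omitted.

/*
 * Simplified CoDel drop decision -- illustration only (userspace C,
 * link with -lm). Real code: include/net/codel.h, net/sched/sch_codel.c.
 */
#include <stdbool.h>
#include <stdint.h>
#include <math.h>

#define CDL_TARGET_NS   (5ULL * 1000 * 1000)    /* target sojourn time: 5 ms  */
#define CDL_INTERVAL_NS (100ULL * 1000 * 1000)  /* moving time window: 100 ms */

struct cdl_vars {
	bool     dropping;      /* currently in the dropping state?         */
	uint32_t count;         /* drops since entering the dropping state  */
	uint64_t first_above;   /* when sojourn time first stayed > target  */
	uint64_t drop_next;     /* time of the next scheduled drop          */
};

/* Control law: the longer we have been dropping, the closer the drops get. */
static uint64_t cdl_next_drop(uint64_t now_ns, uint32_t count)
{
	return now_ns + (uint64_t)(CDL_INTERVAL_NS / sqrt((double)count));
}

/*
 * Called for each dequeued packet with its sojourn time
 * (now - enqueue timestamp). Returns true if the packet should be
 * dropped (or ECN-marked when ECN is enabled).
 */
static bool cdl_should_drop(struct cdl_vars *v, uint64_t now_ns,
			    uint64_t sojourn_ns)
{
	if (sojourn_ns < CDL_TARGET_NS) {
		/* Below target: queue is fine, leave the dropping state. */
		v->first_above = 0;
		v->dropping = false;
		return false;
	}
	if (!v->first_above) {
		/* Above target: arm a grace period of one interval. */
		v->first_above = now_ns + CDL_INTERVAL_NS;
		return false;
	}
	if (!v->dropping && now_ns >= v->first_above) {
		/* Above target for a full interval: start dropping. */
		v->dropping = true;
		v->count = 1;
		v->drop_next = cdl_next_drop(now_ns, v->count);
		return true;
	}
	if (v->dropping && now_ns >= v->drop_next) {
		/* Still too slow: drop again, sooner each time. */
		v->count++;
		v->drop_next = cdl_next_drop(now_ns, v->count);
		return true;
	}
	return false;
}

int main(void)
{
	struct cdl_vars v = { 0 };

	/* A packet that waited 6 ms: above target, but still inside the
	 * initial grace interval, so it is not dropped yet. */
	return cdl_should_drop(&v, 150000000ULL, 6000000ULL) ? 1 : 0;
}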
Diffstat (limited to 'net/sched')
-rw-r--r--	net/sched/Kconfig	 11
-rw-r--r--	net/sched/Makefile	  1
-rw-r--r--	net/sched/sch_codel.c	275
3 files changed, 287 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 75b58f81d53d..fadd2522053d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -250,6 +250,17 @@ config NET_SCH_QFQ
 
 	  If unsure, say N.
 
+config NET_SCH_CODEL
+	tristate "Controlled Delay AQM (CODEL)"
+	help
+	  Say Y here if you want to use the Controlled Delay (CODEL)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_codel.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
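When this option is set to m (CONFIG_NET_SCH_CODEL=m), the scheduler is
built as the sch_codel module named in the help text; building it in with
=y also works. Either way, the qdisc is then attached with the
"tc qdisc ... codel" command shown in the commit message above.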
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8cdf4e2b51d3..30fab03b8516 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 000000000000..b4a1a81e757e
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,275 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *
+ * Implemented on linux by :
+ * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
+ * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/codel.h>
+
+
+#define DEFAULT_CODEL_LIMIT 1000
+
+struct codel_sched_data {
+	struct codel_params	params;
+	struct codel_vars	vars;
+	struct codel_stats	stats;
+	u32			drop_overlimit;
+};
+
+/* This is the specific function called from codel_dequeue()
+ * to dequeue a packet from queue. Note: backlog is handled in
+ * codel, we don't need to reduce it here.
+ */
+static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+{
+	struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+	prefetch(&skb->end); /* we'll need skb_shinfo() */
+	return skb;
+}
+
+static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats,
+			    dequeue, &sch->qstats.backlog);
+	/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
+	 * or HTB crashes. Defer it for next round.
+	 */
+	if (q->stats.drop_count && sch->q.qlen) {
+		qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
+		q->stats.drop_count = 0;
+	}
+	if (skb)
+		qdisc_bstats_update(sch, skb);
+	return skb;
+}
+
+static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct codel_sched_data *q;
+
+	if (likely(qdisc_qlen(sch) < sch->limit)) {
+		codel_set_enqueue_time(skb);
+		return qdisc_enqueue_tail(skb, sch);
+	}
+	q = qdisc_priv(sch);
+	q->drop_overlimit++;
+	return qdisc_drop(skb, sch);
+}
+
+static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
+	[TCA_CODEL_TARGET]	= { .type = NLA_U32 },
+	[TCA_CODEL_LIMIT]	= { .type = NLA_U32 },
+	[TCA_CODEL_INTERVAL]	= { .type = NLA_U32 },
+	[TCA_CODEL_ECN]		= { .type = NLA_U32 },
+};
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CODEL_MAX + 1];
+	unsigned int qlen;
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+
+	if (tb[TCA_CODEL_TARGET]) {
+		u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
+
+		q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
+	if (tb[TCA_CODEL_INTERVAL]) {
+		u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
+
+		q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
+	if (tb[TCA_CODEL_LIMIT])
+		sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
+
+	if (tb[TCA_CODEL_ECN])
+		q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]);
+
+	qlen = sch->q.qlen;
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_drop(skb, sch);
+	}
+	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	sch->limit = DEFAULT_CODEL_LIMIT;
+
+	codel_params_init(&q->params);
+	codel_vars_init(&q->vars);
+	codel_stats_init(&q->stats);
+
+	if (opt) {
+		int err = codel_change(sch, opt);
+
+		if (err)
+			return err;
+	}
+
+	if (sch->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+	return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_CODEL_TARGET,
+			codel_time_to_us(q->params.target)) ||
+	    nla_put_u32(skb, TCA_CODEL_LIMIT,
+			sch->limit) ||
+	    nla_put_u32(skb, TCA_CODEL_INTERVAL,
+			codel_time_to_us(q->params.interval)) ||
+	    nla_put_u32(skb, TCA_CODEL_ECN,
+			q->params.ecn))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -1;
+}
+
+static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	const struct codel_sched_data *q = qdisc_priv(sch);
+	struct tc_codel_xstats st = {
+		.maxpacket	= q->stats.maxpacket,
+		.count		= q->vars.count,
+		.lastcount	= q->vars.lastcount,
+		.drop_overlimit = q->drop_overlimit,
+		.ldelay		= codel_time_to_us(q->vars.ldelay),
+		.dropping	= q->vars.dropping,
+		.ecn_mark	= q->stats.ecn_mark,
+	};
+
+	if (q->vars.dropping) {
+		codel_tdiff_t delta = q->vars.drop_next - codel_get_time();
+
+		if (delta >= 0)
+			st.drop_next = codel_time_to_us(delta);
+		else
+			st.drop_next = -codel_time_to_us(-delta);
+	}
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+	struct codel_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset_queue(sch);
+	codel_vars_init(&q->vars);
+}
+
+static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+	.id		=	"codel",
+	.priv_size	=	sizeof(struct codel_sched_data),
+
+	.enqueue	=	codel_qdisc_enqueue,
+	.dequeue	=	codel_qdisc_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	codel_init,
+	.reset		=	codel_reset,
+	.change		=	codel_change,
+	.dump		=	codel_dump,
+	.dump_stats	=	codel_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init codel_module_init(void)
+{
+	return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+	unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+
+MODULE_DESCRIPTION("Controlled Delay queue discipline");
+MODULE_AUTHOR("Dave Taht");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("Dual BSD/GPL");