aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJohn Fastabend <john.r.fastabend@intel.com>2011-01-17 03:06:09 -0500
committerDavid S. Miller <davem@davemloft.net>2011-01-20 02:31:11 -0500
commitb8970f0bfc78103cb74c66055de7379b15097840 (patch)
treea85578cddfd506e049af0c78a56dd0ee342fc93b
parent4f57c087de9b46182545676d2c594120a20f2e58 (diff)
net_sched: implement a root container qdisc sch_mqprio
This implements a mqprio queueing discipline that by default creates a pfifo_fast qdisc per tx queue and provides the needed configuration interface. Using the mqprio qdisc the number of tcs currently in use along with the range of queues allotted to each class can be configured. By default skbs are mapped to traffic classes using the skb priority. This mapping is configurable. Configurable parameters, struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_BITMASK + 1]; __u8 hw; __u16 count[TC_MAX_QUEUE]; __u16 offset[TC_MAX_QUEUE]; }; Here the count/offset pairing gives the queue alignment and the prio_tc_map gives the mapping from skb->priority to tc. The hw bit determines if the hardware should configure the count and offset values. If the hardware bit is set then the operation will fail if the hardware does not implement the ndo_setup_tc operation. This is to avoid undetermined states where the hardware may or may not control the queue mapping. Also minimal bounds checking is done on the count/offset to verify a queue does not exceed num_tx_queues and that queue ranges do not overlap. Otherwise it is left to user policy or hardware configuration to create useful mappings. It is expected that hardware QOS schemes can be implemented by creating appropriate mappings of queues in ndo_setup_tc(). One expected use case is drivers will use the ndo_setup_tc to map queue ranges onto 802.1Q traffic classes. This provides a generic mechanism to map network traffic onto these traffic classes and removes the need for lower layer drivers to know specifics about traffic types. Signed-off-by: John Fastabend <john.r.fastabend@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/pkt_sched.h12
-rw-r--r--net/sched/Kconfig12
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/sch_generic.c4
-rw-r--r--net/sched/sch_mqprio.c417
5 files changed, 446 insertions, 0 deletions
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 2cfa4bc8dea6..776cd93d5f7b 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -481,4 +481,16 @@ struct tc_drr_stats {
481 __u32 deficit; 481 __u32 deficit;
482}; 482};
483 483
/* MQPRIO */
#define TC_QOPT_BITMASK 15
#define TC_QOPT_MAX_QUEUE 16

/* Userspace configuration for the mqprio qdisc, passed via TCA_OPTIONS.
 * count/offset describe the contiguous tx-queue range of each traffic
 * class; prio_tc_map maps skb->priority (masked by TC_QOPT_BITMASK) to
 * a traffic class.
 */
struct tc_mqprio_qopt {
	__u8 num_tc;				/* number of traffic classes in use */
	__u8 prio_tc_map[TC_QOPT_BITMASK + 1];	/* skb priority -> tc index */
	__u8 hw;				/* non-zero: hardware owns count/offset */
	__u16 count[TC_QOPT_MAX_QUEUE];		/* queues in each tc */
	__u16 offset[TC_QOPT_MAX_QUEUE];	/* first queue of each tc */
};
495
484#endif 496#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index f04d4a484d53..73431d4aa6ef 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -205,6 +205,18 @@ config NET_SCH_DRR
205 205
206 If unsure, say N. 206 If unsure, say N.
207 207
208config NET_SCH_MQPRIO
209 tristate "Multi-queue priority scheduler (MQPRIO)"
210 help
211 Say Y here if you want to use the Multi-queue Priority scheduler.
212 This scheduler allows QOS to be offloaded on NICs that have support
213 for offloading QOS schedulers.
214
215 To compile this driver as a module, choose M here: the module will
216 be called sch_mqprio.
217
218 If unsure, say N.
219
208config NET_SCH_INGRESS 220config NET_SCH_INGRESS
209 tristate "Ingress Qdisc" 221 tristate "Ingress Qdisc"
210 depends on NET_CLS_ACT 222 depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5dba6304..26ce681a2c60 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
32obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o 32obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
33obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o 33obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
34obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o 34obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
35obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
35obj-$(CONFIG_NET_CLS_U32) += cls_u32.o 36obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
36obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o 37obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
37obj-$(CONFIG_NET_CLS_FW) += cls_fw.o 38obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34dc598440a2..723b27849a50 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -540,6 +540,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
540 .dump = pfifo_fast_dump, 540 .dump = pfifo_fast_dump,
541 .owner = THIS_MODULE, 541 .owner = THIS_MODULE,
542}; 542};
543EXPORT_SYMBOL(pfifo_fast_ops);
543 544
544struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, 545struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
545 struct Qdisc_ops *ops) 546 struct Qdisc_ops *ops)
@@ -674,6 +675,7 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
674 675
675 return oqdisc; 676 return oqdisc;
676} 677}
678EXPORT_SYMBOL(dev_graft_qdisc);
677 679
678static void attach_one_default_qdisc(struct net_device *dev, 680static void attach_one_default_qdisc(struct net_device *dev,
679 struct netdev_queue *dev_queue, 681 struct netdev_queue *dev_queue,
@@ -761,6 +763,7 @@ void dev_activate(struct net_device *dev)
761 dev_watchdog_up(dev); 763 dev_watchdog_up(dev);
762 } 764 }
763} 765}
766EXPORT_SYMBOL(dev_activate);
764 767
765static void dev_deactivate_queue(struct net_device *dev, 768static void dev_deactivate_queue(struct net_device *dev,
766 struct netdev_queue *dev_queue, 769 struct netdev_queue *dev_queue,
@@ -840,6 +843,7 @@ void dev_deactivate(struct net_device *dev)
840 list_add(&dev->unreg_list, &single); 843 list_add(&dev->unreg_list, &single);
841 dev_deactivate_many(&single); 844 dev_deactivate_many(&single);
842} 845}
846EXPORT_SYMBOL(dev_deactivate);
843 847
844static void dev_init_scheduler_queue(struct net_device *dev, 848static void dev_init_scheduler_queue(struct net_device *dev,
845 struct netdev_queue *dev_queue, 849 struct netdev_queue *dev_queue,
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 000000000000..8620c65f480a
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,417 @@
1/*
2 * net/sched/sch_mqprio.c
3 *
4 * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation.
9 */
10
11#include <linux/types.h>
12#include <linux/slab.h>
13#include <linux/kernel.h>
14#include <linux/string.h>
15#include <linux/errno.h>
16#include <linux/skbuff.h>
17#include <net/netlink.h>
18#include <net/pkt_sched.h>
19#include <net/sch_generic.h>
20
/* Per-qdisc private state for mqprio */
struct mqprio_sched {
	struct Qdisc	**qdiscs;	/* one child qdisc per tx queue;
					 * freed and NULLed once attached */
	int		hw_owned;	/* set when the driver's ndo_setup_tc
					 * owns the tc-to-queue mapping */
};
25
26static void mqprio_destroy(struct Qdisc *sch)
27{
28 struct net_device *dev = qdisc_dev(sch);
29 struct mqprio_sched *priv = qdisc_priv(sch);
30 unsigned int ntx;
31
32 if (!priv->qdiscs)
33 return;
34
35 for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
36 qdisc_destroy(priv->qdiscs[ntx]);
37
38 if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
39 dev->netdev_ops->ndo_setup_tc(dev, 0);
40 else
41 netdev_set_num_tc(dev, 0);
42
43 kfree(priv->qdiscs);
44}
45
46static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
47{
48 int i, j;
49
50 /* Verify num_tc is not out of max range */
51 if (qopt->num_tc > TC_MAX_QUEUE)
52 return -EINVAL;
53
54 /* Verify priority mapping uses valid tcs */
55 for (i = 0; i < TC_BITMASK + 1; i++) {
56 if (qopt->prio_tc_map[i] >= qopt->num_tc)
57 return -EINVAL;
58 }
59
60 /* net_device does not support requested operation */
61 if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
62 return -EINVAL;
63
64 /* if hw owned qcount and qoffset are taken from LLD so
65 * no reason to verify them here
66 */
67 if (qopt->hw)
68 return 0;
69
70 for (i = 0; i < qopt->num_tc; i++) {
71 unsigned int last = qopt->offset[i] + qopt->count[i];
72
73 /* Verify the queue count is in tx range being equal to the
74 * real_num_tx_queues indicates the last queue is in use.
75 */
76 if (qopt->offset[i] >= dev->real_num_tx_queues ||
77 !qopt->count[i] ||
78 last > dev->real_num_tx_queues)
79 return -EINVAL;
80
81 /* Verify that the offset and counts do not overlap */
82 for (j = i + 1; j < qopt->num_tc; j++) {
83 if (last > qopt->offset[j])
84 return -EINVAL;
85 }
86 }
87
88 return 0;
89}
90
91static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
92{
93 struct net_device *dev = qdisc_dev(sch);
94 struct mqprio_sched *priv = qdisc_priv(sch);
95 struct netdev_queue *dev_queue;
96 struct Qdisc *qdisc;
97 int i, err = -EOPNOTSUPP;
98 struct tc_mqprio_qopt *qopt = NULL;
99
100 BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
101 BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
102
103 if (sch->parent != TC_H_ROOT)
104 return -EOPNOTSUPP;
105
106 if (!netif_is_multiqueue(dev))
107 return -EOPNOTSUPP;
108
109 if (nla_len(opt) < sizeof(*qopt))
110 return -EINVAL;
111
112 qopt = nla_data(opt);
113 if (mqprio_parse_opt(dev, qopt))
114 return -EINVAL;
115
116 /* pre-allocate qdisc, attachment can't fail */
117 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
118 GFP_KERNEL);
119 if (priv->qdiscs == NULL) {
120 err = -ENOMEM;
121 goto err;
122 }
123
124 for (i = 0; i < dev->num_tx_queues; i++) {
125 dev_queue = netdev_get_tx_queue(dev, i);
126 qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
127 TC_H_MAKE(TC_H_MAJ(sch->handle),
128 TC_H_MIN(i + 1)));
129 if (qdisc == NULL) {
130 err = -ENOMEM;
131 goto err;
132 }
133 qdisc->flags |= TCQ_F_CAN_BYPASS;
134 priv->qdiscs[i] = qdisc;
135 }
136
137 /* If the mqprio options indicate that hardware should own
138 * the queue mapping then run ndo_setup_tc otherwise use the
139 * supplied and verified mapping
140 */
141 if (qopt->hw) {
142 priv->hw_owned = 1;
143 err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
144 if (err)
145 goto err;
146 } else {
147 netdev_set_num_tc(dev, qopt->num_tc);
148 for (i = 0; i < qopt->num_tc; i++)
149 netdev_set_tc_queue(dev, i,
150 qopt->count[i], qopt->offset[i]);
151 }
152
153 /* Always use supplied priority mappings */
154 for (i = 0; i < TC_BITMASK + 1; i++)
155 netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
156
157 sch->flags |= TCQ_F_MQROOT;
158 return 0;
159
160err:
161 mqprio_destroy(sch);
162 return err;
163}
164
165static void mqprio_attach(struct Qdisc *sch)
166{
167 struct net_device *dev = qdisc_dev(sch);
168 struct mqprio_sched *priv = qdisc_priv(sch);
169 struct Qdisc *qdisc;
170 unsigned int ntx;
171
172 /* Attach underlying qdisc */
173 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
174 qdisc = priv->qdiscs[ntx];
175 qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
176 if (qdisc)
177 qdisc_destroy(qdisc);
178 }
179 kfree(priv->qdiscs);
180 priv->qdiscs = NULL;
181}
182
183static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
184 unsigned long cl)
185{
186 struct net_device *dev = qdisc_dev(sch);
187 unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
188
189 if (ntx >= dev->num_tx_queues)
190 return NULL;
191 return netdev_get_tx_queue(dev, ntx);
192}
193
194static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
195 struct Qdisc **old)
196{
197 struct net_device *dev = qdisc_dev(sch);
198 struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
199
200 if (!dev_queue)
201 return -EINVAL;
202
203 if (dev->flags & IFF_UP)
204 dev_deactivate(dev);
205
206 *old = dev_graft_qdisc(dev_queue, new);
207
208 if (dev->flags & IFF_UP)
209 dev_activate(dev);
210
211 return 0;
212}
213
214static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
215{
216 struct net_device *dev = qdisc_dev(sch);
217 struct mqprio_sched *priv = qdisc_priv(sch);
218 unsigned char *b = skb_tail_pointer(skb);
219 struct tc_mqprio_qopt opt;
220 struct Qdisc *qdisc;
221 unsigned int i;
222
223 sch->q.qlen = 0;
224 memset(&sch->bstats, 0, sizeof(sch->bstats));
225 memset(&sch->qstats, 0, sizeof(sch->qstats));
226
227 for (i = 0; i < dev->num_tx_queues; i++) {
228 qdisc = netdev_get_tx_queue(dev, i)->qdisc;
229 spin_lock_bh(qdisc_lock(qdisc));
230 sch->q.qlen += qdisc->q.qlen;
231 sch->bstats.bytes += qdisc->bstats.bytes;
232 sch->bstats.packets += qdisc->bstats.packets;
233 sch->qstats.qlen += qdisc->qstats.qlen;
234 sch->qstats.backlog += qdisc->qstats.backlog;
235 sch->qstats.drops += qdisc->qstats.drops;
236 sch->qstats.requeues += qdisc->qstats.requeues;
237 sch->qstats.overlimits += qdisc->qstats.overlimits;
238 spin_unlock_bh(qdisc_lock(qdisc));
239 }
240
241 opt.num_tc = netdev_get_num_tc(dev);
242 memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
243 opt.hw = priv->hw_owned;
244
245 for (i = 0; i < netdev_get_num_tc(dev); i++) {
246 opt.count[i] = dev->tc_to_txq[i].count;
247 opt.offset[i] = dev->tc_to_txq[i].offset;
248 }
249
250 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
251
252 return skb->len;
253nla_put_failure:
254 nlmsg_trim(skb, b);
255 return -1;
256}
257
258static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
259{
260 struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
261
262 if (!dev_queue)
263 return NULL;
264
265 return dev_queue->qdisc_sleeping;
266}
267
268static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
269{
270 struct net_device *dev = qdisc_dev(sch);
271 unsigned int ntx = TC_H_MIN(classid);
272
273 if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
274 return 0;
275 return ntx;
276}
277
/* Classes are stateless virtual handles, so there is nothing to release */
static void mqprio_put(struct Qdisc *sch, unsigned long cl)
{
}
281
/* Fill in the tcmsg for one virtual class.  Classes 1..num_tc are the
 * traffic classes and hang directly off the root; classes above num_tc
 * are the individual tx queues and are reported as children of the
 * traffic class whose queue range contains them.
 */
static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	struct net_device *dev = qdisc_dev(sch);

	if (cl <= netdev_get_num_tc(dev)) {
		/* Traffic-class node */
		tcm->tcm_parent = TC_H_ROOT;
		tcm->tcm_info = 0;
	} else {
		int i;
		struct netdev_queue *dev_queue;

		dev_queue = mqprio_queue_get(sch, cl);
		tcm->tcm_parent = 0;
		/* q_idx is 1-based among the queue classes; queue class
		 * q_idx corresponds to tx queue (q_idx - 1), so it falls
		 * in tc i when offset < q_idx <= offset + count.
		 */
		for (i = 0; i < netdev_get_num_tc(dev); i++) {
			struct netdev_tc_txq tc = dev->tc_to_txq[i];
			int q_idx = cl - netdev_get_num_tc(dev);

			if (q_idx > tc.offset &&
			    q_idx <= tc.offset + tc.count) {
				tcm->tcm_parent =
					TC_H_MAKE(TC_H_MAJ(sch->handle),
						  TC_H_MIN(i + 1));
				break;
			}
		}
		tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
	}
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}
313
/* Dump statistics for one virtual class.  For a traffic-class node the
 * stats are summed over every tx queue in its range; for a queue node
 * they come straight from that queue's sleeping qdisc.
 */
static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				   struct gnet_dump *d)
{
	struct net_device *dev = qdisc_dev(sch);

	if (cl <= netdev_get_num_tc(dev)) {
		int i;
		struct Qdisc *qdisc;
		struct gnet_stats_queue qstats = {0};
		struct gnet_stats_basic_packed bstats = {0};
		struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];

		/* Drop d->lock before touching the per-queue statistics:
		 * the lock held here is the lock on
		 * dev_queue->qdisc_sleeping, which is also acquired in
		 * the loop below.  It is re-taken before the totals are
		 * copied out.
		 */
		spin_unlock_bh(d->lock);

		for (i = tc.offset; i < tc.offset + tc.count; i++) {
			qdisc = netdev_get_tx_queue(dev, i)->qdisc;
			spin_lock_bh(qdisc_lock(qdisc));
			bstats.bytes += qdisc->bstats.bytes;
			bstats.packets += qdisc->bstats.packets;
			qstats.qlen += qdisc->qstats.qlen;
			qstats.backlog += qdisc->qstats.backlog;
			qstats.drops += qdisc->qstats.drops;
			qstats.requeues += qdisc->qstats.requeues;
			qstats.overlimits += qdisc->qstats.overlimits;
			spin_unlock_bh(qdisc_lock(qdisc));
		}
		/* Reclaim root sleeping lock before completing stats */
		spin_lock_bh(d->lock);
		if (gnet_stats_copy_basic(d, &bstats) < 0 ||
		    gnet_stats_copy_queue(d, &qstats) < 0)
			return -1;
	} else {
		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

		sch = dev_queue->qdisc_sleeping;
		/* Mirror the live queue length into the dumped qstats */
		sch->qstats.qlen = sch->q.qlen;
		if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
		    gnet_stats_copy_queue(d, &sch->qstats) < 0)
			return -1;
	}
	return 0;
}
361
362static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
363{
364 struct net_device *dev = qdisc_dev(sch);
365 unsigned long ntx;
366
367 if (arg->stop)
368 return;
369
370 /* Walk hierarchy with a virtual class per tc */
371 arg->count = arg->skip;
372 for (ntx = arg->skip;
373 ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
374 ntx++) {
375 if (arg->fn(sch, ntx + 1, arg) < 0) {
376 arg->stop = 1;
377 break;
378 }
379 arg->count++;
380 }
381}
382
/* Class operations: mqprio exposes a two-level virtual hierarchy
 * (traffic classes, then per-queue leaves) but keeps no class state of
 * its own, so get/put are trivial handle checks.
 */
static const struct Qdisc_class_ops mqprio_class_ops = {
	.graft = mqprio_graft,
	.leaf = mqprio_leaf,
	.get = mqprio_get,
	.put = mqprio_put,
	.walk = mqprio_walk,
	.dump = mqprio_dump_class,
	.dump_stats = mqprio_dump_class_stats,
};
392
/* Qdisc operations.  No enqueue/dequeue: mqprio is a root container
 * only; packets are handled by the per-queue child qdiscs it attaches.
 */
struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
	.cl_ops = &mqprio_class_ops,
	.id = "mqprio",
	.priv_size = sizeof(struct mqprio_sched),
	.init = mqprio_init,
	.destroy = mqprio_destroy,
	.attach = mqprio_attach,
	.dump = mqprio_dump,
	.owner = THIS_MODULE,
};
403
/* Register the mqprio qdisc with the packet scheduler core */
static int __init mqprio_module_init(void)
{
	return register_qdisc(&mqprio_qdisc_ops);
}

/* Unregister on module unload */
static void __exit mqprio_module_exit(void)
{
	unregister_qdisc(&mqprio_qdisc_ops);
}

module_init(mqprio_module_init);
module_exit(mqprio_module_exit);

MODULE_LICENSE("GPL");