Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig         508
-rw-r--r--  net/sched/Makefile         41
-rw-r--r--  net/sched/act_api.c       894
-rw-r--r--  net/sched/cls_api.c       642
-rw-r--r--  net/sched/cls_basic.c     303
-rw-r--r--  net/sched/cls_fw.c        378
-rw-r--r--  net/sched/cls_route.c     639
-rw-r--r--  net/sched/cls_rsvp.c       43
-rw-r--r--  net/sched/cls_rsvp.h      667
-rw-r--r--  net/sched/cls_rsvp6.c      44
-rw-r--r--  net/sched/cls_tcindex.c   537
-rw-r--r--  net/sched/cls_u32.c       828
-rw-r--r--  net/sched/em_cmp.c        101
-rw-r--r--  net/sched/em_meta.c       661
-rw-r--r--  net/sched/em_nbyte.c       82
-rw-r--r--  net/sched/em_u32.c         63
-rw-r--r--  net/sched/ematch.c        524
-rw-r--r--  net/sched/estimator.c     197
-rw-r--r--  net/sched/gact.c          231
-rw-r--r--  net/sched/ipt.c           326
-rw-r--r--  net/sched/mirred.c        276
-rw-r--r--  net/sched/pedit.c         288
-rw-r--r--  net/sched/police.c        612
-rw-r--r--  net/sched/sch_api.c      1296
-rw-r--r--  net/sched/sch_atm.c       735
-rw-r--r--  net/sched/sch_cbq.c      2124
-rw-r--r--  net/sched/sch_dsmark.c    479
-rw-r--r--  net/sched/sch_fifo.c      212
-rw-r--r--  net/sched/sch_generic.c   609
-rw-r--r--  net/sched/sch_gred.c      630
-rw-r--r--  net/sched/sch_hfsc.c     1822
-rw-r--r--  net/sched/sch_htb.c      1759
-rw-r--r--  net/sched/sch_ingress.c   436
-rw-r--r--  net/sched/sch_netem.c     598
-rw-r--r--  net/sched/sch_prio.c      444
-rw-r--r--  net/sched/sch_red.c       459
-rw-r--r--  net/sched/sch_sfq.c       497
-rw-r--r--  net/sched/sch_tbf.c       543
-rw-r--r--  net/sched/sch_teql.c      511
39 files changed, 22039 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
new file mode 100644
index 000000000000..3d1d902dd1a1
--- /dev/null
+++ b/net/sched/Kconfig
@@ -0,0 +1,508 @@
1#
2# Traffic control configuration.
3#
4choice
5 prompt "Packet scheduler clock source"
6 depends on NET_SCHED
7 default NET_SCH_CLK_JIFFIES
8 help
9 Packet schedulers need a monotonic clock that increments at a static
10 rate. The kernel provides several suitable interfaces, each with
11 different properties:
12
13 - high resolution (us or better)
14 - fast to read (minimal locking, no i/o access)
15 - synchronized on all processors
16 - handles cpu clock frequency changes
17
18 but nothing provides all of the above.
19
20config NET_SCH_CLK_JIFFIES
21 bool "Timer interrupt"
22 help
23 Say Y here if you want to use the timer interrupt (jiffies) as clock
24 source. This clock source is fast, synchronized on all processors and
25 handles cpu clock frequency changes, but its resolution is too low
26 for accurate shaping except at very low speed.
27
28config NET_SCH_CLK_GETTIMEOFDAY
29 bool "gettimeofday"
30 help
31 Say Y here if you want to use gettimeofday as clock source. This clock
32 source has high resolution, is synchronized on all processors and
33 handles cpu clock frequency changes, but it is slow.
34
35 Choose this if you need a high resolution clock source but can't use
36 the CPU's cycle counter.
37
38config NET_SCH_CLK_CPU
39 bool "CPU cycle counter"
40 depends on X86_TSC || X86_64 || ALPHA || SPARC64 || PPC64 || IA64
41 help
42 Say Y here if you want to use the CPU's cycle counter as clock source.
43 This is a cheap and high resolution clock source, but on some
44 architectures it is not synchronized on all processors and doesn't
45 handle cpu clock frequency changes.
46
47	  The usable cycle counters are:
48
49 x86/x86_64 - Timestamp Counter
50 alpha - Cycle Counter
51 sparc64 - %ticks register
52 ppc64 - Time base
53 ia64 - Interval Time Counter
54
55 Choose this if your CPU's cycle counter is working properly.
56
57endchoice
58
59config NET_SCH_CBQ
60 tristate "CBQ packet scheduler"
61 depends on NET_SCHED
62 ---help---
63 Say Y here if you want to use the Class-Based Queueing (CBQ) packet
64 scheduling algorithm for some of your network devices. This
65 algorithm classifies the waiting packets into a tree-like hierarchy
66 of classes; the leaves of this tree are in turn scheduled by
67 separate algorithms (called "disciplines" in this context).
68
69 See the top of <file:net/sched/sch_cbq.c> for references about the
70 CBQ algorithm.
71
72 CBQ is a commonly used scheduler, so if you're unsure, you should
73 say Y here. Then say Y to all the queueing algorithms below that you
74 want to use as CBQ disciplines. Then say Y to "Packet classifier
75 API" and say Y to all the classifiers you want to use; a classifier
76 is a routine that allows you to sort your outgoing traffic into
77 classes based on a certain criterion.
78
79 To compile this code as a module, choose M here: the
80 module will be called sch_cbq.
81
82config NET_SCH_HTB
83 tristate "HTB packet scheduler"
84 depends on NET_SCHED
85 ---help---
86 Say Y here if you want to use the Hierarchical Token Buckets (HTB)
87 packet scheduling algorithm for some of your network devices. See
88 <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
89 in-depth articles.
90
91	  HTB is very similar to CBQ regarding its goals, however it has
92	  different properties and a different algorithm.
93
94 To compile this code as a module, choose M here: the
95 module will be called sch_htb.
96
97config NET_SCH_HFSC
98 tristate "HFSC packet scheduler"
99 depends on NET_SCHED
100 ---help---
101 Say Y here if you want to use the Hierarchical Fair Service Curve
102 (HFSC) packet scheduling algorithm for some of your network devices.
103
104 To compile this code as a module, choose M here: the
105 module will be called sch_hfsc.
106
107#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ
108config NET_SCH_ATM
109 tristate "ATM pseudo-scheduler"
110 depends on NET_SCHED && ATM
111 ---help---
112 Say Y here if you want to use the ATM pseudo-scheduler. This
113 provides a framework for invoking classifiers (aka "filters"), which
114 in turn select classes of this queuing discipline. Each class maps
115 the flow(s) it is handling to a given virtual circuit (see the top of
116 <file:net/sched/sch_atm.c>).
117
118 To compile this code as a module, choose M here: the
119 module will be called sch_atm.
120
121config NET_SCH_PRIO
122 tristate "The simplest PRIO pseudoscheduler"
123 depends on NET_SCHED
124 help
125 Say Y here if you want to use an n-band priority queue packet
126 "scheduler" for some of your network devices or as a leaf discipline
127 for the CBQ scheduling algorithm. If unsure, say Y.
128
129 To compile this code as a module, choose M here: the
130 module will be called sch_prio.
131
132config NET_SCH_RED
133 tristate "RED queue"
134 depends on NET_SCHED
135 help
136 Say Y here if you want to use the Random Early Detection (RED)
137 packet scheduling algorithm for some of your network devices (see
138 the top of <file:net/sched/sch_red.c> for details and references
139 about the algorithm).
140
141 To compile this code as a module, choose M here: the
142 module will be called sch_red.
143
144config NET_SCH_SFQ
145 tristate "SFQ queue"
146 depends on NET_SCHED
147 ---help---
148 Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
149 packet scheduling algorithm for some of your network devices or as a
150 leaf discipline for the CBQ scheduling algorithm (see the top of
151 <file:net/sched/sch_sfq.c> for details and references about the SFQ
152 algorithm).
153
154 To compile this code as a module, choose M here: the
155 module will be called sch_sfq.
156
157config NET_SCH_TEQL
158 tristate "TEQL queue"
159 depends on NET_SCHED
160 ---help---
161 Say Y here if you want to use the True Link Equalizer (TLE) packet
162 scheduling algorithm for some of your network devices or as a leaf
163 discipline for the CBQ scheduling algorithm. This queueing
164 discipline allows the combination of several physical devices into
165 one virtual device. (see the top of <file:net/sched/sch_teql.c> for
166 details).
167
168 To compile this code as a module, choose M here: the
169 module will be called sch_teql.
170
171config NET_SCH_TBF
172 tristate "TBF queue"
173 depends on NET_SCHED
174 help
175 Say Y here if you want to use the Simple Token Bucket Filter (TBF)
176 packet scheduling algorithm for some of your network devices or as a
177 leaf discipline for the CBQ scheduling algorithm (see the top of
178 <file:net/sched/sch_tbf.c> for a description of the TBF algorithm).
179
180 To compile this code as a module, choose M here: the
181 module will be called sch_tbf.
182
183config NET_SCH_GRED
184 tristate "GRED queue"
185 depends on NET_SCHED
186 help
187 Say Y here if you want to use the Generic Random Early Detection
188 (RED) packet scheduling algorithm for some of your network devices
189 (see the top of <file:net/sched/sch_red.c> for details and
190 references about the algorithm).
191
192 To compile this code as a module, choose M here: the
193 module will be called sch_gred.
194
195config NET_SCH_DSMARK
196 tristate "Diffserv field marker"
197 depends on NET_SCHED
198 help
199 Say Y if you want to schedule packets according to the
200 Differentiated Services architecture proposed in RFC 2475.
201 Technical information on this method, with pointers to associated
202 RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
203
204 To compile this code as a module, choose M here: the
205 module will be called sch_dsmark.
206
207config NET_SCH_NETEM
208 tristate "Network emulator"
209 depends on NET_SCHED
210 help
211 Say Y if you want to emulate network delay, loss, and packet
212 re-ordering. This is often useful to simulate networks when
213 testing applications or protocols.
214
215 To compile this driver as a module, choose M here: the module
216 will be called sch_netem.
217
218 If unsure, say N.
219
220config NET_SCH_INGRESS
221 tristate "Ingress Qdisc"
222 depends on NET_SCHED
223 help
224 If you say Y here, you will be able to police incoming bandwidth
225 and drop packets when this bandwidth exceeds your desired rate.
226 If unsure, say Y.
227
228 To compile this code as a module, choose M here: the
229 module will be called sch_ingress.
230
231config NET_QOS
232 bool "QoS support"
233 depends on NET_SCHED
234 ---help---
235 Say Y here if you want to include Quality Of Service scheduling
236 features, which means that you will be able to request certain
237 rate-of-flow limits for your network devices.
238
239 This Quality of Service (QoS) support will enable you to use
240 Differentiated Services (diffserv) and Resource Reservation Protocol
241 (RSVP) on your Linux router if you also say Y to "Packet classifier
242 API" and to some classifiers below. Documentation and software is at
243 <http://diffserv.sourceforge.net/>.
244
245 Note that the answer to this question won't directly affect the
246 kernel: saying N will just cause the configurator to skip all
247 the questions about QoS support.
248
249config NET_ESTIMATOR
250 bool "Rate estimator"
251 depends on NET_QOS
252 help
253 In order for Quality of Service scheduling to work, the current
254 rate-of-flow for a network device has to be estimated; if you say Y
255 here, the kernel will do just that.
256
257config NET_CLS
258 bool "Packet classifier API"
259 depends on NET_SCHED
260 ---help---
261 The CBQ scheduling algorithm requires that network packets which are
262 scheduled to be sent out over a network device be classified
263 according to some criterion. If you say Y here, you will get a
264 choice of several different packet classifiers with the following
265 questions.
266
267 This will enable you to use Differentiated Services (diffserv) and
268 Resource Reservation Protocol (RSVP) on your Linux router.
269 Documentation and software is at
270 <http://diffserv.sourceforge.net/>.
271
272config NET_CLS_BASIC
273 tristate "Basic classifier"
274 depends on NET_CLS
275 ---help---
276 Say Y here if you want to be able to classify packets using
277 only extended matches and actions.
278
279 To compile this code as a module, choose M here: the
280 module will be called cls_basic.
281
282config NET_CLS_TCINDEX
283 tristate "TC index classifier"
284 depends on NET_CLS
285 help
286 If you say Y here, you will be able to classify outgoing packets
287 according to the tc_index field of the skb. You will want this
288 feature if you want to implement Differentiated Services using
289 sch_dsmark. If unsure, say Y.
290
291 To compile this code as a module, choose M here: the
292 module will be called cls_tcindex.
293
294config NET_CLS_ROUTE4
295 tristate "Routing table based classifier"
296 depends on NET_CLS
297 select NET_CLS_ROUTE
298 help
299 If you say Y here, you will be able to classify outgoing packets
300 according to the route table entry they matched. If unsure, say Y.
301
302 To compile this code as a module, choose M here: the
303 module will be called cls_route.
304
305config NET_CLS_ROUTE
306 bool
307 default n
308
309config NET_CLS_FW
310 tristate "Firewall based classifier"
311 depends on NET_CLS
312 help
313 If you say Y here, you will be able to classify outgoing packets
314 according to firewall criteria you specified.
315
316 To compile this code as a module, choose M here: the
317 module will be called cls_fw.
318
319config NET_CLS_U32
320 tristate "U32 classifier"
321 depends on NET_CLS
322 help
323 If you say Y here, you will be able to classify outgoing packets
324 according to their destination address. If unsure, say Y.
325
326 To compile this code as a module, choose M here: the
327 module will be called cls_u32.
328
329config CLS_U32_PERF
330 bool "U32 classifier performance counters"
331 depends on NET_CLS_U32
332 help
333	  Gathers statistics that can be used to tune u32 classifier performance.
334	  Requires an updated iproute2.
335	  You MUST NOT turn this on if you don't have an updated iproute2.
336
337config NET_CLS_IND
338	bool "Classify by input device (slows down u32/fw)"
339 depends on NET_CLS_U32 || NET_CLS_FW
340 help
341	  This option will eventually be removed once an equivalent
342	  metadata action is available, because it slows things down a little.
343	  Available only for the u32 and fw classifiers.
344	  Requires an updated iproute2.
345	  You MUST NOT turn this on if you don't have an updated iproute2.
346
347config CLS_U32_MARK
348 bool "Use nfmark as a key in U32 classifier"
349 depends on NET_CLS_U32 && NETFILTER
350 help
351	  This allows you to match the netfilter mark (nfmark) in a u32 filter.
352 Example:
353 tc filter add dev eth0 protocol ip parent 1:0 prio 5 u32 \
354 match mark 0x0090 0xffff \
355 match ip dst 4.4.4.4 \
356 flowid 1:90
357 You must use a new iproute2 to use this feature.
358
359config NET_CLS_RSVP
360 tristate "Special RSVP classifier"
361 depends on NET_CLS && NET_QOS
362 ---help---
363 The Resource Reservation Protocol (RSVP) permits end systems to
364 request a minimum and maximum data flow rate for a connection; this
365 is important for real time data such as streaming sound or video.
366
367 Say Y here if you want to be able to classify outgoing packets based
368 on their RSVP requests.
369
370 To compile this code as a module, choose M here: the
371 module will be called cls_rsvp.
372
373config NET_CLS_RSVP6
374 tristate "Special RSVP classifier for IPv6"
375 depends on NET_CLS && NET_QOS
376 ---help---
377 The Resource Reservation Protocol (RSVP) permits end systems to
378 request a minimum and maximum data flow rate for a connection; this
379 is important for real time data such as streaming sound or video.
380
381 Say Y here if you want to be able to classify outgoing packets based
382 on their RSVP requests and you are using the new Internet Protocol
383 IPv6 as opposed to the older and more common IPv4.
384
385 To compile this code as a module, choose M here: the
386 module will be called cls_rsvp6.
387
388config NET_EMATCH
389 bool "Extended Matches"
390 depends on NET_CLS
391 ---help---
392 Say Y here if you want to use extended matches on top of classifiers
393 and select the extended matches below.
394
395 Extended matches are small classification helpers not worth writing
396 a separate classifier.
397
398 You must have a recent version of the iproute2 tools in order to use
399 extended matches.
400
401config NET_EMATCH_STACK
402 int "Stack size"
403 depends on NET_EMATCH
404 default "32"
405 ---help---
406 Size of the local stack variable used while evaluating the tree of
407 ematches. Limits the depth of the tree, i.e. the number of
409	  encapsulated precedences. Every level requires 4 bytes of additional
409 stack space.
410
411config NET_EMATCH_CMP
412 tristate "Simple packet data comparison"
413 depends on NET_EMATCH
414 ---help---
415 Say Y here if you want to be able to classify packets based on
416 simple packet data comparisons for 8, 16, and 32bit values.
417
418 To compile this code as a module, choose M here: the
419 module will be called em_cmp.
420
421config NET_EMATCH_NBYTE
422 tristate "Multi byte comparison"
423 depends on NET_EMATCH
424 ---help---
425 Say Y here if you want to be able to classify packets based on
426 multiple byte comparisons mainly useful for IPv6 address comparisons.
427
428 To compile this code as a module, choose M here: the
429 module will be called em_nbyte.
430
431config NET_EMATCH_U32
432 tristate "U32 hashing key"
433 depends on NET_EMATCH
434 ---help---
435 Say Y here if you want to be able to classify packets using
436 the famous u32 key in combination with logic relations.
437
438 To compile this code as a module, choose M here: the
439 module will be called em_u32.
440
441config NET_EMATCH_META
442 tristate "Metadata"
443 depends on NET_EMATCH
444 ---help---
445	  Say Y here if you want to be able to classify packets based on
446 metadata such as load average, netfilter attributes, socket
447 attributes and routing decisions.
448
449 To compile this code as a module, choose M here: the
450 module will be called em_meta.
451
452config NET_CLS_ACT
453 bool "Packet ACTION"
454 depends on EXPERIMENTAL && NET_CLS && NET_QOS
455 ---help---
456	  This option requires an updated iproute2. It enables
457	  tc extensions which can be used with tc classifiers.
458	  You MUST NOT turn this on if you don't have an updated iproute2.
459
460config NET_ACT_POLICE
461 tristate "Policing Actions"
462 depends on NET_CLS_ACT
463 ---help---
464	  If you are using an updated iproute2, select this one; otherwise use
465	  the "Traffic policing" option below to select a policer.
466	  You MUST NOT turn this on if you don't have an updated iproute2.
467
468config NET_ACT_GACT
469 tristate "generic Actions"
470 depends on NET_CLS_ACT
471 ---help---
472	  You must have an updated iproute2 to use this feature.
473	  This adds simple filtering actions such as drop, accept, etc.
474
475config GACT_PROB
476 bool "generic Actions probability"
477 depends on NET_ACT_GACT
478 ---help---
479	  Allows generic actions to be applied randomly or deterministically.
480
481config NET_ACT_MIRRED
482	tristate "Packet Ingress/Egress redirection/mirror Actions"
483 depends on NET_CLS_ACT
484 ---help---
485	  Requires an updated iproute2.
486	  This allows packets to be mirrored or redirected to other netdevices.
487
488config NET_ACT_IPT
489 tristate "iptables Actions"
490 depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
491 ---help---
492	  Requires an updated iproute2.
493	  This allows iptables targets to be used by tc filters.
494
495config NET_ACT_PEDIT
496 tristate "Generic Packet Editor Actions"
497 depends on NET_CLS_ACT
498 ---help---
499	  Requires an updated iproute2.
500	  This allows packets to be generically edited.
501
502config NET_CLS_POLICE
503 bool "Traffic policing (needed for in/egress)"
504 depends on NET_CLS && NET_QOS && NET_CLS_ACT!=y
505 help
506 Say Y to support traffic policing (bandwidth limits). Needed for
507 ingress and egress rate limiting.
508
diff --git a/net/sched/Makefile b/net/sched/Makefile
new file mode 100644
index 000000000000..431e55786efd
--- /dev/null
+++ b/net/sched/Makefile
@@ -0,0 +1,41 @@
1#
2# Makefile for the Linux Traffic Control Unit.
3#
4
5obj-y := sch_generic.o
6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o
8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o
11obj-$(CONFIG_NET_CLS_POLICE) += police.o
12obj-$(CONFIG_NET_ACT_GACT) += gact.o
13obj-$(CONFIG_NET_ACT_MIRRED) += mirred.o
14obj-$(CONFIG_NET_ACT_IPT) += ipt.o
15obj-$(CONFIG_NET_ACT_PEDIT) += pedit.o
16obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
17obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
18obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o
19obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
20obj-$(CONFIG_NET_SCH_RED) += sch_red.o
21obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
22obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
23obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
24obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
25obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
26obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
27obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
28obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
29obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
30obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
31obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
32obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
33obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
34obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
35obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
36obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
37obj-$(CONFIG_NET_EMATCH) += ematch.o
38obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
39obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
40obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
41obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
new file mode 100644
index 000000000000..5e6cc371b39e
--- /dev/null
+++ b/net/sched/act_api.c
@@ -0,0 +1,894 @@
1/*
2 * net/sched/act_api.c Packet action API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Author: Jamal Hadi Salim
10 *
11 *
12 */
13
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/config.h>
18#include <linux/types.h>
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
30#include <linux/rtnetlink.h>
31#include <linux/init.h>
32#include <linux/kmod.h>
33#include <net/sock.h>
34#include <net/sch_generic.h>
35#include <net/act_api.h>
36
37#if 1 /* control */
38#define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args)
39#else
40#define DPRINTK(format, args...)
41#endif
42#if 0 /* data */
43#define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args)
44#else
45#define D2PRINTK(format, args...)
46#endif
47
48static struct tc_action_ops *act_base = NULL;
49static DEFINE_RWLOCK(act_mod_lock);
50
51int tcf_register_action(struct tc_action_ops *act)
52{
53 struct tc_action_ops *a, **ap;
54
55 write_lock(&act_mod_lock);
56 for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) {
57 if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
58 write_unlock(&act_mod_lock);
59 return -EEXIST;
60 }
61 }
62 act->next = NULL;
63 *ap = act;
64 write_unlock(&act_mod_lock);
65 return 0;
66}
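/*
 * Illustrative sketch, not part of this patch: how an action module is
 * expected to hook into the registration API above.  The callback
 * prototypes mirror the ops->act/init/cleanup/dump calls made later in
 * this file; "example" and all example_* names are placeholders, and the
 * module source is assumed to include <linux/module.h> for THIS_MODULE.
 */
static int example_act(struct sk_buff **pskb, struct tc_action *a);
static int example_init(struct rtattr *rta, struct rtattr *est,
			struct tc_action *a, int ovr, int bind);
static int example_cleanup(struct tc_action *a, int bind);
static int example_dump(struct sk_buff *skb, struct tc_action *a,
			int bind, int ref);

static struct tc_action_ops act_example_ops = {
	.kind		= "example",	/* matched by tc_lookup_action_n() */
	.type		= 0,		/* assumed unique numeric action type */
	.owner		= THIS_MODULE,
	.act		= example_act,
	.init		= example_init,
	.cleanup	= example_cleanup,
	.dump		= example_dump,
};

static int __init example_action_module_init(void)
{
	/* -EEXIST is returned if the kind or type is already registered */
	return tcf_register_action(&act_example_ops);
}

static void __exit example_action_module_exit(void)
{
	tcf_unregister_action(&act_example_ops);
}

module_init(example_action_module_init);
module_exit(example_action_module_exit);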
67
68int tcf_unregister_action(struct tc_action_ops *act)
69{
70 struct tc_action_ops *a, **ap;
71 int err = -ENOENT;
72
73 write_lock(&act_mod_lock);
74 for (ap = &act_base; (a = *ap) != NULL; ap = &a->next)
75 if (a == act)
76 break;
77 if (a) {
78 *ap = a->next;
79 a->next = NULL;
80 err = 0;
81 }
82 write_unlock(&act_mod_lock);
83 return err;
84}
85
86/* lookup by name */
87static struct tc_action_ops *tc_lookup_action_n(char *kind)
88{
89 struct tc_action_ops *a = NULL;
90
91 if (kind) {
92 read_lock(&act_mod_lock);
93 for (a = act_base; a; a = a->next) {
94 if (strcmp(kind, a->kind) == 0) {
95 if (!try_module_get(a->owner)) {
96 read_unlock(&act_mod_lock);
97 return NULL;
98 }
99 break;
100 }
101 }
102 read_unlock(&act_mod_lock);
103 }
104 return a;
105}
106
107/* lookup by rtattr */
108static struct tc_action_ops *tc_lookup_action(struct rtattr *kind)
109{
110 struct tc_action_ops *a = NULL;
111
112 if (kind) {
113 read_lock(&act_mod_lock);
114 for (a = act_base; a; a = a->next) {
115 if (rtattr_strcmp(kind, a->kind) == 0) {
116 if (!try_module_get(a->owner)) {
117 read_unlock(&act_mod_lock);
118 return NULL;
119 }
120 break;
121 }
122 }
123 read_unlock(&act_mod_lock);
124 }
125 return a;
126}
127
128#if 0
129/* lookup by id */
130static struct tc_action_ops *tc_lookup_action_id(u32 type)
131{
132 struct tc_action_ops *a = NULL;
133
134 if (type) {
135 read_lock(&act_mod_lock);
136 for (a = act_base; a; a = a->next) {
137 if (a->type == type) {
138 if (!try_module_get(a->owner)) {
139 read_unlock(&act_mod_lock);
140 return NULL;
141 }
142 break;
143 }
144 }
145 read_unlock(&act_mod_lock);
146 }
147 return a;
148}
149#endif
150
151int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
152 struct tcf_result *res)
153{
154 struct tc_action *a;
155 int ret = -1;
156
157 if (skb->tc_verd & TC_NCLS) {
158 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
159 D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n",
160 skb, skb->input_dev ? skb->input_dev->name : "xxx",
161 skb->dev->name);
162 ret = TC_ACT_OK;
163 goto exec_done;
164 }
165 while ((a = act) != NULL) {
166repeat:
167 if (a->ops && a->ops->act) {
168 ret = a->ops->act(&skb, a);
169 if (TC_MUNGED & skb->tc_verd) {
170 /* copied already, allow trampling */
171 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
172 skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
173 }
174 if (ret != TC_ACT_PIPE)
175 goto exec_done;
176 if (ret == TC_ACT_REPEAT)
177 goto repeat; /* we need a ttl - JHS */
178 }
179 act = a->next;
180 }
181exec_done:
182 if (skb->tc_classid > 0) {
183 res->classid = skb->tc_classid;
184 res->class = 0;
185 skb->tc_classid = 0;
186 }
187 return ret;
188}
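/*
 * Illustrative sketch, not part of this patch: how a caller that holds a
 * chain of actions (for instance a classifier built with
 * CONFIG_NET_CLS_ACT) consumes the verdict computed above.  Inside
 * tcf_action_exec() only TC_ACT_PIPE keeps walking the chain; any other
 * return value stops the walk and is handed back to the caller.  The
 * function and variable names here are placeholders.
 */
static int example_run_actions(struct sk_buff *skb, struct tc_action *chain,
			       struct tcf_result *res)
{
	int verdict = tcf_action_exec(skb, chain, res);

	if (verdict == TC_ACT_SHOT) {
		/* an action asked for the packet to be dropped */
		return TC_ACT_SHOT;
	}
	/*
	 * TC_ACT_OK (or any other terminal verdict) is passed through;
	 * res->classid may have been filled in from skb->tc_classid.
	 */
	return verdict;
}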
189
190void tcf_action_destroy(struct tc_action *act, int bind)
191{
192 struct tc_action *a;
193
194 for (a = act; a; a = act) {
195 if (a->ops && a->ops->cleanup) {
196 DPRINTK("tcf_action_destroy destroying %p next %p\n",
197 a, a->next);
198 if (a->ops->cleanup(a, bind) == ACT_P_DELETED)
199 module_put(a->ops->owner);
200 act = act->next;
201 kfree(a);
202 } else { /*FIXME: Remove later - catch insertion bugs*/
203 printk("tcf_action_destroy: BUG? destroying NULL ops\n");
204 act = act->next;
205 kfree(a);
206 }
207 }
208}
209
210int
211tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
212{
213 int err = -EINVAL;
214
215 if (a->ops == NULL || a->ops->dump == NULL)
216 return err;
217 return a->ops->dump(skb, a, bind, ref);
218}
219
220int
221tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
222{
223 int err = -EINVAL;
224 unsigned char *b = skb->tail;
225 struct rtattr *r;
226
227 if (a->ops == NULL || a->ops->dump == NULL)
228 return err;
229
230 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind);
231 if (tcf_action_copy_stats(skb, a, 0))
232 goto rtattr_failure;
233 r = (struct rtattr*) skb->tail;
234 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
235 if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) {
236 r->rta_len = skb->tail - (u8*)r;
237 return err;
238 }
239
240rtattr_failure:
241 skb_trim(skb, b - skb->data);
242 return -1;
243}
244
245int
246tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref)
247{
248 struct tc_action *a;
249 int err = -EINVAL;
250 unsigned char *b = skb->tail;
251 struct rtattr *r ;
252
253 while ((a = act) != NULL) {
254 r = (struct rtattr*) skb->tail;
255 act = a->next;
256 RTA_PUT(skb, a->order, 0, NULL);
257 err = tcf_action_dump_1(skb, a, bind, ref);
258 if (err < 0)
259 goto rtattr_failure;
260 r->rta_len = skb->tail - (u8*)r;
261 }
262
263 return 0;
264
265rtattr_failure:
266 skb_trim(skb, b - skb->data);
267 return -err;
268}
269
270struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est,
271 char *name, int ovr, int bind, int *err)
272{
273 struct tc_action *a;
274 struct tc_action_ops *a_o;
275 char act_name[IFNAMSIZ];
276 struct rtattr *tb[TCA_ACT_MAX+1];
277 struct rtattr *kind;
278
279 *err = -EINVAL;
280
281 if (name == NULL) {
282 if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0)
283 goto err_out;
284 kind = tb[TCA_ACT_KIND-1];
285 if (kind == NULL)
286 goto err_out;
287 if (rtattr_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
288 goto err_out;
289 } else {
290 if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
291 goto err_out;
292 }
293
294 a_o = tc_lookup_action_n(act_name);
295 if (a_o == NULL) {
296#ifdef CONFIG_KMOD
297 rtnl_unlock();
298 request_module(act_name);
299 rtnl_lock();
300
301 a_o = tc_lookup_action_n(act_name);
302
303 /* We dropped the RTNL semaphore in order to
304 * perform the module load. So, even if we
305 * succeeded in loading the module we have to
306 * tell the caller to replay the request. We
307 * indicate this using -EAGAIN.
308 */
309 if (a_o != NULL) {
310 *err = -EAGAIN;
311 goto err_mod;
312 }
313#endif
314 goto err_out;
315 }
316
317 *err = -ENOMEM;
318 a = kmalloc(sizeof(*a), GFP_KERNEL);
319 if (a == NULL)
320 goto err_mod;
321 memset(a, 0, sizeof(*a));
322
323 /* backward compatibility for policer */
324 if (name == NULL)
325 *err = a_o->init(tb[TCA_ACT_OPTIONS-1], est, a, ovr, bind);
326 else
327 *err = a_o->init(rta, est, a, ovr, bind);
328 if (*err < 0)
329 goto err_free;
330
331 /* module count goes up only when brand new policy is created
332 if it exists and is only bound to in a_o->init() then
333 ACT_P_CREATED is not returned (a zero is).
334 */
335 if (*err != ACT_P_CREATED)
336 module_put(a_o->owner);
337 a->ops = a_o;
338 DPRINTK("tcf_action_init_1: successfull %s\n", act_name);
339
340 *err = 0;
341 return a;
342
343err_free:
344 kfree(a);
345err_mod:
346 module_put(a_o->owner);
347err_out:
348 return NULL;
349}
350
351struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est,
352 char *name, int ovr, int bind, int *err)
353{
354 struct rtattr *tb[TCA_ACT_MAX_PRIO+1];
355 struct tc_action *head = NULL, *act, *act_prev = NULL;
356 int i;
357
358 if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) {
359 *err = -EINVAL;
360 return head;
361 }
362
363 for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) {
364 act = tcf_action_init_1(tb[i], est, name, ovr, bind, err);
365 if (act == NULL)
366 goto err;
367 act->order = i+1;
368
369 if (head == NULL)
370 head = act;
371 else
372 act_prev->next = act;
373 act_prev = act;
374 }
375 return head;
376
377err:
378 if (head != NULL)
379 tcf_action_destroy(head, bind);
380 return NULL;
381}
382
383int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
384 int compat_mode)
385{
386 int err = 0;
387 struct gnet_dump d;
388 struct tcf_act_hdr *h = a->priv;
389
390 if (h == NULL)
391 goto errout;
392
393 /* compat_mode being true specifies a call that is supposed
394 * to add additional backward compatibility statistic TLVs.
395 */
396 if (compat_mode) {
397 if (a->type == TCA_OLD_COMPAT)
398 err = gnet_stats_start_copy_compat(skb, 0,
399 TCA_STATS, TCA_XSTATS, h->stats_lock, &d);
400 else
401 return 0;
402 } else
403 err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
404 h->stats_lock, &d);
405
406 if (err < 0)
407 goto errout;
408
409 if (a->ops != NULL && a->ops->get_stats != NULL)
410 if (a->ops->get_stats(skb, a) < 0)
411 goto errout;
412
413 if (gnet_stats_copy_basic(&d, &h->bstats) < 0 ||
414#ifdef CONFIG_NET_ESTIMATOR
415 gnet_stats_copy_rate_est(&d, &h->rate_est) < 0 ||
416#endif
417 gnet_stats_copy_queue(&d, &h->qstats) < 0)
418 goto errout;
419
420 if (gnet_stats_finish_copy(&d) < 0)
421 goto errout;
422
423 return 0;
424
425errout:
426 return -1;
427}
428
429static int
430tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
431 unsigned flags, int event, int bind, int ref)
432{
433 struct tcamsg *t;
434 struct nlmsghdr *nlh;
435 unsigned char *b = skb->tail;
436 struct rtattr *x;
437
438 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t));
439 nlh->nlmsg_flags = flags;
440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC;
442
443 x = (struct rtattr*) skb->tail;
444 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
445
446 if (tcf_action_dump(skb, a, bind, ref) < 0)
447 goto rtattr_failure;
448
449 x->rta_len = skb->tail - (u8*)x;
450
451 nlh->nlmsg_len = skb->tail - b;
452 return skb->len;
453
454rtattr_failure:
455nlmsg_failure:
456 skb_trim(skb, b - skb->data);
457 return -1;
458}
459
460static int
461act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event)
462{
463 struct sk_buff *skb;
464 int err = 0;
465
466 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
467 if (!skb)
468 return -ENOBUFS;
469 if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
470 kfree_skb(skb);
471 return -EINVAL;
472 }
473 err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
474 if (err > 0)
475 err = 0;
476 return err;
477}
478
479static struct tc_action *
480tcf_action_get_1(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int *err)
481{
482 struct rtattr *tb[TCA_ACT_MAX+1];
483 struct tc_action *a;
484 int index;
485
486 *err = -EINVAL;
487 if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0)
488 return NULL;
489
490 if (tb[TCA_ACT_INDEX - 1] == NULL ||
491 RTA_PAYLOAD(tb[TCA_ACT_INDEX - 1]) < sizeof(index))
492 return NULL;
493 index = *(int *)RTA_DATA(tb[TCA_ACT_INDEX - 1]);
494
495 *err = -ENOMEM;
496 a = kmalloc(sizeof(struct tc_action), GFP_KERNEL);
497 if (a == NULL)
498 return NULL;
499 memset(a, 0, sizeof(struct tc_action));
500
501 *err = -EINVAL;
502 a->ops = tc_lookup_action(tb[TCA_ACT_KIND - 1]);
503 if (a->ops == NULL)
504 goto err_free;
505 if (a->ops->lookup == NULL)
506 goto err_mod;
507 *err = -ENOENT;
508 if (a->ops->lookup(a, index) == 0)
509 goto err_mod;
510
511 module_put(a->ops->owner);
512 *err = 0;
513 return a;
514err_mod:
515 module_put(a->ops->owner);
516err_free:
517 kfree(a);
518 return NULL;
519}
520
521static void cleanup_a(struct tc_action *act)
522{
523 struct tc_action *a;
524
525 for (a = act; a; a = act) {
526 act = a->next;
527 kfree(a);
528 }
529}
530
531static struct tc_action *create_a(int i)
532{
533 struct tc_action *act;
534
535 act = kmalloc(sizeof(*act), GFP_KERNEL);
536 if (act == NULL) {
537 printk("create_a: failed to alloc!\n");
538 return NULL;
539 }
540 memset(act, 0, sizeof(*act));
541 act->order = i;
542 return act;
543}
544
545static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
546{
547 struct sk_buff *skb;
548 unsigned char *b;
549 struct nlmsghdr *nlh;
550 struct tcamsg *t;
551 struct netlink_callback dcb;
552 struct rtattr *x;
553 struct rtattr *tb[TCA_ACT_MAX+1];
554 struct rtattr *kind;
555 struct tc_action *a = create_a(0);
556 int err = -EINVAL;
557
558 if (a == NULL) {
559 printk("tca_action_flush: couldnt create tc_action\n");
560 return err;
561 }
562
563 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
564 if (!skb) {
565 printk("tca_action_flush: failed skb alloc\n");
566 kfree(a);
567 return -ENOBUFS;
568 }
569
570 b = (unsigned char *)skb->tail;
571
572 if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0)
573 goto err_out;
574
575 kind = tb[TCA_ACT_KIND-1];
576 a->ops = tc_lookup_action(kind);
577 if (a->ops == NULL)
578 goto err_out;
579
580 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
581 t = NLMSG_DATA(nlh);
582 t->tca_family = AF_UNSPEC;
583
584 x = (struct rtattr *) skb->tail;
585 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
586
587 err = a->ops->walk(skb, &dcb, RTM_DELACTION, a);
588 if (err < 0)
589 goto rtattr_failure;
590
591 x->rta_len = skb->tail - (u8 *) x;
592
593 nlh->nlmsg_len = skb->tail - b;
594 nlh->nlmsg_flags |= NLM_F_ROOT;
595 module_put(a->ops->owner);
596 kfree(a);
597 err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
598 if (err > 0)
599 return 0;
600
601 return err;
602
603rtattr_failure:
604 module_put(a->ops->owner);
605nlmsg_failure:
606err_out:
607 kfree_skb(skb);
608 kfree(a);
609 return err;
610}
611
612static int
613tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event)
614{
615 int i, ret = 0;
616 struct rtattr *tb[TCA_ACT_MAX_PRIO+1];
617 struct tc_action *head = NULL, *act, *act_prev = NULL;
618
619 if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0)
620 return -EINVAL;
621
622 if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) {
623 if (tb[0] != NULL && tb[1] == NULL)
624 return tca_action_flush(tb[0], n, pid);
625 }
626
627 for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) {
628 act = tcf_action_get_1(tb[i], n, pid, &ret);
629 if (act == NULL)
630 goto err;
631 act->order = i+1;
632
633 if (head == NULL)
634 head = act;
635 else
636 act_prev->next = act;
637 act_prev = act;
638 }
639
640 if (event == RTM_GETACTION)
641 ret = act_get_notify(pid, n, head, event);
642 else { /* delete */
643 struct sk_buff *skb;
644
645 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
646 if (!skb) {
647 ret = -ENOBUFS;
648 goto err;
649 }
650
651 if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event,
652 0, 1) <= 0) {
653 kfree_skb(skb);
654 ret = -EINVAL;
655 goto err;
656 }
657
658 /* now do the delete */
659 tcf_action_destroy(head, 0);
660 ret = rtnetlink_send(skb, pid, RTMGRP_TC,
661 n->nlmsg_flags&NLM_F_ECHO);
662 if (ret > 0)
663 return 0;
664 return ret;
665 }
666err:
667 cleanup_a(head);
668 return ret;
669}
670
671static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
672 unsigned flags)
673{
674 struct tcamsg *t;
675 struct nlmsghdr *nlh;
676 struct sk_buff *skb;
677 struct rtattr *x;
678 unsigned char *b;
679 int err = 0;
680
681 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
682 if (!skb)
683 return -ENOBUFS;
684
685 b = (unsigned char *)skb->tail;
686
687 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t));
688 nlh->nlmsg_flags = flags;
689 t = NLMSG_DATA(nlh);
690 t->tca_family = AF_UNSPEC;
691
692 x = (struct rtattr*) skb->tail;
693 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
694
695 if (tcf_action_dump(skb, a, 0, 0) < 0)
696 goto rtattr_failure;
697
698 x->rta_len = skb->tail - (u8*)x;
699
700 nlh->nlmsg_len = skb->tail - b;
701 NETLINK_CB(skb).dst_groups = RTMGRP_TC;
702
703 err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO);
704 if (err > 0)
705 err = 0;
706 return err;
707
708rtattr_failure:
709nlmsg_failure:
710 skb_trim(skb, b - skb->data);
711 return -1;
712}
713
714
715static int
716tcf_action_add(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int ovr)
717{
718 int ret = 0;
719 struct tc_action *act;
720 struct tc_action *a;
721 u32 seq = n->nlmsg_seq;
722
723 act = tcf_action_init(rta, NULL, NULL, ovr, 0, &ret);
724 if (act == NULL)
725 goto done;
726
727 /* dump then free all the actions after update; inserted policy
728 * stays intact
729 * */
730 ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
731 for (a = act; a; a = act) {
732 act = a->next;
733 kfree(a);
734 }
735done:
736 return ret;
737}
738
739static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
740{
741 struct rtattr **tca = arg;
742 u32 pid = skb ? NETLINK_CB(skb).pid : 0;
743 int ret = 0, ovr = 0;
744
745 if (tca[TCA_ACT_TAB-1] == NULL) {
746 printk("tc_ctl_action: received NO action attribs\n");
747 return -EINVAL;
748 }
749
750 /* n->nlmsg_flags&NLM_F_CREATE
751 * */
752 switch (n->nlmsg_type) {
753 case RTM_NEWACTION:
754 /* we are going to assume all other flags
755 * imply create only if it doesn't exist
756 * Note that CREATE | EXCL implies that
757 * but since we want to avoid ambiguity (eg when flags
758 * is zero) then just set this
759 */
760 if (n->nlmsg_flags&NLM_F_REPLACE)
761 ovr = 1;
762replay:
763 ret = tcf_action_add(tca[TCA_ACT_TAB-1], n, pid, ovr);
764 if (ret == -EAGAIN)
765 goto replay;
766 break;
767 case RTM_DELACTION:
768 ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_DELACTION);
769 break;
770 case RTM_GETACTION:
771 ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_GETACTION);
772 break;
773 default:
774 BUG();
775 }
776
777 return ret;
778}
779
780static char *
781find_dump_kind(struct nlmsghdr *n)
782{
783 struct rtattr *tb1, *tb2[TCA_ACT_MAX+1];
784 struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
785 struct rtattr *rta[TCAA_MAX + 1];
786 struct rtattr *kind;
787 int min_len = NLMSG_LENGTH(sizeof(struct tcamsg));
788 int attrlen = n->nlmsg_len - NLMSG_ALIGN(min_len);
789 struct rtattr *attr = (void *) n + NLMSG_ALIGN(min_len);
790
791 if (rtattr_parse(rta, TCAA_MAX, attr, attrlen) < 0)
792 return NULL;
793 tb1 = rta[TCA_ACT_TAB - 1];
794 if (tb1 == NULL)
795 return NULL;
796
797 if (rtattr_parse(tb, TCA_ACT_MAX_PRIO, RTA_DATA(tb1),
798 NLMSG_ALIGN(RTA_PAYLOAD(tb1))) < 0)
799 return NULL;
800 if (tb[0] == NULL)
801 return NULL;
802
803 if (rtattr_parse(tb2, TCA_ACT_MAX, RTA_DATA(tb[0]),
804 RTA_PAYLOAD(tb[0])) < 0)
805 return NULL;
806 kind = tb2[TCA_ACT_KIND-1];
807
808 return (char *) RTA_DATA(kind);
809}
810
811static int
812tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
813{
814 struct nlmsghdr *nlh;
815 unsigned char *b = skb->tail;
816 struct rtattr *x;
817 struct tc_action_ops *a_o;
818 struct tc_action a;
819 int ret = 0;
820 struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh);
821 char *kind = find_dump_kind(cb->nlh);
822
823 if (kind == NULL) {
824 printk("tc_dump_action: action bad kind\n");
825 return 0;
826 }
827
828 a_o = tc_lookup_action_n(kind);
829 if (a_o == NULL) {
830 printk("failed to find %s\n", kind);
831 return 0;
832 }
833
834 memset(&a, 0, sizeof(struct tc_action));
835 a.ops = a_o;
836
837 if (a_o->walk == NULL) {
838 printk("tc_dump_action: %s !capable of dumping table\n", kind);
839 goto rtattr_failure;
840 }
841
842 nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
843 cb->nlh->nlmsg_type, sizeof(*t));
844 t = NLMSG_DATA(nlh);
845 t->tca_family = AF_UNSPEC;
846
847 x = (struct rtattr *) skb->tail;
848 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
849
850 ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
851 if (ret < 0)
852 goto rtattr_failure;
853
854 if (ret > 0) {
855 x->rta_len = skb->tail - (u8 *) x;
856 ret = skb->len;
857 } else
858 skb_trim(skb, (u8*)x - skb->data);
859
860 nlh->nlmsg_len = skb->tail - b;
861 if (NETLINK_CB(cb->skb).pid && ret)
862 nlh->nlmsg_flags |= NLM_F_MULTI;
863 module_put(a_o->owner);
864 return skb->len;
865
866rtattr_failure:
867nlmsg_failure:
868 module_put(a_o->owner);
869 skb_trim(skb, b - skb->data);
870 return skb->len;
871}
872
873static int __init tc_action_init(void)
874{
875 struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC];
876
877 if (link_p) {
878 link_p[RTM_NEWACTION-RTM_BASE].doit = tc_ctl_action;
879 link_p[RTM_DELACTION-RTM_BASE].doit = tc_ctl_action;
880 link_p[RTM_GETACTION-RTM_BASE].doit = tc_ctl_action;
881 link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action;
882 }
883
884 printk("TC classifier action (bugs to netdev@oss.sgi.com cc "
885 "hadi@cyberus.ca)\n");
886 return 0;
887}
888
889subsys_initcall(tc_action_init);
890
891EXPORT_SYMBOL(tcf_register_action);
892EXPORT_SYMBOL(tcf_unregister_action);
893EXPORT_SYMBOL(tcf_action_exec);
894EXPORT_SYMBOL(tcf_action_dump_1);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
new file mode 100644
index 000000000000..56e66c3fe0fa
--- /dev/null
+++ b/net/sched/cls_api.c
@@ -0,0 +1,642 @@
1/*
2 * net/sched/cls_api.c Packet classifier API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Changes:
12 *
13 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
14 *
15 */
16
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <linux/bitops.h>
20#include <linux/config.h>
21#include <linux/module.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/interrupt.h>
32#include <linux/netdevice.h>
33#include <linux/skbuff.h>
34#include <linux/rtnetlink.h>
35#include <linux/init.h>
36#include <linux/kmod.h>
37#include <net/sock.h>
38#include <net/pkt_sched.h>
39#include <net/pkt_cls.h>
40
41#if 0 /* control */
42#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
43#else
44#define DPRINTK(format,args...)
45#endif
46
47/* The list of all installed classifier types */
48
49static struct tcf_proto_ops *tcf_proto_base;
50
51/* Protects list of registered TC modules. It is pure SMP lock. */
52static DEFINE_RWLOCK(cls_mod_lock);
53
54/* Find classifier type by string name */
55
56static struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind)
57{
58 struct tcf_proto_ops *t = NULL;
59
60 if (kind) {
61 read_lock(&cls_mod_lock);
62 for (t = tcf_proto_base; t; t = t->next) {
63 if (rtattr_strcmp(kind, t->kind) == 0) {
64 if (!try_module_get(t->owner))
65 t = NULL;
66 break;
67 }
68 }
69 read_unlock(&cls_mod_lock);
70 }
71 return t;
72}
73
74/* Register(unregister) new classifier type */
75
76int register_tcf_proto_ops(struct tcf_proto_ops *ops)
77{
78 struct tcf_proto_ops *t, **tp;
79 int rc = -EEXIST;
80
81 write_lock(&cls_mod_lock);
82 for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)
83 if (!strcmp(ops->kind, t->kind))
84 goto out;
85
86 ops->next = NULL;
87 *tp = ops;
88 rc = 0;
89out:
90 write_unlock(&cls_mod_lock);
91 return rc;
92}
93
94int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
95{
96 struct tcf_proto_ops *t, **tp;
97 int rc = -ENOENT;
98
99 write_lock(&cls_mod_lock);
100 for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next)
101 if (t == ops)
102 break;
103
104 if (!t)
105 goto out;
106 *tp = t->next;
107 rc = 0;
108out:
109 write_unlock(&cls_mod_lock);
110 return rc;
111}
112
113static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
114 struct tcf_proto *tp, unsigned long fh, int event);
115
116
117/* Select new prio value from the range, managed by kernel. */
118
119static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp)
120{
121 u32 first = TC_H_MAKE(0xC0000000U,0U);
122
123 if (tp)
124 first = tp->prio-1;
125
126 return first;
127}
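/*
 * Worked illustration, not part of this patch: tcm_info, as decoded at
 * the top of tc_ctl_tfilter() below, packs the filter priority into its
 * upper 16 bits and the link-layer protocol (ETH_P_*) into its lower 16
 * bits (TC_H_MAJ()/TC_H_MIN()/TC_H_MAKE() from <linux/pkt_sched.h>).
 * For a request along the lines of "prio 3 protocol ip":
 *
 *	u32 info = TC_H_MAKE(3 << 16, htons(ETH_P_IP));
 *
 *	TC_H_MAJ(info) == 0x00030000		(priority the chain is keyed on)
 *	TC_H_MIN(info) == htons(ETH_P_IP)	(protocol)
 *
 * When no priority is supplied with NLM_F_CREATE, tc_ctl_tfilter() starts
 * from TC_H_MAKE(0x80000000U, 0U); tcf_auto_prio() above returns
 * TC_H_MAKE(0xC0000000U, 0U) when handed a NULL chain position and one
 * less than the following filter's priority otherwise.
 */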
128
129/* Add/change/delete/get a filter node */
130
131static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
132{
133 struct rtattr **tca;
134 struct tcmsg *t;
135 u32 protocol;
136 u32 prio;
137 u32 nprio;
138 u32 parent;
139 struct net_device *dev;
140 struct Qdisc *q;
141 struct tcf_proto **back, **chain;
142 struct tcf_proto *tp;
143 struct tcf_proto_ops *tp_ops;
144 struct Qdisc_class_ops *cops;
145 unsigned long cl;
146 unsigned long fh;
147 int err;
148
149replay:
150 tca = arg;
151 t = NLMSG_DATA(n);
152 protocol = TC_H_MIN(t->tcm_info);
153 prio = TC_H_MAJ(t->tcm_info);
154 nprio = prio;
155 parent = t->tcm_parent;
156 cl = 0;
157
158 if (prio == 0) {
159 /* If no priority is given, user wants we allocated it. */
160 if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
161 return -ENOENT;
162 prio = TC_H_MAKE(0x80000000U,0U);
163 }
164
165 /* Find head of filter chain. */
166
167 /* Find link */
168 if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL)
169 return -ENODEV;
170
171 /* Find qdisc */
172 if (!parent) {
173 q = dev->qdisc_sleeping;
174 parent = q->handle;
175 } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL)
176 return -EINVAL;
177
178 /* Is it classful? */
179 if ((cops = q->ops->cl_ops) == NULL)
180 return -EINVAL;
181
182 /* Do we search for filter, attached to class? */
183 if (TC_H_MIN(parent)) {
184 cl = cops->get(q, parent);
185 if (cl == 0)
186 return -ENOENT;
187 }
188
189 /* And the last stroke */
190 chain = cops->tcf_chain(q, cl);
191 err = -EINVAL;
192 if (chain == NULL)
193 goto errout;
194
195 /* Check the chain for existence of proto-tcf with this priority */
196 for (back = chain; (tp=*back) != NULL; back = &tp->next) {
197 if (tp->prio >= prio) {
198 if (tp->prio == prio) {
199 if (!nprio || (tp->protocol != protocol && protocol))
200 goto errout;
201 } else
202 tp = NULL;
203 break;
204 }
205 }
206
207 if (tp == NULL) {
208 /* Proto-tcf does not exist, create new one */
209
210 if (tca[TCA_KIND-1] == NULL || !protocol)
211 goto errout;
212
213 err = -ENOENT;
214 if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
215 goto errout;
216
217
218 /* Create new proto tcf */
219
220 err = -ENOBUFS;
221 if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL)
222 goto errout;
223 err = -EINVAL;
224 tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]);
225 if (tp_ops == NULL) {
226#ifdef CONFIG_KMOD
227 struct rtattr *kind = tca[TCA_KIND-1];
228 char name[IFNAMSIZ];
229
230 if (kind != NULL &&
231 rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
232 rtnl_unlock();
233 request_module("cls_%s", name);
234 rtnl_lock();
235 tp_ops = tcf_proto_lookup_ops(kind);
236 /* We dropped the RTNL semaphore in order to
237 * perform the module load. So, even if we
238 * succeeded in loading the module we have to
239 * replay the request. We indicate this using
240 * -EAGAIN.
241 */
242 if (tp_ops != NULL) {
243 module_put(tp_ops->owner);
244 err = -EAGAIN;
245 }
246 }
247#endif
248 kfree(tp);
249 goto errout;
250 }
251 memset(tp, 0, sizeof(*tp));
252 tp->ops = tp_ops;
253 tp->protocol = protocol;
254 tp->prio = nprio ? : tcf_auto_prio(*back);
255 tp->q = q;
256 tp->classify = tp_ops->classify;
257 tp->classid = parent;
258 if ((err = tp_ops->init(tp)) != 0) {
259 module_put(tp_ops->owner);
260 kfree(tp);
261 goto errout;
262 }
263
264 qdisc_lock_tree(dev);
265 tp->next = *back;
266 *back = tp;
267 qdisc_unlock_tree(dev);
268
269 } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind))
270 goto errout;
271
272 fh = tp->ops->get(tp, t->tcm_handle);
273
274 if (fh == 0) {
275 if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
276 qdisc_lock_tree(dev);
277 *back = tp->next;
278 qdisc_unlock_tree(dev);
279
280 tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
281 tcf_destroy(tp);
282 err = 0;
283 goto errout;
284 }
285
286 err = -ENOENT;
287 if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
288 goto errout;
289 } else {
290 switch (n->nlmsg_type) {
291 case RTM_NEWTFILTER:
292 err = -EEXIST;
293 if (n->nlmsg_flags&NLM_F_EXCL)
294 goto errout;
295 break;
296 case RTM_DELTFILTER:
297 err = tp->ops->delete(tp, fh);
298 if (err == 0)
299 tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
300 goto errout;
301 case RTM_GETTFILTER:
302 err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
303 goto errout;
304 default:
305 err = -EINVAL;
306 goto errout;
307 }
308 }
309
310 err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
311 if (err == 0)
312 tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
313
314errout:
315 if (cl)
316 cops->put(q, cl);
317 if (err == -EAGAIN)
318 /* Replay the request. */
319 goto replay;
320 return err;
321}
322
323static int
324tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
325 u32 pid, u32 seq, unsigned flags, int event)
326{
327 struct tcmsg *tcm;
328 struct nlmsghdr *nlh;
329 unsigned char *b = skb->tail;
330
331 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
332 nlh->nlmsg_flags = flags;
333 tcm = NLMSG_DATA(nlh);
334 tcm->tcm_family = AF_UNSPEC;
335 tcm->tcm_ifindex = tp->q->dev->ifindex;
336 tcm->tcm_parent = tp->classid;
337 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
338 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind);
339 tcm->tcm_handle = fh;
340 if (RTM_DELTFILTER != event) {
341 tcm->tcm_handle = 0;
342 if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0)
343 goto rtattr_failure;
344 }
345 nlh->nlmsg_len = skb->tail - b;
346 return skb->len;
347
348nlmsg_failure:
349rtattr_failure:
350 skb_trim(skb, b - skb->data);
351 return -1;
352}
353
354static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
355 struct tcf_proto *tp, unsigned long fh, int event)
356{
357 struct sk_buff *skb;
358 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
359
360 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
361 if (!skb)
362 return -ENOBUFS;
363
364 if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) {
365 kfree_skb(skb);
366 return -EINVAL;
367 }
368
369 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
370}
371
372struct tcf_dump_args
373{
374 struct tcf_walker w;
375 struct sk_buff *skb;
376 struct netlink_callback *cb;
377};
378
379static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg)
380{
381 struct tcf_dump_args *a = (void*)arg;
382
383 return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid,
384 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
385}
386
387static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
388{
389 int t;
390 int s_t;
391 struct net_device *dev;
392 struct Qdisc *q;
393 struct tcf_proto *tp, **chain;
394 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
395 unsigned long cl = 0;
396 struct Qdisc_class_ops *cops;
397 struct tcf_dump_args arg;
398
399 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
400 return skb->len;
401 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
402 return skb->len;
403
404 read_lock_bh(&qdisc_tree_lock);
405 if (!tcm->tcm_parent)
406 q = dev->qdisc_sleeping;
407 else
408 q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
409 if (!q)
410 goto out;
411 if ((cops = q->ops->cl_ops) == NULL)
412 goto errout;
413 if (TC_H_MIN(tcm->tcm_parent)) {
414 cl = cops->get(q, tcm->tcm_parent);
415 if (cl == 0)
416 goto errout;
417 }
418 chain = cops->tcf_chain(q, cl);
419 if (chain == NULL)
420 goto errout;
421
422 s_t = cb->args[0];
423
424 for (tp=*chain, t=0; tp; tp = tp->next, t++) {
425 if (t < s_t) continue;
426 if (TC_H_MAJ(tcm->tcm_info) &&
427 TC_H_MAJ(tcm->tcm_info) != tp->prio)
428 continue;
429 if (TC_H_MIN(tcm->tcm_info) &&
430 TC_H_MIN(tcm->tcm_info) != tp->protocol)
431 continue;
432 if (t > s_t)
433 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
434 if (cb->args[1] == 0) {
435 if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
436 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) {
437 break;
438 }
439 cb->args[1] = 1;
440 }
441 if (tp->ops->walk == NULL)
442 continue;
443 arg.w.fn = tcf_node_dump;
444 arg.skb = skb;
445 arg.cb = cb;
446 arg.w.stop = 0;
447 arg.w.skip = cb->args[1]-1;
448 arg.w.count = 0;
449 tp->ops->walk(tp, &arg.w);
450 cb->args[1] = arg.w.count+1;
451 if (arg.w.stop)
452 break;
453 }
454
455 cb->args[0] = t;
456
457errout:
458 if (cl)
459 cops->put(q, cl);
460out:
461 read_unlock_bh(&qdisc_tree_lock);
462 dev_put(dev);
463 return skb->len;
464}
465
466void
467tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)
468{
469#ifdef CONFIG_NET_CLS_ACT
470 if (exts->action) {
471 tcf_action_destroy(exts->action, TCA_ACT_UNBIND);
472 exts->action = NULL;
473 }
474#elif defined CONFIG_NET_CLS_POLICE
475 if (exts->police) {
476 tcf_police_release(exts->police, TCA_ACT_UNBIND);
477 exts->police = NULL;
478 }
479#endif
480}
481
482
483int
484tcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb,
485 struct rtattr *rate_tlv, struct tcf_exts *exts,
486 struct tcf_ext_map *map)
487{
488 memset(exts, 0, sizeof(*exts));
489
490#ifdef CONFIG_NET_CLS_ACT
491 {
492 int err;
493 struct tc_action *act;
494
495 if (map->police && tb[map->police-1]) {
496 act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police",
497 TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err);
498 if (act == NULL)
499 return err;
500
501 act->type = TCA_OLD_COMPAT;
502 exts->action = act;
503 } else if (map->action && tb[map->action-1]) {
504 act = tcf_action_init(tb[map->action-1], rate_tlv, NULL,
505 TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err);
506 if (act == NULL)
507 return err;
508
509 exts->action = act;
510 }
511 }
512#elif defined CONFIG_NET_CLS_POLICE
513 if (map->police && tb[map->police-1]) {
514 struct tcf_police *p;
515
516 p = tcf_police_locate(tb[map->police-1], rate_tlv);
517 if (p == NULL)
518 return -EINVAL;
519
520 exts->police = p;
521 } else if (map->action && tb[map->action-1])
522 return -EOPNOTSUPP;
523#else
524 if ((map->action && tb[map->action-1]) ||
525 (map->police && tb[map->police-1]))
526 return -EOPNOTSUPP;
527#endif
528
529 return 0;
530}
531
532void
533tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
534 struct tcf_exts *src)
535{
536#ifdef CONFIG_NET_CLS_ACT
537 if (src->action) {
538 struct tc_action *act;
539 tcf_tree_lock(tp);
540 act = xchg(&dst->action, src->action);
541 tcf_tree_unlock(tp);
542 if (act)
543 tcf_action_destroy(act, TCA_ACT_UNBIND);
544 }
545#elif defined CONFIG_NET_CLS_POLICE
546 if (src->police) {
547 struct tcf_police *p;
548 tcf_tree_lock(tp);
549 p = xchg(&dst->police, src->police);
550 tcf_tree_unlock(tp);
551 if (p)
552 tcf_police_release(p, TCA_ACT_UNBIND);
553 }
554#endif
555}
556
557int
558tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts,
559 struct tcf_ext_map *map)
560{
561#ifdef CONFIG_NET_CLS_ACT
562 if (map->action && exts->action) {
563 /*
564 * again for backward compatible mode - we want
565 * to work with both old and new modes of entering
566 * tc data even if iproute2 was newer - jhs
567 */
568 struct rtattr * p_rta = (struct rtattr*) skb->tail;
569
570 if (exts->action->type != TCA_OLD_COMPAT) {
571 RTA_PUT(skb, map->action, 0, NULL);
572 if (tcf_action_dump(skb, exts->action, 0, 0) < 0)
573 goto rtattr_failure;
574 p_rta->rta_len = skb->tail - (u8*)p_rta;
575 } else if (map->police) {
576 RTA_PUT(skb, map->police, 0, NULL);
577 if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0)
578 goto rtattr_failure;
579 p_rta->rta_len = skb->tail - (u8*)p_rta;
580 }
581 }
582#elif defined CONFIG_NET_CLS_POLICE
583 if (map->police && exts->police) {
584 struct rtattr * p_rta = (struct rtattr*) skb->tail;
585
586 RTA_PUT(skb, map->police, 0, NULL);
587
588 if (tcf_police_dump(skb, exts->police) < 0)
589 goto rtattr_failure;
590
591 p_rta->rta_len = skb->tail - (u8*)p_rta;
592 }
593#endif
594 return 0;
595rtattr_failure: __attribute__ ((unused))
596 return -1;
597}
598
599int
600tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
601 struct tcf_ext_map *map)
602{
603#ifdef CONFIG_NET_CLS_ACT
604 if (exts->action)
605 if (tcf_action_copy_stats(skb, exts->action, 1) < 0)
606 goto rtattr_failure;
607#elif defined CONFIG_NET_CLS_POLICE
608 if (exts->police)
609 if (tcf_police_dump_stats(skb, exts->police) < 0)
610 goto rtattr_failure;
611#endif
612 return 0;
613rtattr_failure: __attribute__ ((unused))
614 return -1;
615}
616
617static int __init tc_filter_init(void)
618{
619 struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC];
620
621	/* Set up rtnetlink links. It is done here to avoid
622	   exporting a large number of public symbols.
623 */
624
625 if (link_p) {
626 link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
627 link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
628 link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter;
629 link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter;
630 }
631 return 0;
632}
633
634subsys_initcall(tc_filter_init);
635
636EXPORT_SYMBOL(register_tcf_proto_ops);
637EXPORT_SYMBOL(unregister_tcf_proto_ops);
638EXPORT_SYMBOL(tcf_exts_validate);
639EXPORT_SYMBOL(tcf_exts_destroy);
640EXPORT_SYMBOL(tcf_exts_change);
641EXPORT_SYMBOL(tcf_exts_dump);
642EXPORT_SYMBOL(tcf_exts_dump_stats);
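
[Editor's illustration] tc_filter_init() above wires the three RTM_*TFILTER message types into one protocol-independent handler table instead of exporting a symbol per operation. Below is a minimal user-space sketch of that dispatch-table pattern; the message names, handler names and payloads are invented for the example and are not kernel APIs.

/*
 * Sketch of a doit/dumpit dispatch table, mirroring how tc_filter_init()
 * fills the rtnetlink_links[] slots.  Not kernel code.
 */
#include <stdio.h>

struct msg_link {
	int (*doit)(int msgtype);	/* handle a single request message */
	int (*dumpit)(void);		/* dump the whole table for a GET */
};

enum { MSG_NEWFILTER, MSG_DELFILTER, MSG_GETFILTER, MSG_MAX };

static int ctl_filter(int msgtype)
{
	printf("ctl_filter: handling message type %d\n", msgtype);
	return 0;
}

static int dump_filter(void)
{
	printf("dump_filter: walking every installed filter\n");
	return 0;
}

static struct msg_link links[MSG_MAX];

int main(void)
{
	/* Fill the slots once at start-up, as tc_filter_init() does. */
	links[MSG_NEWFILTER].doit   = ctl_filter;
	links[MSG_DELFILTER].doit   = ctl_filter;
	links[MSG_GETFILTER].doit   = ctl_filter;
	links[MSG_GETFILTER].dumpit = dump_filter;

	/* Incoming messages are then dispatched through the table;
	 * a GET with dump semantics uses the dumpit slot. */
	links[MSG_NEWFILTER].doit(MSG_NEWFILTER);
	links[MSG_GETFILTER].dumpit();
	return 0;
}
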
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
new file mode 100644
index 000000000000..0d2d4415f334
--- /dev/null
+++ b/net/sched/cls_basic.c
@@ -0,0 +1,303 @@
1/*
2 * net/sched/cls_basic.c Basic Packet Classifier.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/mm.h>
19#include <linux/errno.h>
20#include <linux/rtnetlink.h>
21#include <linux/skbuff.h>
22#include <net/act_api.h>
23#include <net/pkt_cls.h>
24
25struct basic_head
26{
27 u32 hgenerator;
28 struct list_head flist;
29};
30
31struct basic_filter
32{
33 u32 handle;
34 struct tcf_exts exts;
35 struct tcf_ematch_tree ematches;
36 struct tcf_result res;
37 struct list_head link;
38};
39
40static struct tcf_ext_map basic_ext_map = {
41 .action = TCA_BASIC_ACT,
42 .police = TCA_BASIC_POLICE
43};
44
45static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp,
46 struct tcf_result *res)
47{
48 int r;
49 struct basic_head *head = (struct basic_head *) tp->root;
50 struct basic_filter *f;
51
52 list_for_each_entry(f, &head->flist, link) {
53 if (!tcf_em_tree_match(skb, &f->ematches, NULL))
54 continue;
55 *res = f->res;
56 r = tcf_exts_exec(skb, &f->exts, res);
57 if (r < 0)
58 continue;
59 return r;
60 }
61 return -1;
62}
63
64static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
65{
66 unsigned long l = 0UL;
67 struct basic_head *head = (struct basic_head *) tp->root;
68 struct basic_filter *f;
69
70 if (head == NULL)
71 return 0UL;
72
73 list_for_each_entry(f, &head->flist, link)
74 if (f->handle == handle)
75 l = (unsigned long) f;
76
77 return l;
78}
79
80static void basic_put(struct tcf_proto *tp, unsigned long f)
81{
82}
83
84static int basic_init(struct tcf_proto *tp)
85{
86 return 0;
87}
88
89static inline void basic_delete_filter(struct tcf_proto *tp,
90 struct basic_filter *f)
91{
92 tcf_unbind_filter(tp, &f->res);
93 tcf_exts_destroy(tp, &f->exts);
94 tcf_em_tree_destroy(tp, &f->ematches);
95 kfree(f);
96}
97
98static void basic_destroy(struct tcf_proto *tp)
99{
100 struct basic_head *head = (struct basic_head *) xchg(&tp->root, NULL);
101 struct basic_filter *f, *n;
102
103 list_for_each_entry_safe(f, n, &head->flist, link) {
104 list_del(&f->link);
105 basic_delete_filter(tp, f);
106 }
107}
108
109static int basic_delete(struct tcf_proto *tp, unsigned long arg)
110{
111 struct basic_head *head = (struct basic_head *) tp->root;
112 struct basic_filter *t, *f = (struct basic_filter *) arg;
113
114 list_for_each_entry(t, &head->flist, link)
115 if (t == f) {
116 tcf_tree_lock(tp);
117 list_del(&t->link);
118 tcf_tree_unlock(tp);
119 basic_delete_filter(tp, t);
120 return 0;
121 }
122
123 return -ENOENT;
124}
125
126static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
127 unsigned long base, struct rtattr **tb,
128 struct rtattr *est)
129{
130 int err = -EINVAL;
131 struct tcf_exts e;
132 struct tcf_ematch_tree t;
133
134 if (tb[TCA_BASIC_CLASSID-1])
135 if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32))
136 return err;
137
138 err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map);
139 if (err < 0)
140 return err;
141
142 err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t);
143 if (err < 0)
144 goto errout;
145
146 if (tb[TCA_BASIC_CLASSID-1]) {
147 f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]);
148 tcf_bind_filter(tp, &f->res, base);
149 }
150
151 tcf_exts_change(tp, &f->exts, &e);
152 tcf_em_tree_change(tp, &f->ematches, &t);
153
154 return 0;
155errout:
156 tcf_exts_destroy(tp, &e);
157 return err;
158}
159
160static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
161 struct rtattr **tca, unsigned long *arg)
162{
163 int err = -EINVAL;
164 struct basic_head *head = (struct basic_head *) tp->root;
165 struct rtattr *tb[TCA_BASIC_MAX];
166 struct basic_filter *f = (struct basic_filter *) *arg;
167
168 if (tca[TCA_OPTIONS-1] == NULL)
169 return -EINVAL;
170
171 if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0)
172 return -EINVAL;
173
174 if (f != NULL) {
175 if (handle && f->handle != handle)
176 return -EINVAL;
177 return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]);
178 }
179
180 err = -ENOBUFS;
181 if (head == NULL) {
182 head = kmalloc(sizeof(*head), GFP_KERNEL);
183 if (head == NULL)
184 goto errout;
185
186 memset(head, 0, sizeof(*head));
187 INIT_LIST_HEAD(&head->flist);
188 tp->root = head;
189 }
190
191 f = kmalloc(sizeof(*f), GFP_KERNEL);
192 if (f == NULL)
193 goto errout;
194 memset(f, 0, sizeof(*f));
195
196 err = -EINVAL;
197 if (handle)
198 f->handle = handle;
199 else {
200 int i = 0x80000000;
201 do {
202 if (++head->hgenerator == 0x7FFFFFFF)
203 head->hgenerator = 1;
204 } while (--i > 0 && basic_get(tp, head->hgenerator));
205
206 if (i <= 0) {
207 printk(KERN_ERR "Insufficient number of handles\n");
208 goto errout;
209 }
210
211 f->handle = head->hgenerator;
212 }
213
214 err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]);
215 if (err < 0)
216 goto errout;
217
218 tcf_tree_lock(tp);
219 list_add(&f->link, &head->flist);
220 tcf_tree_unlock(tp);
221 *arg = (unsigned long) f;
222
223 return 0;
224errout:
225 if (*arg == 0UL && f)
226 kfree(f);
227
228 return err;
229}
230
231static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
232{
233 struct basic_head *head = (struct basic_head *) tp->root;
234 struct basic_filter *f;
235
236 list_for_each_entry(f, &head->flist, link) {
237 if (arg->count < arg->skip)
238 goto skip;
239
240 if (arg->fn(tp, (unsigned long) f, arg) < 0) {
241 arg->stop = 1;
242 break;
243 }
244skip:
245 arg->count++;
246 }
247}
248
249static int basic_dump(struct tcf_proto *tp, unsigned long fh,
250 struct sk_buff *skb, struct tcmsg *t)
251{
252 struct basic_filter *f = (struct basic_filter *) fh;
253 unsigned char *b = skb->tail;
254 struct rtattr *rta;
255
256 if (f == NULL)
257 return skb->len;
258
259 t->tcm_handle = f->handle;
260
261 rta = (struct rtattr *) b;
262 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
263
264 if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
265 tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
266 goto rtattr_failure;
267
268 rta->rta_len = (skb->tail - b);
269 return skb->len;
270
271rtattr_failure:
272 skb_trim(skb, b - skb->data);
273 return -1;
274}
275
276static struct tcf_proto_ops cls_basic_ops = {
277 .kind = "basic",
278 .classify = basic_classify,
279 .init = basic_init,
280 .destroy = basic_destroy,
281 .get = basic_get,
282 .put = basic_put,
283 .change = basic_change,
284 .delete = basic_delete,
285 .walk = basic_walk,
286 .dump = basic_dump,
287 .owner = THIS_MODULE,
288};
289
290static int __init init_basic(void)
291{
292 return register_tcf_proto_ops(&cls_basic_ops);
293}
294
295static void __exit exit_basic(void)
296{
297 unregister_tcf_proto_ops(&cls_basic_ops);
298}
299
300module_init(init_basic)
301module_exit(exit_basic)
302MODULE_LICENSE("GPL");
303
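
[Editor's illustration] When the caller supplies no handle, basic_change() above allocates one by bumping a wrap-around counter until basic_get() reports the value as free, giving up after a bounded number of attempts. The stand-alone sketch below shows only that pattern; the bitmap lookup and the tiny limits are illustrative stand-ins for the kernel's filter list and 31-bit range.

/* Sketch of the bounded handle-generator loop in basic_change().  Toy sizes. */
#include <stdio.h>
#include <stdbool.h>

#define MAX_HANDLES 8
static bool in_use[MAX_HANDLES];	/* stand-in for basic_get() lookups */
static unsigned int hgenerator;

static bool handle_in_use(unsigned int h)
{
	return h < MAX_HANDLES && in_use[h];
}

static unsigned int gen_handle(void)
{
	int i = MAX_HANDLES;	/* the kernel loop allows up to 0x80000000 tries */

	do {
		if (++hgenerator >= MAX_HANDLES)	/* wrap before overflow,
							   as the 0x7FFFFFFF check does */
			hgenerator = 1;
	} while (--i > 0 && handle_in_use(hgenerator));

	return i > 0 ? hgenerator : 0;	/* 0 means the space is exhausted */
}

int main(void)
{
	in_use[1] = in_use[2] = true;
	printf("allocated handle %u\n", gen_handle());	/* prints 3 */
	return 0;
}
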
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
new file mode 100644
index 000000000000..fdfc83af3d1f
--- /dev/null
+++ b/net/sched/cls_fw.c
@@ -0,0 +1,378 @@
1/*
2 * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Changes:
12 * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
13 * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filters (and the kernel).
14 * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
15 *
16 * JHS: We should remove the CONFIG_NET_CLS_IND from here
17 * eventually when the meta match extension is made available
18 *
19 */
20
21#include <linux/config.h>
22#include <linux/module.h>
23#include <asm/uaccess.h>
24#include <asm/system.h>
25#include <linux/bitops.h>
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/sched.h>
29#include <linux/string.h>
30#include <linux/mm.h>
31#include <linux/socket.h>
32#include <linux/sockios.h>
33#include <linux/in.h>
34#include <linux/errno.h>
35#include <linux/interrupt.h>
36#include <linux/if_ether.h>
37#include <linux/inet.h>
38#include <linux/netdevice.h>
39#include <linux/etherdevice.h>
40#include <linux/notifier.h>
41#include <linux/netfilter.h>
42#include <net/ip.h>
43#include <net/route.h>
44#include <linux/skbuff.h>
45#include <net/sock.h>
46#include <net/act_api.h>
47#include <net/pkt_cls.h>
48
49struct fw_head
50{
51 struct fw_filter *ht[256];
52};
53
54struct fw_filter
55{
56 struct fw_filter *next;
57 u32 id;
58 struct tcf_result res;
59#ifdef CONFIG_NET_CLS_IND
60 char indev[IFNAMSIZ];
61#endif /* CONFIG_NET_CLS_IND */
62 struct tcf_exts exts;
63};
64
65static struct tcf_ext_map fw_ext_map = {
66 .action = TCA_FW_ACT,
67 .police = TCA_FW_POLICE
68};
69
70static __inline__ int fw_hash(u32 handle)
71{
72 return handle&0xFF;
73}
74
75static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
76 struct tcf_result *res)
77{
78 struct fw_head *head = (struct fw_head*)tp->root;
79 struct fw_filter *f;
80 int r;
81#ifdef CONFIG_NETFILTER
82 u32 id = skb->nfmark;
83#else
84 u32 id = 0;
85#endif
86
87 if (head != NULL) {
88 for (f=head->ht[fw_hash(id)]; f; f=f->next) {
89 if (f->id == id) {
90 *res = f->res;
91#ifdef CONFIG_NET_CLS_IND
92 if (!tcf_match_indev(skb, f->indev))
93 continue;
94#endif /* CONFIG_NET_CLS_IND */
95 r = tcf_exts_exec(skb, &f->exts, res);
96 if (r < 0)
97 continue;
98
99 return r;
100 }
101 }
102 } else {
103 /* old method */
104 if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) {
105 res->classid = id;
106 res->class = 0;
107 return 0;
108 }
109 }
110
111 return -1;
112}
113
114static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
115{
116 struct fw_head *head = (struct fw_head*)tp->root;
117 struct fw_filter *f;
118
119 if (head == NULL)
120 return 0;
121
122 for (f=head->ht[fw_hash(handle)]; f; f=f->next) {
123 if (f->id == handle)
124 return (unsigned long)f;
125 }
126 return 0;
127}
128
129static void fw_put(struct tcf_proto *tp, unsigned long f)
130{
131}
132
133static int fw_init(struct tcf_proto *tp)
134{
135 return 0;
136}
137
138static inline void
139fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
140{
141 tcf_unbind_filter(tp, &f->res);
142 tcf_exts_destroy(tp, &f->exts);
143 kfree(f);
144}
145
146static void fw_destroy(struct tcf_proto *tp)
147{
148 struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL);
149 struct fw_filter *f;
150 int h;
151
152 if (head == NULL)
153 return;
154
155 for (h=0; h<256; h++) {
156 while ((f=head->ht[h]) != NULL) {
157 head->ht[h] = f->next;
158 fw_delete_filter(tp, f);
159 }
160 }
161 kfree(head);
162}
163
164static int fw_delete(struct tcf_proto *tp, unsigned long arg)
165{
166 struct fw_head *head = (struct fw_head*)tp->root;
167 struct fw_filter *f = (struct fw_filter*)arg;
168 struct fw_filter **fp;
169
170 if (head == NULL || f == NULL)
171 goto out;
172
173 for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
174 if (*fp == f) {
175 tcf_tree_lock(tp);
176 *fp = f->next;
177 tcf_tree_unlock(tp);
178 fw_delete_filter(tp, f);
179 return 0;
180 }
181 }
182out:
183 return -EINVAL;
184}
185
186static int
187fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
188 struct rtattr **tb, struct rtattr **tca, unsigned long base)
189{
190 struct tcf_exts e;
191 int err;
192
193 err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map);
194 if (err < 0)
195 return err;
196
197 err = -EINVAL;
198 if (tb[TCA_FW_CLASSID-1]) {
199 if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32))
200 goto errout;
201 f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]);
202 tcf_bind_filter(tp, &f->res, base);
203 }
204
205#ifdef CONFIG_NET_CLS_IND
206 if (tb[TCA_FW_INDEV-1]) {
207 err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]);
208 if (err < 0)
209 goto errout;
210 }
211#endif /* CONFIG_NET_CLS_IND */
212
213 tcf_exts_change(tp, &f->exts, &e);
214
215 return 0;
216errout:
217 tcf_exts_destroy(tp, &e);
218 return err;
219}
220
221static int fw_change(struct tcf_proto *tp, unsigned long base,
222 u32 handle,
223 struct rtattr **tca,
224 unsigned long *arg)
225{
226 struct fw_head *head = (struct fw_head*)tp->root;
227 struct fw_filter *f = (struct fw_filter *) *arg;
228 struct rtattr *opt = tca[TCA_OPTIONS-1];
229 struct rtattr *tb[TCA_FW_MAX];
230 int err;
231
232 if (!opt)
233 return handle ? -EINVAL : 0;
234
235 if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0)
236 return -EINVAL;
237
238 if (f != NULL) {
239 if (f->id != handle && handle)
240 return -EINVAL;
241 return fw_change_attrs(tp, f, tb, tca, base);
242 }
243
244 if (!handle)
245 return -EINVAL;
246
247 if (head == NULL) {
248 head = kmalloc(sizeof(struct fw_head), GFP_KERNEL);
249 if (head == NULL)
250 return -ENOBUFS;
251 memset(head, 0, sizeof(*head));
252
253 tcf_tree_lock(tp);
254 tp->root = head;
255 tcf_tree_unlock(tp);
256 }
257
258 f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL);
259 if (f == NULL)
260 return -ENOBUFS;
261 memset(f, 0, sizeof(*f));
262
263 f->id = handle;
264
265 err = fw_change_attrs(tp, f, tb, tca, base);
266 if (err < 0)
267 goto errout;
268
269 f->next = head->ht[fw_hash(handle)];
270 tcf_tree_lock(tp);
271 head->ht[fw_hash(handle)] = f;
272 tcf_tree_unlock(tp);
273
274 *arg = (unsigned long)f;
275 return 0;
276
277errout:
278 if (f)
279 kfree(f);
280 return err;
281}
282
283static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
284{
285 struct fw_head *head = (struct fw_head*)tp->root;
286 int h;
287
288 if (head == NULL)
289 arg->stop = 1;
290
291 if (arg->stop)
292 return;
293
294 for (h = 0; h < 256; h++) {
295 struct fw_filter *f;
296
297 for (f = head->ht[h]; f; f = f->next) {
298 if (arg->count < arg->skip) {
299 arg->count++;
300 continue;
301 }
302 if (arg->fn(tp, (unsigned long)f, arg) < 0) {
303 arg->stop = 1;
304 return;
305 }
306 arg->count++;
307 }
308 }
309}
310
311static int fw_dump(struct tcf_proto *tp, unsigned long fh,
312 struct sk_buff *skb, struct tcmsg *t)
313{
314 struct fw_filter *f = (struct fw_filter*)fh;
315 unsigned char *b = skb->tail;
316 struct rtattr *rta;
317
318 if (f == NULL)
319 return skb->len;
320
321 t->tcm_handle = f->id;
322
323 if (!f->res.classid && !tcf_exts_is_available(&f->exts))
324 return skb->len;
325
326 rta = (struct rtattr*)b;
327 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
328
329 if (f->res.classid)
330 RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid);
331#ifdef CONFIG_NET_CLS_IND
332 if (strlen(f->indev))
333 RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev);
334#endif /* CONFIG_NET_CLS_IND */
335
336 if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0)
337 goto rtattr_failure;
338
339 rta->rta_len = skb->tail - b;
340
341 if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0)
342 goto rtattr_failure;
343
344 return skb->len;
345
346rtattr_failure:
347 skb_trim(skb, b - skb->data);
348 return -1;
349}
350
351static struct tcf_proto_ops cls_fw_ops = {
352 .next = NULL,
353 .kind = "fw",
354 .classify = fw_classify,
355 .init = fw_init,
356 .destroy = fw_destroy,
357 .get = fw_get,
358 .put = fw_put,
359 .change = fw_change,
360 .delete = fw_delete,
361 .walk = fw_walk,
362 .dump = fw_dump,
363 .owner = THIS_MODULE,
364};
365
366static int __init init_fw(void)
367{
368 return register_tcf_proto_ops(&cls_fw_ops);
369}
370
371static void __exit exit_fw(void)
372{
373 unregister_tcf_proto_ops(&cls_fw_ops);
374}
375
376module_init(init_fw)
377module_exit(exit_fw)
378MODULE_LICENSE("GPL");
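
[Editor's illustration] cls_fw above resolves skb->nfmark through a 256-bucket chained hash selected by the low byte of the mark (fw_hash()), then walks the chain for an exact id match. A compact user-space sketch of that lookup shape, with simplified types, no locking and no tcf_exts handling:

/* Sketch of the cls_fw bucket/chain structure.  Not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct fw_node {
	struct fw_node *next;
	unsigned int id;	/* the firewall mark this entry matches */
	unsigned int classid;	/* the class it maps to */
};

static struct fw_node *ht[256];

static unsigned int fw_hash(unsigned int id)
{
	return id & 0xFF;	/* same bucket selection as the code above */
}

static void fw_insert(unsigned int id, unsigned int classid)
{
	struct fw_node *f = malloc(sizeof(*f));

	if (!f)
		exit(1);
	f->id = id;
	f->classid = classid;
	f->next = ht[fw_hash(id)];	/* push onto the bucket's chain */
	ht[fw_hash(id)] = f;
}

static int fw_lookup(unsigned int id, unsigned int *classid)
{
	struct fw_node *f;

	for (f = ht[fw_hash(id)]; f; f = f->next)
		if (f->id == id) {
			*classid = f->classid;
			return 0;
		}
	return -1;	/* no match; cls_fw also returns -1 here */
}

int main(void)
{
	unsigned int cl;

	fw_insert(0x1, 0x10001);
	fw_insert(0x101, 0x10002);	/* collides with 0x1 in bucket 1 */

	if (fw_lookup(0x101, &cl) == 0)
		printf("mark 0x101 -> class 0x%x\n", cl);
	return 0;
}
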
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
new file mode 100644
index 000000000000..02996ac05c75
--- /dev/null
+++ b/net/sched/cls_route.c
@@ -0,0 +1,639 @@
1/*
2 * net/sched/cls_route.c ROUTE4 classifier.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/module.h>
13#include <linux/config.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/if_ether.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/notifier.h>
32#include <net/ip.h>
33#include <net/route.h>
34#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/act_api.h>
37#include <net/pkt_cls.h>
38
39/*
40 1. For now we assume that route tags < 256.
41    This allows direct table lookups instead of hash tables.
42 2. For now we assume that "from TAG" and "fromdev DEV" statements
43 are mutually exclusive.
44 3. "to TAG from ANY" has higher priority than "to ANY from XXX"
45 */
46
47struct route4_fastmap
48{
49 struct route4_filter *filter;
50 u32 id;
51 int iif;
52};
53
54struct route4_head
55{
56 struct route4_fastmap fastmap[16];
57 struct route4_bucket *table[256+1];
58};
59
60struct route4_bucket
61{
62 /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
63 struct route4_filter *ht[16+16+1];
64};
65
66struct route4_filter
67{
68 struct route4_filter *next;
69 u32 id;
70 int iif;
71
72 struct tcf_result res;
73 struct tcf_exts exts;
74 u32 handle;
75 struct route4_bucket *bkt;
76};
77
78#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
79
80static struct tcf_ext_map route_ext_map = {
81 .police = TCA_ROUTE4_POLICE,
82 .action = TCA_ROUTE4_ACT
83};
84
85static __inline__ int route4_fastmap_hash(u32 id, int iif)
86{
87 return id&0xF;
88}
89
90static inline
91void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id)
92{
93 spin_lock_bh(&dev->queue_lock);
94 memset(head->fastmap, 0, sizeof(head->fastmap));
95 spin_unlock_bh(&dev->queue_lock);
96}
97
98static void __inline__
99route4_set_fastmap(struct route4_head *head, u32 id, int iif,
100 struct route4_filter *f)
101{
102 int h = route4_fastmap_hash(id, iif);
103 head->fastmap[h].id = id;
104 head->fastmap[h].iif = iif;
105 head->fastmap[h].filter = f;
106}
107
108static __inline__ int route4_hash_to(u32 id)
109{
110 return id&0xFF;
111}
112
113static __inline__ int route4_hash_from(u32 id)
114{
115 return (id>>16)&0xF;
116}
117
118static __inline__ int route4_hash_iif(int iif)
119{
120 return 16 + ((iif>>16)&0xF);
121}
122
123static __inline__ int route4_hash_wild(void)
124{
125 return 32;
126}
127
128#define ROUTE4_APPLY_RESULT() \
129{ \
130 *res = f->res; \
131 if (tcf_exts_is_available(&f->exts)) { \
132 int r = tcf_exts_exec(skb, &f->exts, res); \
133 if (r < 0) { \
134 dont_cache = 1; \
135 continue; \
136 } \
137 return r; \
138 } else if (!dont_cache) \
139 route4_set_fastmap(head, id, iif, f); \
140 return 0; \
141}
142
143static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
144 struct tcf_result *res)
145{
146 struct route4_head *head = (struct route4_head*)tp->root;
147 struct dst_entry *dst;
148 struct route4_bucket *b;
149 struct route4_filter *f;
150 u32 id, h;
151 int iif, dont_cache = 0;
152
153 if ((dst = skb->dst) == NULL)
154 goto failure;
155
156 id = dst->tclassid;
157 if (head == NULL)
158 goto old_method;
159
160 iif = ((struct rtable*)dst)->fl.iif;
161
162 h = route4_fastmap_hash(id, iif);
163 if (id == head->fastmap[h].id &&
164 iif == head->fastmap[h].iif &&
165 (f = head->fastmap[h].filter) != NULL) {
166 if (f == ROUTE4_FAILURE)
167 goto failure;
168
169 *res = f->res;
170 return 0;
171 }
172
173 h = route4_hash_to(id);
174
175restart:
176 if ((b = head->table[h]) != NULL) {
177 for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
178 if (f->id == id)
179 ROUTE4_APPLY_RESULT();
180
181 for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next)
182 if (f->iif == iif)
183 ROUTE4_APPLY_RESULT();
184
185 for (f = b->ht[route4_hash_wild()]; f; f = f->next)
186 ROUTE4_APPLY_RESULT();
187
188 }
189 if (h < 256) {
190 h = 256;
191 id &= ~0xFFFF;
192 goto restart;
193 }
194
195 if (!dont_cache)
196 route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
197failure:
198 return -1;
199
200old_method:
201 if (id && (TC_H_MAJ(id) == 0 ||
202 !(TC_H_MAJ(id^tp->q->handle)))) {
203 res->classid = id;
204 res->class = 0;
205 return 0;
206 }
207 return -1;
208}
209
210static inline u32 to_hash(u32 id)
211{
212 u32 h = id&0xFF;
213 if (id&0x8000)
214 h += 256;
215 return h;
216}
217
218static inline u32 from_hash(u32 id)
219{
220 id &= 0xFFFF;
221 if (id == 0xFFFF)
222 return 32;
223 if (!(id & 0x8000)) {
224 if (id > 255)
225 return 256;
226 return id&0xF;
227 }
228 return 16 + (id&0xF);
229}
230
231static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
232{
233 struct route4_head *head = (struct route4_head*)tp->root;
234 struct route4_bucket *b;
235 struct route4_filter *f;
236 unsigned h1, h2;
237
238 if (!head)
239 return 0;
240
241 h1 = to_hash(handle);
242 if (h1 > 256)
243 return 0;
244
245 h2 = from_hash(handle>>16);
246 if (h2 > 32)
247 return 0;
248
249 if ((b = head->table[h1]) != NULL) {
250 for (f = b->ht[h2]; f; f = f->next)
251 if (f->handle == handle)
252 return (unsigned long)f;
253 }
254 return 0;
255}
256
257static void route4_put(struct tcf_proto *tp, unsigned long f)
258{
259}
260
261static int route4_init(struct tcf_proto *tp)
262{
263 return 0;
264}
265
266static inline void
267route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
268{
269 tcf_unbind_filter(tp, &f->res);
270 tcf_exts_destroy(tp, &f->exts);
271 kfree(f);
272}
273
274static void route4_destroy(struct tcf_proto *tp)
275{
276 struct route4_head *head = xchg(&tp->root, NULL);
277 int h1, h2;
278
279 if (head == NULL)
280 return;
281
282 for (h1=0; h1<=256; h1++) {
283 struct route4_bucket *b;
284
285 if ((b = head->table[h1]) != NULL) {
286 for (h2=0; h2<=32; h2++) {
287 struct route4_filter *f;
288
289 while ((f = b->ht[h2]) != NULL) {
290 b->ht[h2] = f->next;
291 route4_delete_filter(tp, f);
292 }
293 }
294 kfree(b);
295 }
296 }
297 kfree(head);
298}
299
300static int route4_delete(struct tcf_proto *tp, unsigned long arg)
301{
302 struct route4_head *head = (struct route4_head*)tp->root;
303 struct route4_filter **fp, *f = (struct route4_filter*)arg;
304 unsigned h = 0;
305 struct route4_bucket *b;
306 int i;
307
308 if (!head || !f)
309 return -EINVAL;
310
311 h = f->handle;
312 b = f->bkt;
313
314 for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
315 if (*fp == f) {
316 tcf_tree_lock(tp);
317 *fp = f->next;
318 tcf_tree_unlock(tp);
319
320 route4_reset_fastmap(tp->q->dev, head, f->id);
321 route4_delete_filter(tp, f);
322
323 /* Strip tree */
324
325 for (i=0; i<=32; i++)
326 if (b->ht[i])
327 return 0;
328
329 /* OK, session has no flows */
330 tcf_tree_lock(tp);
331 head->table[to_hash(h)] = NULL;
332 tcf_tree_unlock(tp);
333
334 kfree(b);
335 return 0;
336 }
337 }
338 return 0;
339}
340
341static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
342 struct route4_filter *f, u32 handle, struct route4_head *head,
343 struct rtattr **tb, struct rtattr *est, int new)
344{
345 int err;
346 u32 id = 0, to = 0, nhandle = 0x8000;
347 struct route4_filter *fp;
348 unsigned int h1;
349 struct route4_bucket *b;
350 struct tcf_exts e;
351
352 err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map);
353 if (err < 0)
354 return err;
355
356 err = -EINVAL;
357 if (tb[TCA_ROUTE4_CLASSID-1])
358 if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < sizeof(u32))
359 goto errout;
360
361 if (tb[TCA_ROUTE4_TO-1]) {
362 if (new && handle & 0x8000)
363 goto errout;
364 if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < sizeof(u32))
365 goto errout;
366 to = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]);
367 if (to > 0xFF)
368 goto errout;
369 nhandle = to;
370 }
371
372 if (tb[TCA_ROUTE4_FROM-1]) {
373 if (tb[TCA_ROUTE4_IIF-1])
374 goto errout;
375 if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < sizeof(u32))
376 goto errout;
377 id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]);
378 if (id > 0xFF)
379 goto errout;
380 nhandle |= id << 16;
381 } else if (tb[TCA_ROUTE4_IIF-1]) {
382 if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < sizeof(u32))
383 goto errout;
384 id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]);
385 if (id > 0x7FFF)
386 goto errout;
387 nhandle |= (id | 0x8000) << 16;
388 } else
389 nhandle |= 0xFFFF << 16;
390
391 if (handle && new) {
392 nhandle |= handle & 0x7F00;
393 if (nhandle != handle)
394 goto errout;
395 }
396
397 h1 = to_hash(nhandle);
398 if ((b = head->table[h1]) == NULL) {
399 err = -ENOBUFS;
400 b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL);
401 if (b == NULL)
402 goto errout;
403 memset(b, 0, sizeof(*b));
404
405 tcf_tree_lock(tp);
406 head->table[h1] = b;
407 tcf_tree_unlock(tp);
408 } else {
409 unsigned int h2 = from_hash(nhandle >> 16);
410 err = -EEXIST;
411 for (fp = b->ht[h2]; fp; fp = fp->next)
412 if (fp->handle == f->handle)
413 goto errout;
414 }
415
416 tcf_tree_lock(tp);
417 if (tb[TCA_ROUTE4_TO-1])
418 f->id = to;
419
420 if (tb[TCA_ROUTE4_FROM-1])
421 f->id = to | id<<16;
422 else if (tb[TCA_ROUTE4_IIF-1])
423 f->iif = id;
424
425 f->handle = nhandle;
426 f->bkt = b;
427 tcf_tree_unlock(tp);
428
429 if (tb[TCA_ROUTE4_CLASSID-1]) {
430 f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]);
431 tcf_bind_filter(tp, &f->res, base);
432 }
433
434 tcf_exts_change(tp, &f->exts, &e);
435
436 return 0;
437errout:
438 tcf_exts_destroy(tp, &e);
439 return err;
440}
441
442static int route4_change(struct tcf_proto *tp, unsigned long base,
443 u32 handle,
444 struct rtattr **tca,
445 unsigned long *arg)
446{
447 struct route4_head *head = tp->root;
448 struct route4_filter *f, *f1, **fp;
449 struct route4_bucket *b;
450 struct rtattr *opt = tca[TCA_OPTIONS-1];
451 struct rtattr *tb[TCA_ROUTE4_MAX];
452 unsigned int h, th;
453 u32 old_handle = 0;
454 int err;
455
456 if (opt == NULL)
457 return handle ? -EINVAL : 0;
458
459 if (rtattr_parse_nested(tb, TCA_ROUTE4_MAX, opt) < 0)
460 return -EINVAL;
461
462 if ((f = (struct route4_filter*)*arg) != NULL) {
463 if (f->handle != handle && handle)
464 return -EINVAL;
465
466 if (f->bkt)
467 old_handle = f->handle;
468
469 err = route4_set_parms(tp, base, f, handle, head, tb,
470 tca[TCA_RATE-1], 0);
471 if (err < 0)
472 return err;
473
474 goto reinsert;
475 }
476
477 err = -ENOBUFS;
478 if (head == NULL) {
479 head = kmalloc(sizeof(struct route4_head), GFP_KERNEL);
480 if (head == NULL)
481 goto errout;
482 memset(head, 0, sizeof(struct route4_head));
483
484 tcf_tree_lock(tp);
485 tp->root = head;
486 tcf_tree_unlock(tp);
487 }
488
489 f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL);
490 if (f == NULL)
491 goto errout;
492 memset(f, 0, sizeof(*f));
493
494 err = route4_set_parms(tp, base, f, handle, head, tb,
495 tca[TCA_RATE-1], 1);
496 if (err < 0)
497 goto errout;
498
499reinsert:
500 h = from_hash(f->handle >> 16);
501 for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next)
502 if (f->handle < f1->handle)
503 break;
504
505 f->next = f1;
506 tcf_tree_lock(tp);
507 *fp = f;
508
509 if (old_handle && f->handle != old_handle) {
510 th = to_hash(old_handle);
511 h = from_hash(old_handle >> 16);
512 if ((b = head->table[th]) != NULL) {
513 for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
514 if (*fp == f) {
515 *fp = f->next;
516 break;
517 }
518 }
519 }
520 }
521 tcf_tree_unlock(tp);
522
523 route4_reset_fastmap(tp->q->dev, head, f->id);
524 *arg = (unsigned long)f;
525 return 0;
526
527errout:
528 if (f)
529 kfree(f);
530 return err;
531}
532
533static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
534{
535 struct route4_head *head = tp->root;
536 unsigned h, h1;
537
538 if (head == NULL)
539 arg->stop = 1;
540
541 if (arg->stop)
542 return;
543
544 for (h = 0; h <= 256; h++) {
545 struct route4_bucket *b = head->table[h];
546
547 if (b) {
548 for (h1 = 0; h1 <= 32; h1++) {
549 struct route4_filter *f;
550
551 for (f = b->ht[h1]; f; f = f->next) {
552 if (arg->count < arg->skip) {
553 arg->count++;
554 continue;
555 }
556 if (arg->fn(tp, (unsigned long)f, arg) < 0) {
557 arg->stop = 1;
558 return;
559 }
560 arg->count++;
561 }
562 }
563 }
564 }
565}
566
567static int route4_dump(struct tcf_proto *tp, unsigned long fh,
568 struct sk_buff *skb, struct tcmsg *t)
569{
570 struct route4_filter *f = (struct route4_filter*)fh;
571 unsigned char *b = skb->tail;
572 struct rtattr *rta;
573 u32 id;
574
575 if (f == NULL)
576 return skb->len;
577
578 t->tcm_handle = f->handle;
579
580 rta = (struct rtattr*)b;
581 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
582
583 if (!(f->handle&0x8000)) {
584 id = f->id&0xFF;
585 RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id);
586 }
587 if (f->handle&0x80000000) {
588 if ((f->handle>>16) != 0xFFFF)
589 RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif);
590 } else {
591 id = f->id>>16;
592 RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id);
593 }
594 if (f->res.classid)
595 RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid);
596
597 if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0)
598 goto rtattr_failure;
599
600 rta->rta_len = skb->tail - b;
601
602 if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0)
603 goto rtattr_failure;
604
605 return skb->len;
606
607rtattr_failure:
608 skb_trim(skb, b - skb->data);
609 return -1;
610}
611
612static struct tcf_proto_ops cls_route4_ops = {
613 .next = NULL,
614 .kind = "route",
615 .classify = route4_classify,
616 .init = route4_init,
617 .destroy = route4_destroy,
618 .get = route4_get,
619 .put = route4_put,
620 .change = route4_change,
621 .delete = route4_delete,
622 .walk = route4_walk,
623 .dump = route4_dump,
624 .owner = THIS_MODULE,
625};
626
627static int __init init_route4(void)
628{
629 return register_tcf_proto_ops(&cls_route4_ops);
630}
631
632static void __exit exit_route4(void)
633{
634 unregister_tcf_proto_ops(&cls_route4_ops);
635}
636
637module_init(init_route4)
638module_exit(exit_route4)
639MODULE_LICENSE("GPL");
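
[Editor's illustration] The ROUTE4 classifier above packs its 32-bit filter handle so that to_hash() selects the top-level bucket from the destination realm ("to") and from_hash() selects the per-bucket slot from either the source realm, the input interface (flagged with bit 15) or the wildcard value 0xFFFF. The stand-alone sketch below mirrors only those two helpers; the example handle value is made up.

/* Sketch of the cls_route handle layout and bucket selection.  Not kernel code. */
#include <stdio.h>

static unsigned int to_hash(unsigned int id)
{
	unsigned int h = id & 0xFF;	/* destination realms use buckets 0..255 */

	if (id & 0x8000)		/* "to" absent: park in bucket 256 */
		h += 256;
	return h;
}

static unsigned int from_hash(unsigned int id)
{
	id &= 0xFFFF;
	if (id == 0xFFFF)		/* wildcard source: slot 32 */
		return 32;
	if (!(id & 0x8000))		/* source realm: slots 0..15 */
		return id & 0xF;
	return 16 + (id & 0xF);		/* input interface: slots 16..31 */
}

int main(void)
{
	/* "to realm 5 from realm 3" is encoded roughly as: */
	unsigned int handle = (3u << 16) | 5u;

	printf("top-level bucket %u, per-bucket slot %u\n",
	       to_hash(handle), from_hash(handle >> 16));
	return 0;
}

This split is what lets route4_classify() first try an exact (to, from) chain, then the interface chain, then the wildcard chain within a single bucket.
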
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
new file mode 100644
index 000000000000..ad2613790d85
--- /dev/null
+++ b/net/sched/cls_rsvp.c
@@ -0,0 +1,43 @@
1/*
2 * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/module.h>
13#include <asm/uaccess.h>
14#include <asm/system.h>
15#include <linux/bitops.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/sched.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/if_ether.h>
27#include <linux/inet.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/notifier.h>
31#include <net/ip.h>
32#include <net/route.h>
33#include <linux/skbuff.h>
34#include <net/sock.h>
35#include <net/act_api.h>
36#include <net/pkt_cls.h>
37
38#define RSVP_DST_LEN 1
39#define RSVP_ID "rsvp"
40#define RSVP_OPS cls_rsvp_ops
41
42#include "cls_rsvp.h"
43MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
new file mode 100644
index 000000000000..232fb9196810
--- /dev/null
+++ b/net/sched/cls_rsvp.h
@@ -0,0 +1,667 @@
1/*
2 * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12/*
13 Compared to the general packet classification problem,
14 RSVP needs only several relatively simple rules:
15
16 * (dst, protocol) are always specified,
17 so that we are able to hash them.
18 * src may be exact, or may be wildcard, so that
19 we can keep a hash table plus one wildcard entry.
20 * source port (or flow label) is important only if src is given.
21
22 IMPLEMENTATION.
23
24 We use a two-level hash table: the top level is keyed by
25 destination address and protocol ID; every bucket contains a list
26 of "rsvp sessions", identified by destination address, protocol and
27 DPI(="Destination Port ID"): triple (key, mask, offset).
28
29 Every bucket has a smaller hash table keyed by source address
30 (cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
31 Every bucket is again a list of "RSVP flows", selected by
32 source address and SPI(="Source Port ID" here rather than
33 "security parameter index"): triple (key, mask, offset).
34
35
36 NOTE 1. All packets with IPv6 extension headers (except AH and ESP)
37 and all fragmented packets go to the best-effort traffic class.
38
39
40 NOTE 2. Two "port id"s seem to be redundant; rfc2207 requires
41 only one "Generalized Port Identifier". So for classic
42 ah, esp (and udp, tcp) both *pi should coincide or one of them
43 should be a wildcard.
44
45 At first sight, this redundancy is just a waste of CPU
46 resources. But DPI and SPI add the possibility of assigning different
47 priorities to GPIs. Look also at note 4 about tunnels below.
48
49
50 NOTE 3. One complication is the case of tunneled packets.
51 We implement it as follows: if the first lookup
52 matches a special session with a non-zero "tunnelhdr" value,
53 the flowid doesn't contain the true flow ID but the tunnel ID (1...255).
54 In this case, we pull tunnelhdr bytes and restart the lookup
55 with the tunnel ID added to the list of keys. Simple and stupid 8)8)
56 It's enough for PIMREG and IPIP.
57
58
59 NOTE 4. Two GPIs make it possible to parse even GRE packets.
60 E.g. the DPI can select ETH_P_IP (and the flags needed to make
61 tunnelhdr correct) in the GRE protocol field while the SPI matches
62 the GRE key. Is it not nice? 8)8)
63
64
65 Well, as a result, despite its simplicity, we get a pretty
66 powerful classification engine. */
67
68#include <linux/config.h>
69
70struct rsvp_head
71{
72 u32 tmap[256/32];
73 u32 hgenerator;
74 u8 tgenerator;
75 struct rsvp_session *ht[256];
76};
77
78struct rsvp_session
79{
80 struct rsvp_session *next;
81 u32 dst[RSVP_DST_LEN];
82 struct tc_rsvp_gpi dpi;
83 u8 protocol;
84 u8 tunnelid;
85 /* 16 (src,sport) hash slots, and one wildcard source slot */
86 struct rsvp_filter *ht[16+1];
87};
88
89
90struct rsvp_filter
91{
92 struct rsvp_filter *next;
93 u32 src[RSVP_DST_LEN];
94 struct tc_rsvp_gpi spi;
95 u8 tunnelhdr;
96
97 struct tcf_result res;
98 struct tcf_exts exts;
99
100 u32 handle;
101 struct rsvp_session *sess;
102};
103
104static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid)
105{
106 unsigned h = dst[RSVP_DST_LEN-1];
107 h ^= h>>16;
108 h ^= h>>8;
109 return (h ^ protocol ^ tunnelid) & 0xFF;
110}
111
112static __inline__ unsigned hash_src(u32 *src)
113{
114 unsigned h = src[RSVP_DST_LEN-1];
115 h ^= h>>16;
116 h ^= h>>8;
117 h ^= h>>4;
118 return h & 0xF;
119}
120
121static struct tcf_ext_map rsvp_ext_map = {
122 .police = TCA_RSVP_POLICE,
123 .action = TCA_RSVP_ACT
124};
125
126#define RSVP_APPLY_RESULT() \
127{ \
128 int r = tcf_exts_exec(skb, &f->exts, res); \
129 if (r < 0) \
130 continue; \
131 else if (r > 0) \
132 return r; \
133}
134
135static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
136 struct tcf_result *res)
137{
138 struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
139 struct rsvp_session *s;
140 struct rsvp_filter *f;
141 unsigned h1, h2;
142 u32 *dst, *src;
143 u8 protocol;
144 u8 tunnelid = 0;
145 u8 *xprt;
146#if RSVP_DST_LEN == 4
147 struct ipv6hdr *nhptr = skb->nh.ipv6h;
148#else
149 struct iphdr *nhptr = skb->nh.iph;
150#endif
151
152restart:
153
154#if RSVP_DST_LEN == 4
155 src = &nhptr->saddr.s6_addr32[0];
156 dst = &nhptr->daddr.s6_addr32[0];
157 protocol = nhptr->nexthdr;
158 xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr);
159#else
160 src = &nhptr->saddr;
161 dst = &nhptr->daddr;
162 protocol = nhptr->protocol;
163 xprt = ((u8*)nhptr) + (nhptr->ihl<<2);
164 if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET))
165 return -1;
166#endif
167
168 h1 = hash_dst(dst, protocol, tunnelid);
169 h2 = hash_src(src);
170
171 for (s = sht[h1]; s; s = s->next) {
172 if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
173 protocol == s->protocol &&
174 !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key))
175#if RSVP_DST_LEN == 4
176 && dst[0] == s->dst[0]
177 && dst[1] == s->dst[1]
178 && dst[2] == s->dst[2]
179#endif
180 && tunnelid == s->tunnelid) {
181
182 for (f = s->ht[h2]; f; f = f->next) {
183 if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] &&
184 !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key))
185#if RSVP_DST_LEN == 4
186 && src[0] == f->src[0]
187 && src[1] == f->src[1]
188 && src[2] == f->src[2]
189#endif
190 ) {
191 *res = f->res;
192 RSVP_APPLY_RESULT();
193
194matched:
195 if (f->tunnelhdr == 0)
196 return 0;
197
198 tunnelid = f->res.classid;
199 nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr));
200 goto restart;
201 }
202 }
203
204 /* And wildcard bucket... */
205 for (f = s->ht[16]; f; f = f->next) {
206 *res = f->res;
207 RSVP_APPLY_RESULT();
208 goto matched;
209 }
210 return -1;
211 }
212 }
213 return -1;
214}
215
216static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
217{
218 struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
219 struct rsvp_session *s;
220 struct rsvp_filter *f;
221 unsigned h1 = handle&0xFF;
222 unsigned h2 = (handle>>8)&0xFF;
223
224 if (h2 > 16)
225 return 0;
226
227 for (s = sht[h1]; s; s = s->next) {
228 for (f = s->ht[h2]; f; f = f->next) {
229 if (f->handle == handle)
230 return (unsigned long)f;
231 }
232 }
233 return 0;
234}
235
236static void rsvp_put(struct tcf_proto *tp, unsigned long f)
237{
238}
239
240static int rsvp_init(struct tcf_proto *tp)
241{
242 struct rsvp_head *data;
243
244 data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL);
245 if (data) {
246 memset(data, 0, sizeof(struct rsvp_head));
247 tp->root = data;
248 return 0;
249 }
250 return -ENOBUFS;
251}
252
253static inline void
254rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
255{
256 tcf_unbind_filter(tp, &f->res);
257 tcf_exts_destroy(tp, &f->exts);
258 kfree(f);
259}
260
261static void rsvp_destroy(struct tcf_proto *tp)
262{
263 struct rsvp_head *data = xchg(&tp->root, NULL);
264 struct rsvp_session **sht;
265 int h1, h2;
266
267 if (data == NULL)
268 return;
269
270 sht = data->ht;
271
272 for (h1=0; h1<256; h1++) {
273 struct rsvp_session *s;
274
275 while ((s = sht[h1]) != NULL) {
276 sht[h1] = s->next;
277
278 for (h2=0; h2<=16; h2++) {
279 struct rsvp_filter *f;
280
281 while ((f = s->ht[h2]) != NULL) {
282 s->ht[h2] = f->next;
283 rsvp_delete_filter(tp, f);
284 }
285 }
286 kfree(s);
287 }
288 }
289 kfree(data);
290}
291
292static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
293{
294 struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg;
295 unsigned h = f->handle;
296 struct rsvp_session **sp;
297 struct rsvp_session *s = f->sess;
298 int i;
299
300 for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) {
301 if (*fp == f) {
302 tcf_tree_lock(tp);
303 *fp = f->next;
304 tcf_tree_unlock(tp);
305 rsvp_delete_filter(tp, f);
306
307 /* Strip tree */
308
309 for (i=0; i<=16; i++)
310 if (s->ht[i])
311 return 0;
312
313 /* OK, session has no flows */
314 for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
315 *sp; sp = &(*sp)->next) {
316 if (*sp == s) {
317 tcf_tree_lock(tp);
318 *sp = s->next;
319 tcf_tree_unlock(tp);
320
321 kfree(s);
322 return 0;
323 }
324 }
325
326 return 0;
327 }
328 }
329 return 0;
330}
331
332static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
333{
334 struct rsvp_head *data = tp->root;
335 int i = 0xFFFF;
336
337 while (i-- > 0) {
338 u32 h;
339 if ((data->hgenerator += 0x10000) == 0)
340 data->hgenerator = 0x10000;
341 h = data->hgenerator|salt;
342 if (rsvp_get(tp, h) == 0)
343 return h;
344 }
345 return 0;
346}
347
348static int tunnel_bts(struct rsvp_head *data)
349{
350 int n = data->tgenerator>>5;
351 u32 b = 1<<(data->tgenerator&0x1F);
352
353 if (data->tmap[n]&b)
354 return 0;
355 data->tmap[n] |= b;
356 return 1;
357}
358
359static void tunnel_recycle(struct rsvp_head *data)
360{
361 struct rsvp_session **sht = data->ht;
362 u32 tmap[256/32];
363 int h1, h2;
364
365 memset(tmap, 0, sizeof(tmap));
366
367 for (h1=0; h1<256; h1++) {
368 struct rsvp_session *s;
369 for (s = sht[h1]; s; s = s->next) {
370 for (h2=0; h2<=16; h2++) {
371 struct rsvp_filter *f;
372
373 for (f = s->ht[h2]; f; f = f->next) {
374 if (f->tunnelhdr == 0)
375 continue;
376 data->tgenerator = f->res.classid;
377 tunnel_bts(data);
378 }
379 }
380 }
381 }
382
383 memcpy(data->tmap, tmap, sizeof(tmap));
384}
385
386static u32 gen_tunnel(struct rsvp_head *data)
387{
388 int i, k;
389
390 for (k=0; k<2; k++) {
391 for (i=255; i>0; i--) {
392 if (++data->tgenerator == 0)
393 data->tgenerator = 1;
394 if (tunnel_bts(data))
395 return data->tgenerator;
396 }
397 tunnel_recycle(data);
398 }
399 return 0;
400}
401
402static int rsvp_change(struct tcf_proto *tp, unsigned long base,
403 u32 handle,
404 struct rtattr **tca,
405 unsigned long *arg)
406{
407 struct rsvp_head *data = tp->root;
408 struct rsvp_filter *f, **fp;
409 struct rsvp_session *s, **sp;
410 struct tc_rsvp_pinfo *pinfo = NULL;
411 struct rtattr *opt = tca[TCA_OPTIONS-1];
412 struct rtattr *tb[TCA_RSVP_MAX];
413 struct tcf_exts e;
414 unsigned h1, h2;
415 u32 *dst;
416 int err;
417
418 if (opt == NULL)
419 return handle ? -EINVAL : 0;
420
421 if (rtattr_parse_nested(tb, TCA_RSVP_MAX, opt) < 0)
422 return -EINVAL;
423
424 err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map);
425 if (err < 0)
426 return err;
427
428 if ((f = (struct rsvp_filter*)*arg) != NULL) {
429 /* Node exists: adjust only classid */
430
431 if (f->handle != handle && handle)
432 goto errout2;
433 if (tb[TCA_RSVP_CLASSID-1]) {
434 f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
435 tcf_bind_filter(tp, &f->res, base);
436 }
437
438 tcf_exts_change(tp, &f->exts, &e);
439 return 0;
440 }
441
442 /* Now more serious part... */
443 err = -EINVAL;
444 if (handle)
445 goto errout2;
446 if (tb[TCA_RSVP_DST-1] == NULL)
447 goto errout2;
448
449 err = -ENOBUFS;
450 f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
451 if (f == NULL)
452 goto errout2;
453
454 memset(f, 0, sizeof(*f));
455 h2 = 16;
456 if (tb[TCA_RSVP_SRC-1]) {
457 err = -EINVAL;
458 if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src))
459 goto errout;
460 memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src));
461 h2 = hash_src(f->src);
462 }
463 if (tb[TCA_RSVP_PINFO-1]) {
464 err = -EINVAL;
465 if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo))
466 goto errout;
467 pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]);
468 f->spi = pinfo->spi;
469 f->tunnelhdr = pinfo->tunnelhdr;
470 }
471 if (tb[TCA_RSVP_CLASSID-1]) {
472 err = -EINVAL;
473 if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4)
474 goto errout;
475 f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]);
476 }
477
478 err = -EINVAL;
479 if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src))
480 goto errout;
481 dst = RTA_DATA(tb[TCA_RSVP_DST-1]);
482 h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
483
484 err = -ENOMEM;
485 if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
486 goto errout;
487
488 if (f->tunnelhdr) {
489 err = -EINVAL;
490 if (f->res.classid > 255)
491 goto errout;
492
493 err = -ENOMEM;
494 if (f->res.classid == 0 &&
495 (f->res.classid = gen_tunnel(data)) == 0)
496 goto errout;
497 }
498
499 for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) {
500 if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
501 pinfo && pinfo->protocol == s->protocol &&
502 memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0
503#if RSVP_DST_LEN == 4
504 && dst[0] == s->dst[0]
505 && dst[1] == s->dst[1]
506 && dst[2] == s->dst[2]
507#endif
508 && pinfo->tunnelid == s->tunnelid) {
509
510insert:
511 /* OK, we found appropriate session */
512
513 fp = &s->ht[h2];
514
515 f->sess = s;
516 if (f->tunnelhdr == 0)
517 tcf_bind_filter(tp, &f->res, base);
518
519 tcf_exts_change(tp, &f->exts, &e);
520
521 for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
522 if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
523 break;
524 f->next = *fp;
525 wmb();
526 *fp = f;
527
528 *arg = (unsigned long)f;
529 return 0;
530 }
531 }
532
533 /* No session found. Create new one. */
534
535 err = -ENOBUFS;
536 s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL);
537 if (s == NULL)
538 goto errout;
539 memset(s, 0, sizeof(*s));
540 memcpy(s->dst, dst, sizeof(s->dst));
541
542 if (pinfo) {
543 s->dpi = pinfo->dpi;
544 s->protocol = pinfo->protocol;
545 s->tunnelid = pinfo->tunnelid;
546 }
547 for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) {
548 if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask)
549 break;
550 }
551 s->next = *sp;
552 wmb();
553 *sp = s;
554
555 goto insert;
556
557errout:
558 if (f)
559 kfree(f);
560errout2:
561 tcf_exts_destroy(tp, &e);
562 return err;
563}
564
565static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
566{
567 struct rsvp_head *head = tp->root;
568 unsigned h, h1;
569
570 if (arg->stop)
571 return;
572
573 for (h = 0; h < 256; h++) {
574 struct rsvp_session *s;
575
576 for (s = head->ht[h]; s; s = s->next) {
577 for (h1 = 0; h1 <= 16; h1++) {
578 struct rsvp_filter *f;
579
580 for (f = s->ht[h1]; f; f = f->next) {
581 if (arg->count < arg->skip) {
582 arg->count++;
583 continue;
584 }
585 if (arg->fn(tp, (unsigned long)f, arg) < 0) {
586 arg->stop = 1;
587 return;
588 }
589 arg->count++;
590 }
591 }
592 }
593 }
594}
595
596static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
597 struct sk_buff *skb, struct tcmsg *t)
598{
599 struct rsvp_filter *f = (struct rsvp_filter*)fh;
600 struct rsvp_session *s;
601 unsigned char *b = skb->tail;
602 struct rtattr *rta;
603 struct tc_rsvp_pinfo pinfo;
604
605 if (f == NULL)
606 return skb->len;
607 s = f->sess;
608
609 t->tcm_handle = f->handle;
610
611
612 rta = (struct rtattr*)b;
613 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
614
615 RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst);
616 pinfo.dpi = s->dpi;
617 pinfo.spi = f->spi;
618 pinfo.protocol = s->protocol;
619 pinfo.tunnelid = s->tunnelid;
620 pinfo.tunnelhdr = f->tunnelhdr;
621 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
622 if (f->res.classid)
623 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
624 if (((f->handle>>8)&0xFF) != 16)
625 RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
626
627 if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0)
628 goto rtattr_failure;
629
630 rta->rta_len = skb->tail - b;
631
632 if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0)
633 goto rtattr_failure;
634 return skb->len;
635
636rtattr_failure:
637 skb_trim(skb, b - skb->data);
638 return -1;
639}
640
641static struct tcf_proto_ops RSVP_OPS = {
642 .next = NULL,
643 .kind = RSVP_ID,
644 .classify = rsvp_classify,
645 .init = rsvp_init,
646 .destroy = rsvp_destroy,
647 .get = rsvp_get,
648 .put = rsvp_put,
649 .change = rsvp_change,
650 .delete = rsvp_delete,
651 .walk = rsvp_walk,
652 .dump = rsvp_dump,
653 .owner = THIS_MODULE,
654};
655
656static int __init init_rsvp(void)
657{
658 return register_tcf_proto_ops(&RSVP_OPS);
659}
660
661static void __exit exit_rsvp(void)
662{
663 unregister_tcf_proto_ops(&RSVP_OPS);
664}
665
666module_init(init_rsvp)
667module_exit(exit_rsvp)
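
[Editor's illustration] The implementation notes at the top of cls_rsvp.h describe a two-level hash: sessions are bucketed by (destination, protocol, tunnel id), and inside a session flows are bucketed by source, with a 17th wildcard slot. The stand-alone sketch below mirrors only the hash_dst()/hash_src() folding from above; the addresses and protocol number are example values and all kernel scaffolding is omitted.

/* Sketch of the RSVP session/flow bucket selection.  Not kernel code. */
#include <stdio.h>

typedef unsigned int u32;
typedef unsigned char u8;

static unsigned int hash_dst(u32 dst, u8 protocol, u8 tunnelid)
{
	unsigned int h = dst;

	h ^= h >> 16;		/* fold the 32-bit address word */
	h ^= h >> 8;
	return (h ^ protocol ^ tunnelid) & 0xFF;	/* 256 session buckets */
}

static unsigned int hash_src(u32 src)
{
	unsigned int h = src;

	h ^= h >> 16;		/* fold further, down to 4 bits */
	h ^= h >> 8;
	h ^= h >> 4;
	return h & 0xF;		/* 16 flow buckets (plus one wildcard slot) */
}

int main(void)
{
	u32 dst = 0x0a000001, src = 0x0a000002;	/* example addresses */

	printf("session bucket %u, flow bucket %u\n",
	       hash_dst(dst, 46 /* IP protocol 46 = RSVP */, 0 /* no tunnel */),
	       hash_src(src));
	return 0;
}

In the real classifier, a bucket hit is still confirmed against the full destination/source words (all four of them when RSVP_DST_LEN is 4) and against the DPI/SPI (key, mask, offset) triples.
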
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
new file mode 100644
index 000000000000..fde51f7848eb
--- /dev/null
+++ b/net/sched/cls_rsvp6.c
@@ -0,0 +1,44 @@
1/*
2 * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/module.h>
13#include <asm/uaccess.h>
14#include <asm/system.h>
15#include <linux/bitops.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/sched.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/if_ether.h>
27#include <linux/inet.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/notifier.h>
31#include <net/ip.h>
32#include <linux/ipv6.h>
33#include <net/route.h>
34#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/act_api.h>
37#include <net/pkt_cls.h>
38
39#define RSVP_DST_LEN 4
40#define RSVP_ID "rsvp6"
41#define RSVP_OPS cls_rsvp6_ops
42
43#include "cls_rsvp.h"
44MODULE_LICENSE("GPL");
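
[Editor's illustration] cls_rsvp.c and cls_rsvp6.c are only a few lines each: they define RSVP_DST_LEN, RSVP_ID and RSVP_OPS and then include the shared cls_rsvp.h, which expands into a complete IPv4 or IPv6 classifier. The sketch below emulates that template-header trick with a macro so it fits in one self-contained file; the DEFINE_CLASSIFIER name and its fields are invented for the illustration.

/* Sketch of instantiating one shared body with per-variant parameters. */
#include <stdio.h>

/* Stands in for the body of cls_rsvp.h, parameterised on NAME and DST_LEN. */
#define DEFINE_CLASSIFIER(NAME, DST_LEN)				\
	static const char NAME##_id[] = #NAME;				\
	static unsigned int NAME##_key_words(void)			\
	{								\
		return DST_LEN;	/* 1 word for IPv4, 4 for IPv6 */	\
	}

/* Equivalent of cls_rsvp.c:  RSVP_DST_LEN 1, RSVP_ID "rsvp".  */
DEFINE_CLASSIFIER(rsvp, 1)

/* Equivalent of cls_rsvp6.c: RSVP_DST_LEN 4, RSVP_ID "rsvp6". */
DEFINE_CLASSIFIER(rsvp6, 4)

int main(void)
{
	printf("%s keys on %u address word(s)\n", rsvp_id, rsvp_key_words());
	printf("%s keys on %u address word(s)\n", rsvp6_id, rsvp6_key_words());
	return 0;
}

The payoff of the real template header is that several hundred lines of classifier logic are written once and compiled twice, with only the address width and the registered ops/name differing between the two modules.
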
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
new file mode 100644
index 000000000000..404d9d83a7fa
--- /dev/null
+++ b/net/sched/cls_tcindex.c
@@ -0,0 +1,537 @@
1/*
2 * net/sched/cls_tcindex.c Packet classifier for skb->tc_index
3 *
4 * Written 1998,1999 by Werner Almesberger, EPFL ICA
5 */
6
7#include <linux/config.h>
8#include <linux/module.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/skbuff.h>
12#include <linux/errno.h>
13#include <linux/netdevice.h>
14#include <net/ip.h>
15#include <net/act_api.h>
16#include <net/pkt_cls.h>
17#include <net/route.h>
18
19
20/*
21 * Not quite sure if we need all the xchgs Alexey uses when accessing things.
22 * Can always add them later ... :)
23 */
24
25/*
26 * Passing parameters to the root seems to be done more awkwardly than really
27 * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
28 * verified. FIXME.
29 */
30
31#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if (mask >> shift) is smaller */
32#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
33
34
35#if 1 /* control */
36#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
37#else
38#define DPRINTK(format,args...)
39#endif
40
41#if 0 /* data */
42#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
43#else
44#define D2PRINTK(format,args...)
45#endif
46
47
48#define PRIV(tp) ((struct tcindex_data *) (tp)->root)
49
50
51struct tcindex_filter_result {
52 struct tcf_exts exts;
53 struct tcf_result res;
54};
55
56struct tcindex_filter {
57 u16 key;
58 struct tcindex_filter_result result;
59 struct tcindex_filter *next;
60};
61
62
63struct tcindex_data {
64 struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
65 struct tcindex_filter **h; /* imperfect hash; only used if !perfect;
66 NULL if unused */
67 u16 mask; /* AND key with mask */
68 int shift; /* shift ANDed key to the right */
69 int hash; /* hash table size; 0 if undefined */
70 int alloc_hash; /* allocated size */
71 int fall_through; /* 0: only classify if explicit match */
72};
73
74static struct tcf_ext_map tcindex_ext_map = {
75 .police = TCA_TCINDEX_POLICE,
76 .action = TCA_TCINDEX_ACT
77};
78
79static inline int
80tcindex_filter_is_set(struct tcindex_filter_result *r)
81{
82 return tcf_exts_is_predicative(&r->exts) || r->res.classid;
83}
84
85static struct tcindex_filter_result *
86tcindex_lookup(struct tcindex_data *p, u16 key)
87{
88 struct tcindex_filter *f;
89
90 if (p->perfect)
91 return tcindex_filter_is_set(p->perfect + key) ?
92 p->perfect + key : NULL;
93 else if (p->h) {
94 for (f = p->h[key % p->hash]; f; f = f->next)
95 if (f->key == key)
96 return &f->result;
97 }
98
99 return NULL;
100}
101
102
103static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp,
104 struct tcf_result *res)
105{
106 struct tcindex_data *p = PRIV(tp);
107 struct tcindex_filter_result *f;
108 int key = (skb->tc_index & p->mask) >> p->shift;
109
110 D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p);
111
112 f = tcindex_lookup(p, key);
113 if (!f) {
114 if (!p->fall_through)
115 return -1;
116 res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key);
117 res->class = 0;
118 D2PRINTK("alg 0x%x\n",res->classid);
119 return 0;
120 }
121 *res = f->res;
122 D2PRINTK("map 0x%x\n",res->classid);
123
124 return tcf_exts_exec(skb, &f->exts, res);
125}
126
127
128static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
129{
130 struct tcindex_data *p = PRIV(tp);
131 struct tcindex_filter_result *r;
132
133 DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle);
134 if (p->perfect && handle >= p->alloc_hash)
135 return 0;
136 r = tcindex_lookup(p, handle);
137 return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL;
138}
139
140
141static void tcindex_put(struct tcf_proto *tp, unsigned long f)
142{
143 DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f);
144}
145
146
147static int tcindex_init(struct tcf_proto *tp)
148{
149 struct tcindex_data *p;
150
151 DPRINTK("tcindex_init(tp %p)\n",tp);
152 p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL);
153 if (!p)
154 return -ENOMEM;
155
156 memset(p, 0, sizeof(*p));
157 p->mask = 0xffff;
158 p->hash = DEFAULT_HASH_SIZE;
159 p->fall_through = 1;
160
161 tp->root = p;
162 return 0;
163}
164
165
166static int
167__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)
168{
169 struct tcindex_data *p = PRIV(tp);
170 struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
171 struct tcindex_filter *f = NULL;
172
173 DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f);
174 if (p->perfect) {
175 if (!r->res.class)
176 return -ENOENT;
177 } else {
178 int i;
179 struct tcindex_filter **walk = NULL;
180
181 for (i = 0; i < p->hash; i++)
182 for (walk = p->h+i; *walk; walk = &(*walk)->next)
183 if (&(*walk)->result == r)
184 goto found;
185 return -ENOENT;
186
187found:
188 f = *walk;
189 if (lock)
190 tcf_tree_lock(tp);
191 *walk = f->next;
192 if (lock)
193 tcf_tree_unlock(tp);
194 }
195 tcf_unbind_filter(tp, &r->res);
196 tcf_exts_destroy(tp, &r->exts);
197 if (f)
198 kfree(f);
199 return 0;
200}
201
202static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
203{
204 return __tcindex_delete(tp, arg, 1);
205}
206
207static inline int
208valid_perfect_hash(struct tcindex_data *p)
209{
210 return p->hash > (p->mask >> p->shift);
211}
212
213static int
214tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
215 struct tcindex_data *p, struct tcindex_filter_result *r,
216 struct rtattr **tb, struct rtattr *est)
217{
218 int err, balloc = 0;
219 struct tcindex_filter_result new_filter_result, *old_r = r;
220 struct tcindex_filter_result cr;
221 struct tcindex_data cp;
222 struct tcindex_filter *f = NULL; /* make gcc behave */
223 struct tcf_exts e;
224
225 err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map);
226 if (err < 0)
227 return err;
228
229 memcpy(&cp, p, sizeof(cp));
230 memset(&new_filter_result, 0, sizeof(new_filter_result));
231
232 if (old_r)
233 memcpy(&cr, r, sizeof(cr));
234 else
235 memset(&cr, 0, sizeof(cr));
236
237 err = -EINVAL;
238 if (tb[TCA_TCINDEX_HASH-1]) {
239 if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(u32))
240 goto errout;
241 cp.hash = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]);
242 }
243
244 if (tb[TCA_TCINDEX_MASK-1]) {
245 if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(u16))
246 goto errout;
247 cp.mask = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]);
248 }
249
250 if (tb[TCA_TCINDEX_SHIFT-1]) {
251 if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(u16))
252 goto errout;
253 cp.shift = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]);
254 }
255
256 err = -EBUSY;
257 /* Hash already allocated, make sure that we still meet the
258 * requirements for the allocated hash.
259 */
260 if (cp.perfect) {
261 if (!valid_perfect_hash(&cp) ||
262 cp.hash > cp.alloc_hash)
263 goto errout;
264 } else if (cp.h && cp.hash != cp.alloc_hash)
265 goto errout;
266
267 err = -EINVAL;
268 if (tb[TCA_TCINDEX_FALL_THROUGH-1]) {
269 if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(u32))
270 goto errout;
271 cp.fall_through =
272 *(u32 *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]);
273 }
274
275 if (!cp.hash) {
276 /* Hash not specified, use perfect hash if the upper limit
277 * of the hashing index is below the threshold.
278 */
279 if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
280 cp.hash = (cp.mask >> cp.shift)+1;
281 else
282 cp.hash = DEFAULT_HASH_SIZE;
283 }
284
285 if (!cp.perfect && !cp.h)
286 cp.alloc_hash = cp.hash;
287
288 /* Note: this could be as restrictive as if (handle & ~(mask >> shift))
289 * but then, we'd fail handles that may become valid after some future
290 * mask change. While this is extremely unlikely to ever matter,
291 * the check below is safer (and also more backwards-compatible).
292 */
293 if (cp.perfect || valid_perfect_hash(&cp))
294 if (handle >= cp.alloc_hash)
295 goto errout;
296
297
298 err = -ENOMEM;
299 if (!cp.perfect && !cp.h) {
300 if (valid_perfect_hash(&cp)) {
301 cp.perfect = kmalloc(cp.hash * sizeof(*r), GFP_KERNEL);
302 if (!cp.perfect)
303 goto errout;
304 memset(cp.perfect, 0, cp.hash * sizeof(*r));
305 balloc = 1;
306 } else {
307 cp.h = kmalloc(cp.hash * sizeof(f), GFP_KERNEL);
308 if (!cp.h)
309 goto errout;
310 memset(cp.h, 0, cp.hash * sizeof(f));
311 balloc = 2;
312 }
313 }
314
315 if (cp.perfect)
316 r = cp.perfect + handle;
317 else
318 r = tcindex_lookup(&cp, handle) ? : &new_filter_result;
319
320 if (r == &new_filter_result) {
321 f = kmalloc(sizeof(*f), GFP_KERNEL);
322 if (!f)
323 goto errout_alloc;
324 memset(f, 0, sizeof(*f));
325 }
326
327 if (tb[TCA_TCINDEX_CLASSID-1]) {
328 cr.res.classid = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]);
329 tcf_bind_filter(tp, &cr.res, base);
330 }
331
332 tcf_exts_change(tp, &cr.exts, &e);
333
334 tcf_tree_lock(tp);
335 if (old_r && old_r != r)
336 memset(old_r, 0, sizeof(*old_r));
337
338 memcpy(p, &cp, sizeof(cp));
339 memcpy(r, &cr, sizeof(cr));
340
341 if (r == &new_filter_result) {
342 struct tcindex_filter **fp;
343
344 f->key = handle;
345 f->result = new_filter_result;
346 f->next = NULL;
347 for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next)
348 /* nothing */;
349 *fp = f;
350 }
351 tcf_tree_unlock(tp);
352
353 return 0;
354
355errout_alloc:
356 if (balloc == 1)
357 kfree(cp.perfect);
358 else if (balloc == 2)
359 kfree(cp.h);
360errout:
361 tcf_exts_destroy(tp, &e);
362 return err;
363}
364
365static int
366tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,
367 struct rtattr **tca, unsigned long *arg)
368{
369 struct rtattr *opt = tca[TCA_OPTIONS-1];
370 struct rtattr *tb[TCA_TCINDEX_MAX];
371 struct tcindex_data *p = PRIV(tp);
372 struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
373
374 DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
375 "p %p,r %p,*arg 0x%lx\n",
376 tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L);
377
378 if (!opt)
379 return 0;
380
381 if (rtattr_parse_nested(tb, TCA_TCINDEX_MAX, opt) < 0)
382 return -EINVAL;
383
384 return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE-1]);
385}
386
387
388static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
389{
390 struct tcindex_data *p = PRIV(tp);
391 struct tcindex_filter *f,*next;
392 int i;
393
394 DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p);
395 if (p->perfect) {
396 for (i = 0; i < p->hash; i++) {
397 if (!p->perfect[i].res.class)
398 continue;
399 if (walker->count >= walker->skip) {
400 if (walker->fn(tp,
401 (unsigned long) (p->perfect+i), walker)
402 < 0) {
403 walker->stop = 1;
404 return;
405 }
406 }
407 walker->count++;
408 }
409 }
410 if (!p->h)
411 return;
412 for (i = 0; i < p->hash; i++) {
413 for (f = p->h[i]; f; f = next) {
414 next = f->next;
415 if (walker->count >= walker->skip) {
416 if (walker->fn(tp,(unsigned long) &f->result,
417 walker) < 0) {
418 walker->stop = 1;
419 return;
420 }
421 }
422 walker->count++;
423 }
424 }
425}
426
427
428static int tcindex_destroy_element(struct tcf_proto *tp,
429 unsigned long arg, struct tcf_walker *walker)
430{
431 return __tcindex_delete(tp, arg, 0);
432}
433
434
435static void tcindex_destroy(struct tcf_proto *tp)
436{
437 struct tcindex_data *p = PRIV(tp);
438 struct tcf_walker walker;
439
440 DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p);
441 walker.count = 0;
442 walker.skip = 0;
443 walker.fn = &tcindex_destroy_element;
444 tcindex_walk(tp,&walker);
445 if (p->perfect)
446 kfree(p->perfect);
447 if (p->h)
448 kfree(p->h);
449 kfree(p);
450 tp->root = NULL;
451}
452
453
454static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
455 struct sk_buff *skb, struct tcmsg *t)
456{
457 struct tcindex_data *p = PRIV(tp);
458 struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
459 unsigned char *b = skb->tail;
460 struct rtattr *rta;
461
462 DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n",
463 tp,fh,skb,t,p,r,b);
464 DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h);
465 rta = (struct rtattr *) b;
466 RTA_PUT(skb,TCA_OPTIONS,0,NULL);
467 if (!fh) {
468 t->tcm_handle = ~0; /* whatever ... */
469 RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash);
470 RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask);
471 RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift);
472 RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through),
473 &p->fall_through);
474 rta->rta_len = skb->tail-b;
475 } else {
476 if (p->perfect) {
477 t->tcm_handle = r-p->perfect;
478 } else {
479 struct tcindex_filter *f;
480 int i;
481
482 t->tcm_handle = 0;
483 for (i = 0; !t->tcm_handle && i < p->hash; i++) {
484 for (f = p->h[i]; !t->tcm_handle && f;
485 f = f->next) {
486 if (&f->result == r)
487 t->tcm_handle = f->key;
488 }
489 }
490 }
491 DPRINTK("handle = %d\n",t->tcm_handle);
492 if (r->res.class)
493 RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid);
494
495 if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0)
496 goto rtattr_failure;
497 rta->rta_len = skb->tail-b;
498
499 if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0)
500 goto rtattr_failure;
501 }
502
503 return skb->len;
504
505rtattr_failure:
506 skb_trim(skb, b - skb->data);
507 return -1;
508}
509
510static struct tcf_proto_ops cls_tcindex_ops = {
511 .next = NULL,
512 .kind = "tcindex",
513 .classify = tcindex_classify,
514 .init = tcindex_init,
515 .destroy = tcindex_destroy,
516 .get = tcindex_get,
517 .put = tcindex_put,
518 .change = tcindex_change,
519 .delete = tcindex_delete,
520 .walk = tcindex_walk,
521 .dump = tcindex_dump,
522 .owner = THIS_MODULE,
523};
524
525static int __init init_tcindex(void)
526{
527 return register_tcf_proto_ops(&cls_tcindex_ops);
528}
529
530static void __exit exit_tcindex(void)
531{
532 unregister_tcf_proto_ops(&cls_tcindex_ops);
533}
534
535module_init(init_tcindex)
536module_exit(exit_tcindex)
537MODULE_LICENSE("GPL");
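
As a quick illustration of the lookup path implemented in cls_tcindex.c above, here is a minimal standalone C sketch (the ex_* names are illustrative stand-ins, not the kernel structures): the classifier masks and shifts skb->tc_index to form a key, resolves it through either the perfect array or the chained hash, and, with fall_through set, falls back to using the key itself as the minor class id.

#include <stdint.h>
#include <stdio.h>

struct ex_result { uint32_t classid; int set; };
struct ex_bucket { uint16_t key; struct ex_result res; struct ex_bucket *next; };

struct ex_tcindex {
	struct ex_result *perfect;	/* direct array indexed by key, or NULL */
	struct ex_bucket **h;		/* chained hash, used only if !perfect */
	uint16_t mask;			/* key = (tc_index & mask) >> shift */
	int shift;
	int hash;			/* hash table size */
	int fall_through;
};

static const struct ex_result *ex_lookup(const struct ex_tcindex *p, uint16_t key)
{
	/* For a perfect hash the kernel guarantees hash > (mask >> shift),
	 * so indexing by key is always in bounds. */
	if (p->perfect)
		return p->perfect[key].set ? &p->perfect[key] : NULL;
	if (p->h) {
		const struct ex_bucket *b;

		for (b = p->h[key % p->hash]; b; b = b->next)
			if (b->key == key)
				return &b->res;
	}
	return NULL;
}

static int ex_classify(const struct ex_tcindex *p, uint16_t tc_index,
		       uint32_t *classid)
{
	uint16_t key = (tc_index & p->mask) >> p->shift;
	const struct ex_result *r = ex_lookup(p, key);

	if (!r) {
		if (!p->fall_through)
			return -1;
		*classid = key;		/* the kernel uses TC_H_MAKE(major, key) */
		return 0;
	}
	*classid = r->classid;
	return 0;
}

int main(void)
{
	struct ex_result perfect[4] = { [2] = { .classid = 0x10002, .set = 1 } };
	struct ex_tcindex p = { .perfect = perfect, .mask = 0x3, .shift = 0,
				.hash = 4, .fall_through = 1 };
	uint32_t classid = 0;

	if (ex_classify(&p, 0xfffe, &classid) == 0)	/* key = 0xfffe & 0x3 = 2 */
		printf("classid 0x%x\n", (unsigned) classid);
	return 0;
}
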
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
new file mode 100644
index 000000000000..364b87d86455
--- /dev/null
+++ b/net/sched/cls_u32.c
@@ -0,0 +1,828 @@
1/*
2 * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * The filters are packed to hash tables of key nodes
12 * with a set of 32bit key/mask pairs at every node.
13 * Nodes reference next level hash tables etc.
14 *
15 * This scheme is the best universal classifier I managed to
16 * invent; it is not super-fast, but it is not slow (provided you
17 * program it correctly), and general enough. And its relative
18 * speed grows as the number of rules becomes larger.
19 *
20 * It seems that it represents the best middle point between
21 * speed and manageability both by human and by machine.
22 *
23 * It is especially useful for link sharing combined with QoS;
24 * pure RSVP doesn't need such a general approach and can use
25 * much simpler (and faster) schemes, sort of cls_rsvp.c.
26 *
27 * JHS: We should remove the CONFIG_NET_CLS_IND from here
28 * eventually when the meta match extension is made available
29 *
30 * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
31 */
32
33#include <asm/uaccess.h>
34#include <asm/system.h>
35#include <linux/bitops.h>
36#include <linux/config.h>
37#include <linux/module.h>
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/sched.h>
41#include <linux/string.h>
42#include <linux/mm.h>
43#include <linux/socket.h>
44#include <linux/sockios.h>
45#include <linux/in.h>
46#include <linux/errno.h>
47#include <linux/interrupt.h>
48#include <linux/if_ether.h>
49#include <linux/inet.h>
50#include <linux/netdevice.h>
51#include <linux/etherdevice.h>
52#include <linux/notifier.h>
53#include <linux/rtnetlink.h>
54#include <net/ip.h>
55#include <net/route.h>
56#include <linux/skbuff.h>
57#include <net/sock.h>
58#include <net/act_api.h>
59#include <net/pkt_cls.h>
60
61struct tc_u_knode
62{
63 struct tc_u_knode *next;
64 u32 handle;
65 struct tc_u_hnode *ht_up;
66 struct tcf_exts exts;
67#ifdef CONFIG_NET_CLS_IND
68 char indev[IFNAMSIZ];
69#endif
70 u8 fshift;
71 struct tcf_result res;
72 struct tc_u_hnode *ht_down;
73#ifdef CONFIG_CLS_U32_PERF
74 struct tc_u32_pcnt *pf;
75#endif
76#ifdef CONFIG_CLS_U32_MARK
77 struct tc_u32_mark mark;
78#endif
79 struct tc_u32_sel sel;
80};
81
82struct tc_u_hnode
83{
84 struct tc_u_hnode *next;
85 u32 handle;
86 u32 prio;
87 struct tc_u_common *tp_c;
88 int refcnt;
89 unsigned divisor;
90 struct tc_u_knode *ht[1];
91};
92
93struct tc_u_common
94{
95 struct tc_u_common *next;
96 struct tc_u_hnode *hlist;
97 struct Qdisc *q;
98 int refcnt;
99 u32 hgenerator;
100};
101
102static struct tcf_ext_map u32_ext_map = {
103 .action = TCA_U32_ACT,
104 .police = TCA_U32_POLICE
105};
106
107static struct tc_u_common *u32_list;
108
109static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift)
110{
111 unsigned h = (key & sel->hmask)>>fshift;
112
113 return h;
114}
115
116static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res)
117{
118 struct {
119 struct tc_u_knode *knode;
120 u8 *ptr;
121 } stack[TC_U32_MAXDEPTH];
122
123 struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
124 u8 *ptr = skb->nh.raw;
125 struct tc_u_knode *n;
126 int sdepth = 0;
127 int off2 = 0;
128 int sel = 0;
129#ifdef CONFIG_CLS_U32_PERF
130 int j;
131#endif
132 int i, r;
133
134next_ht:
135 n = ht->ht[sel];
136
137next_knode:
138 if (n) {
139 struct tc_u32_key *key = n->sel.keys;
140
141#ifdef CONFIG_CLS_U32_PERF
142 n->pf->rcnt +=1;
143 j = 0;
144#endif
145
146#ifdef CONFIG_CLS_U32_MARK
147 if ((skb->nfmark & n->mark.mask) != n->mark.val) {
148 n = n->next;
149 goto next_knode;
150 } else {
151 n->mark.success++;
152 }
153#endif
154
155 for (i = n->sel.nkeys; i>0; i--, key++) {
156
157 if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
158 n = n->next;
159 goto next_knode;
160 }
161#ifdef CONFIG_CLS_U32_PERF
162 n->pf->kcnts[j] +=1;
163 j++;
164#endif
165 }
166 if (n->ht_down == NULL) {
167check_terminal:
168 if (n->sel.flags&TC_U32_TERMINAL) {
169
170 *res = n->res;
171#ifdef CONFIG_NET_CLS_IND
172 if (!tcf_match_indev(skb, n->indev)) {
173 n = n->next;
174 goto next_knode;
175 }
176#endif
177#ifdef CONFIG_CLS_U32_PERF
178 n->pf->rhit +=1;
179#endif
180 r = tcf_exts_exec(skb, &n->exts, res);
181 if (r < 0) {
182 n = n->next;
183 goto next_knode;
184 }
185
186 return r;
187 }
188 n = n->next;
189 goto next_knode;
190 }
191
192 /* PUSH */
193 if (sdepth >= TC_U32_MAXDEPTH)
194 goto deadloop;
195 stack[sdepth].knode = n;
196 stack[sdepth].ptr = ptr;
197 sdepth++;
198
199 ht = n->ht_down;
200 sel = 0;
201 if (ht->divisor)
202 sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift);
203
204 if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
205 goto next_ht;
206
207 if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
208 off2 = n->sel.off + 3;
209 if (n->sel.flags&TC_U32_VAROFFSET)
210 off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift;
211 off2 &= ~3;
212 }
213 if (n->sel.flags&TC_U32_EAT) {
214 ptr += off2;
215 off2 = 0;
216 }
217
218 if (ptr < skb->tail)
219 goto next_ht;
220 }
221
222 /* POP */
223 if (sdepth--) {
224 n = stack[sdepth].knode;
225 ht = n->ht_up;
226 ptr = stack[sdepth].ptr;
227 goto check_terminal;
228 }
229 return -1;
230
231deadloop:
232 if (net_ratelimit())
233 printk("cls_u32: dead loop\n");
234 return -1;
235}
236
237static __inline__ struct tc_u_hnode *
238u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
239{
240 struct tc_u_hnode *ht;
241
242 for (ht = tp_c->hlist; ht; ht = ht->next)
243 if (ht->handle == handle)
244 break;
245
246 return ht;
247}
248
249static __inline__ struct tc_u_knode *
250u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
251{
252 unsigned sel;
253 struct tc_u_knode *n = NULL;
254
255 sel = TC_U32_HASH(handle);
256 if (sel > ht->divisor)
257 goto out;
258
259 for (n = ht->ht[sel]; n; n = n->next)
260 if (n->handle == handle)
261 break;
262out:
263 return n;
264}
265
266
267static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
268{
269 struct tc_u_hnode *ht;
270 struct tc_u_common *tp_c = tp->data;
271
272 if (TC_U32_HTID(handle) == TC_U32_ROOT)
273 ht = tp->root;
274 else
275 ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
276
277 if (!ht)
278 return 0;
279
280 if (TC_U32_KEY(handle) == 0)
281 return (unsigned long)ht;
282
283 return (unsigned long)u32_lookup_key(ht, handle);
284}
285
286static void u32_put(struct tcf_proto *tp, unsigned long f)
287{
288}
289
290static u32 gen_new_htid(struct tc_u_common *tp_c)
291{
292 int i = 0x800;
293
294 do {
295 if (++tp_c->hgenerator == 0x7FF)
296 tp_c->hgenerator = 1;
297 } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
298
299 return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
300}
301
302static int u32_init(struct tcf_proto *tp)
303{
304 struct tc_u_hnode *root_ht;
305 struct tc_u_common *tp_c;
306
307 for (tp_c = u32_list; tp_c; tp_c = tp_c->next)
308 if (tp_c->q == tp->q)
309 break;
310
311 root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL);
312 if (root_ht == NULL)
313 return -ENOBUFS;
314
315 memset(root_ht, 0, sizeof(*root_ht));
316 root_ht->divisor = 0;
317 root_ht->refcnt++;
318 root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
319 root_ht->prio = tp->prio;
320
321 if (tp_c == NULL) {
322 tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL);
323 if (tp_c == NULL) {
324 kfree(root_ht);
325 return -ENOBUFS;
326 }
327 memset(tp_c, 0, sizeof(*tp_c));
328 tp_c->q = tp->q;
329 tp_c->next = u32_list;
330 u32_list = tp_c;
331 }
332
333 tp_c->refcnt++;
334 root_ht->next = tp_c->hlist;
335 tp_c->hlist = root_ht;
336 root_ht->tp_c = tp_c;
337
338 tp->root = root_ht;
339 tp->data = tp_c;
340 return 0;
341}
342
343static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
344{
345 tcf_unbind_filter(tp, &n->res);
346 tcf_exts_destroy(tp, &n->exts);
347 if (n->ht_down)
348 n->ht_down->refcnt--;
349#ifdef CONFIG_CLS_U32_PERF
350 if (n && (NULL != n->pf))
351 kfree(n->pf);
352#endif
353 kfree(n);
354 return 0;
355}
356
357static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
358{
359 struct tc_u_knode **kp;
360 struct tc_u_hnode *ht = key->ht_up;
361
362 if (ht) {
363 for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) {
364 if (*kp == key) {
365 tcf_tree_lock(tp);
366 *kp = key->next;
367 tcf_tree_unlock(tp);
368
369 u32_destroy_key(tp, key);
370 return 0;
371 }
372 }
373 }
374 BUG_TRAP(0);
375 return 0;
376}
377
378static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
379{
380 struct tc_u_knode *n;
381 unsigned h;
382
383 for (h=0; h<=ht->divisor; h++) {
384 while ((n = ht->ht[h]) != NULL) {
385 ht->ht[h] = n->next;
386
387 u32_destroy_key(tp, n);
388 }
389 }
390}
391
392static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
393{
394 struct tc_u_common *tp_c = tp->data;
395 struct tc_u_hnode **hn;
396
397 BUG_TRAP(!ht->refcnt);
398
399 u32_clear_hnode(tp, ht);
400
401 for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) {
402 if (*hn == ht) {
403 *hn = ht->next;
404 kfree(ht);
405 return 0;
406 }
407 }
408
409 BUG_TRAP(0);
410 return -ENOENT;
411}
412
413static void u32_destroy(struct tcf_proto *tp)
414{
415 struct tc_u_common *tp_c = tp->data;
416 struct tc_u_hnode *root_ht = xchg(&tp->root, NULL);
417
418 BUG_TRAP(root_ht != NULL);
419
420 if (root_ht && --root_ht->refcnt == 0)
421 u32_destroy_hnode(tp, root_ht);
422
423 if (--tp_c->refcnt == 0) {
424 struct tc_u_hnode *ht;
425 struct tc_u_common **tp_cp;
426
427 for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) {
428 if (*tp_cp == tp_c) {
429 *tp_cp = tp_c->next;
430 break;
431 }
432 }
433
434 for (ht=tp_c->hlist; ht; ht = ht->next)
435 u32_clear_hnode(tp, ht);
436
437 while ((ht = tp_c->hlist) != NULL) {
438 tp_c->hlist = ht->next;
439
440 BUG_TRAP(ht->refcnt == 0);
441
442 kfree(ht);
443 };
444
445 kfree(tp_c);
446 }
447
448 tp->data = NULL;
449}
450
451static int u32_delete(struct tcf_proto *tp, unsigned long arg)
452{
453 struct tc_u_hnode *ht = (struct tc_u_hnode*)arg;
454
455 if (ht == NULL)
456 return 0;
457
458 if (TC_U32_KEY(ht->handle))
459 return u32_delete_key(tp, (struct tc_u_knode*)ht);
460
461 if (tp->root == ht)
462 return -EINVAL;
463
464 if (--ht->refcnt == 0)
465 u32_destroy_hnode(tp, ht);
466
467 return 0;
468}
469
470static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
471{
472 struct tc_u_knode *n;
473 unsigned i = 0x7FF;
474
475 for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
476 if (i < TC_U32_NODE(n->handle))
477 i = TC_U32_NODE(n->handle);
478 i++;
479
480 return handle|(i>0xFFF ? 0xFFF : i);
481}
482
483static int u32_set_parms(struct tcf_proto *tp, unsigned long base,
484 struct tc_u_hnode *ht,
485 struct tc_u_knode *n, struct rtattr **tb,
486 struct rtattr *est)
487{
488 int err;
489 struct tcf_exts e;
490
491 err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map);
492 if (err < 0)
493 return err;
494
495 err = -EINVAL;
496 if (tb[TCA_U32_LINK-1]) {
497 u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]);
498 struct tc_u_hnode *ht_down = NULL;
499
500 if (TC_U32_KEY(handle))
501 goto errout;
502
503 if (handle) {
504 ht_down = u32_lookup_ht(ht->tp_c, handle);
505
506 if (ht_down == NULL)
507 goto errout;
508 ht_down->refcnt++;
509 }
510
511 tcf_tree_lock(tp);
512 ht_down = xchg(&n->ht_down, ht_down);
513 tcf_tree_unlock(tp);
514
515 if (ht_down)
516 ht_down->refcnt--;
517 }
518 if (tb[TCA_U32_CLASSID-1]) {
519 n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]);
520 tcf_bind_filter(tp, &n->res, base);
521 }
522
523#ifdef CONFIG_NET_CLS_IND
524 if (tb[TCA_U32_INDEV-1]) {
525 int err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV-1]);
526 if (err < 0)
527 goto errout;
528 }
529#endif
530 tcf_exts_change(tp, &n->exts, &e);
531
532 return 0;
533errout:
534 tcf_exts_destroy(tp, &e);
535 return err;
536}
537
538static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
539 struct rtattr **tca,
540 unsigned long *arg)
541{
542 struct tc_u_common *tp_c = tp->data;
543 struct tc_u_hnode *ht;
544 struct tc_u_knode *n;
545 struct tc_u32_sel *s;
546 struct rtattr *opt = tca[TCA_OPTIONS-1];
547 struct rtattr *tb[TCA_U32_MAX];
548 u32 htid;
549 int err;
550
551 if (opt == NULL)
552 return handle ? -EINVAL : 0;
553
554 if (rtattr_parse_nested(tb, TCA_U32_MAX, opt) < 0)
555 return -EINVAL;
556
557 if ((n = (struct tc_u_knode*)*arg) != NULL) {
558 if (TC_U32_KEY(n->handle) == 0)
559 return -EINVAL;
560
561 return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE-1]);
562 }
563
564 if (tb[TCA_U32_DIVISOR-1]) {
565 unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]);
566
567 if (--divisor > 0x100)
568 return -EINVAL;
569 if (TC_U32_KEY(handle))
570 return -EINVAL;
571 if (handle == 0) {
572 handle = gen_new_htid(tp->data);
573 if (handle == 0)
574 return -ENOMEM;
575 }
576 ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL);
577 if (ht == NULL)
578 return -ENOBUFS;
579 memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*));
580 ht->tp_c = tp_c;
581 ht->refcnt = 0;
582 ht->divisor = divisor;
583 ht->handle = handle;
584 ht->prio = tp->prio;
585 ht->next = tp_c->hlist;
586 tp_c->hlist = ht;
587 *arg = (unsigned long)ht;
588 return 0;
589 }
590
591 if (tb[TCA_U32_HASH-1]) {
592 htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]);
593 if (TC_U32_HTID(htid) == TC_U32_ROOT) {
594 ht = tp->root;
595 htid = ht->handle;
596 } else {
597 ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
598 if (ht == NULL)
599 return -EINVAL;
600 }
601 } else {
602 ht = tp->root;
603 htid = ht->handle;
604 }
605
606 if (ht->divisor < TC_U32_HASH(htid))
607 return -EINVAL;
608
609 if (handle) {
610 if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
611 return -EINVAL;
612 handle = htid | TC_U32_NODE(handle);
613 } else
614 handle = gen_new_kid(ht, htid);
615
616 if (tb[TCA_U32_SEL-1] == 0 ||
617 RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel))
618 return -EINVAL;
619
620 s = RTA_DATA(tb[TCA_U32_SEL-1]);
621
622 n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
623 if (n == NULL)
624 return -ENOBUFS;
625
626 memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key));
627#ifdef CONFIG_CLS_U32_PERF
628 n->pf = kmalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL);
629 if (n->pf == NULL) {
630 kfree(n);
631 return -ENOBUFS;
632 }
633 memset(n->pf, 0, sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64));
634#endif
635
636 memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
637 n->ht_up = ht;
638 n->handle = handle;
639{
640 u8 i = 0;
641 u32 mask = s->hmask;
642 if (mask) {
643 while (!(mask & 1)) {
644 i++;
645 mask>>=1;
646 }
647 }
648 n->fshift = i;
649}
650
651#ifdef CONFIG_CLS_U32_MARK
652 if (tb[TCA_U32_MARK-1]) {
653 struct tc_u32_mark *mark;
654
655 if (RTA_PAYLOAD(tb[TCA_U32_MARK-1]) < sizeof(struct tc_u32_mark)) {
656#ifdef CONFIG_CLS_U32_PERF
657 kfree(n->pf);
658#endif
659 kfree(n);
660 return -EINVAL;
661 }
662 mark = RTA_DATA(tb[TCA_U32_MARK-1]);
663 memcpy(&n->mark, mark, sizeof(struct tc_u32_mark));
664 n->mark.success = 0;
665 }
666#endif
667
668 err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE-1]);
669 if (err == 0) {
670 struct tc_u_knode **ins;
671 for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
672 if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
673 break;
674
675 n->next = *ins;
676 wmb();
677 *ins = n;
678
679 *arg = (unsigned long)n;
680 return 0;
681 }
682#ifdef CONFIG_CLS_U32_PERF
683 if (n && (NULL != n->pf))
684 kfree(n->pf);
685#endif
686 kfree(n);
687 return err;
688}
689
690static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
691{
692 struct tc_u_common *tp_c = tp->data;
693 struct tc_u_hnode *ht;
694 struct tc_u_knode *n;
695 unsigned h;
696
697 if (arg->stop)
698 return;
699
700 for (ht = tp_c->hlist; ht; ht = ht->next) {
701 if (ht->prio != tp->prio)
702 continue;
703 if (arg->count >= arg->skip) {
704 if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
705 arg->stop = 1;
706 return;
707 }
708 }
709 arg->count++;
710 for (h = 0; h <= ht->divisor; h++) {
711 for (n = ht->ht[h]; n; n = n->next) {
712 if (arg->count < arg->skip) {
713 arg->count++;
714 continue;
715 }
716 if (arg->fn(tp, (unsigned long)n, arg) < 0) {
717 arg->stop = 1;
718 return;
719 }
720 arg->count++;
721 }
722 }
723 }
724}
725
726static int u32_dump(struct tcf_proto *tp, unsigned long fh,
727 struct sk_buff *skb, struct tcmsg *t)
728{
729 struct tc_u_knode *n = (struct tc_u_knode*)fh;
730 unsigned char *b = skb->tail;
731 struct rtattr *rta;
732
733 if (n == NULL)
734 return skb->len;
735
736 t->tcm_handle = n->handle;
737
738 rta = (struct rtattr*)b;
739 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
740
741 if (TC_U32_KEY(n->handle) == 0) {
742 struct tc_u_hnode *ht = (struct tc_u_hnode*)fh;
743 u32 divisor = ht->divisor+1;
744 RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor);
745 } else {
746 RTA_PUT(skb, TCA_U32_SEL,
747 sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
748 &n->sel);
749 if (n->ht_up) {
750 u32 htid = n->handle & 0xFFFFF000;
751 RTA_PUT(skb, TCA_U32_HASH, 4, &htid);
752 }
753 if (n->res.classid)
754 RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid);
755 if (n->ht_down)
756 RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle);
757
758#ifdef CONFIG_CLS_U32_MARK
759 if (n->mark.val || n->mark.mask)
760 RTA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark);
761#endif
762
763 if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0)
764 goto rtattr_failure;
765
766#ifdef CONFIG_NET_CLS_IND
767 if(strlen(n->indev))
768 RTA_PUT(skb, TCA_U32_INDEV, IFNAMSIZ, n->indev);
769#endif
770#ifdef CONFIG_CLS_U32_PERF
771 RTA_PUT(skb, TCA_U32_PCNT,
772 sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
773 n->pf);
774#endif
775 }
776
777 rta->rta_len = skb->tail - b;
778 if (TC_U32_KEY(n->handle))
779 if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0)
780 goto rtattr_failure;
781 return skb->len;
782
783rtattr_failure:
784 skb_trim(skb, b - skb->data);
785 return -1;
786}
787
788static struct tcf_proto_ops cls_u32_ops = {
789 .next = NULL,
790 .kind = "u32",
791 .classify = u32_classify,
792 .init = u32_init,
793 .destroy = u32_destroy,
794 .get = u32_get,
795 .put = u32_put,
796 .change = u32_change,
797 .delete = u32_delete,
798 .walk = u32_walk,
799 .dump = u32_dump,
800 .owner = THIS_MODULE,
801};
802
803static int __init init_u32(void)
804{
805 printk("u32 classifier\n");
806#ifdef CONFIG_CLS_U32_PERF
807	printk(" Performance counters on\n");
808#endif
809#ifdef CONFIG_NET_CLS_POLICE
810 printk(" OLD policer on \n");
811#endif
812#ifdef CONFIG_NET_CLS_IND
813 printk(" input device check on \n");
814#endif
815#ifdef CONFIG_NET_CLS_ACT
816 printk(" Actions configured \n");
817#endif
818 return register_tcf_proto_ops(&cls_u32_ops);
819}
820
821static void __exit exit_u32(void)
822{
823 unregister_tcf_proto_ops(&cls_u32_ops);
824}
825
826module_init(init_u32)
827module_exit(exit_u32)
828MODULE_LICENSE("GPL");
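
To make the key test in u32_classify() above easier to follow, here is a hedged standalone C sketch of the two central operations: the per-key test ((packet word XOR val) AND mask) == 0, and the bucket selection done by u32_hash_fold(). The ex_* names are stand-ins for illustration only, and byte order is deliberately ignored, whereas the kernel compares raw network-order words in place.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct ex_u32_key {
	uint32_t mask;		/* which bits of the packet word to look at */
	uint32_t val;		/* expected value of those bits */
	int off;		/* byte offset of the word within the header */
};

/* A selector matches only if every key matches; the per-key test is the
 * same expression used in u32_classify():
 * ((word at off) ^ val) & mask == 0. */
static int ex_u32_match(const uint8_t *pkt, size_t len,
			const struct ex_u32_key *keys, int nkeys)
{
	int i;

	for (i = 0; i < nkeys; i++) {
		uint32_t word;

		if ((size_t) keys[i].off + sizeof(word) > len)
			return 0;
		memcpy(&word, pkt + keys[i].off, sizeof(word));
		if ((word ^ keys[i].val) & keys[i].mask)
			return 0;
	}
	return 1;
}

/* Bucket selection mirrors u32_hash_fold(): mask the hash word, shift right
 * by fshift (the index of hmask's lowest set bit) and fold with the hash
 * table divisor. */
static unsigned ex_u32_fold(uint32_t word, uint32_t hmask, uint8_t fshift,
			    unsigned divisor)
{
	return ((word & hmask) >> fshift) & divisor;
}

int main(void)
{
	/* Toy packet; everything is kept in host byte order for clarity. */
	uint8_t pkt[8] = { 0 };
	uint32_t first_word = 0x12345678;
	struct ex_u32_key key = { .mask = 0xffff0000, .val = 0x12340000, .off = 0 };

	memcpy(pkt, &first_word, sizeof(first_word));
	printf("match: %d\n", ex_u32_match(pkt, sizeof(pkt), &key, 1));
	printf("bucket: %u\n", ex_u32_fold(first_word, 0x0000ff00, 8, 0xf));
	return 0;
}
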
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
new file mode 100644
index 000000000000..bf1f00f8b1bf
--- /dev/null
+++ b/net/sched/em_cmp.c
@@ -0,0 +1,101 @@
1/*
2 * net/sched/em_cmp.c Simple packet data comparison ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/skbuff.h>
17#include <linux/tc_ematch/tc_em_cmp.h>
18#include <net/pkt_cls.h>
19
20static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
21{
22 return unlikely(cmp->flags & TCF_EM_CMP_TRANS);
23}
24
25static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
26 struct tcf_pkt_info *info)
27{
28 struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
29 unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
30 u32 val = 0;
31
32 if (!tcf_valid_offset(skb, ptr, cmp->align))
33 return 0;
34
35 switch (cmp->align) {
36 case TCF_EM_ALIGN_U8:
37 val = *ptr;
38 break;
39
40 case TCF_EM_ALIGN_U16:
41 val = *ptr << 8;
42 val |= *(ptr+1);
43
44 if (cmp_needs_transformation(cmp))
45 val = be16_to_cpu(val);
46 break;
47
48 case TCF_EM_ALIGN_U32:
49		/* Worth checking boundaries? The branching seems
50		 * to get worse. Revisit later. */
51 val = *ptr << 24;
52 val |= *(ptr+1) << 16;
53 val |= *(ptr+2) << 8;
54 val |= *(ptr+3);
55
56 if (cmp_needs_transformation(cmp))
57 val = be32_to_cpu(val);
58 break;
59
60 default:
61 return 0;
62 }
63
64 if (cmp->mask)
65 val &= cmp->mask;
66
67 switch (cmp->opnd) {
68 case TCF_EM_OPND_EQ:
69 return val == cmp->val;
70 case TCF_EM_OPND_LT:
71 return val < cmp->val;
72 case TCF_EM_OPND_GT:
73 return val > cmp->val;
74 }
75
76 return 0;
77}
78
79static struct tcf_ematch_ops em_cmp_ops = {
80 .kind = TCF_EM_CMP,
81 .datalen = sizeof(struct tcf_em_cmp),
82 .match = em_cmp_match,
83 .owner = THIS_MODULE,
84 .link = LIST_HEAD_INIT(em_cmp_ops.link)
85};
86
87static int __init init_em_cmp(void)
88{
89 return tcf_em_register(&em_cmp_ops);
90}
91
92static void __exit exit_em_cmp(void)
93{
94 tcf_em_unregister(&em_cmp_ops);
95}
96
97MODULE_LICENSE("GPL");
98
99module_init(init_em_cmp);
100module_exit(exit_em_cmp);
101
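
The comparison in em_cmp_match() above reduces to: load 1, 2 or 4 bytes at a configurable offset, optionally mask them, then apply an equal/less-than/greater-than test. A minimal userspace sketch, with ex_* as hypothetical stand-in names:

#include <stdint.h>
#include <stdio.h>

enum ex_opnd { EX_EQ, EX_LT, EX_GT };

struct ex_cmp {
	int off;		/* byte offset into the chosen layer */
	int align;		/* access width: 1, 2 or 4 bytes */
	uint32_t mask;		/* optional mask, 0 means "no mask" */
	uint32_t val;		/* value to compare against */
	enum ex_opnd opnd;
};

static int ex_cmp_match(const uint8_t *ptr, const struct ex_cmp *cmp)
{
	uint32_t v = 0;
	int i;

	/* Assemble the value byte by byte, as em_cmp_match() does; the
	 * kernel additionally bounds-checks with tcf_valid_offset(). */
	for (i = 0; i < cmp->align; i++)
		v = (v << 8) | ptr[cmp->off + i];

	if (cmp->mask)
		v &= cmp->mask;

	switch (cmp->opnd) {
	case EX_EQ: return v == cmp->val;
	case EX_LT: return v < cmp->val;
	case EX_GT: return v > cmp->val;
	}
	return 0;
}

int main(void)
{
	/* "Is the byte at offset 9 (the IPv4 protocol field) equal to 17?" */
	uint8_t hdr[20] = { [9] = 17 };
	struct ex_cmp cmp = { .off = 9, .align = 1, .val = 17, .opnd = EX_EQ };

	printf("udp: %d\n", ex_cmp_match(hdr, &cmp));
	return 0;
}
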
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
new file mode 100644
index 000000000000..f1eeaf65cee5
--- /dev/null
+++ b/net/sched/em_meta.c
@@ -0,0 +1,661 @@
1/*
2 * net/sched/em_meta.c Metadata ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * ==========================================================================
12 *
13 * The metadata ematch compares two meta objects where each object
14 * represents either a meta value stored in the kernel or a static
15 * value provided by userspace. The objects are not provided by
16 * userspace itself but rather a definition providing the information
17 * to build them. Every object is of a certain type which must be
18 * equal to the object it is being compared to.
19 *
20 * The definition of an object consists of the type (meta type), an
21 * identifier (meta id) and additional type specific information.
22 * The meta id is either TCF_META_ID_VALUE for values provided by
23 * userspace or an index into the meta operations table consisting of
24 * function pointers to type specific meta data collectors returning
25 * the value of the requested meta value.
26 *
27 * lvalue rvalue
28 * +-----------+ +-----------+
29 * | type: INT | | type: INT |
30 * def | id: INDEV | | id: VALUE |
31 * | data: | | data: 3 |
32 * +-----------+ +-----------+
33 * | |
34 * ---> meta_ops[INT][INDEV](...) |
35 * | |
36 * ----------- |
37 * V V
38 * +-----------+ +-----------+
39 * | type: INT | | type: INT |
40 * obj | id: INDEV | | id: VALUE |
41 * | data: 2 |<--data got filled out | data: 3 |
42 * +-----------+ +-----------+
43 * | |
44 * --------------> 2 equals 3 <--------------
45 *
46 * This is a simplified schema; the complexity varies depending
47 * on the meta type. Obviously, the length of the data must also
48 * be provided for non-numeric types.
49 *
50 * Additionally, type dependent modifiers such as shift operators
51 * or masks may be applied to extend the functionality. As of now,
52 * the variable length type supports shifting the byte string to
53 * the right, eating up any number of octets and thus supporting
54 * wildcard interface name comparisons such as "ppp%" matching
55 * ppp0..9.
56 *
57 * NOTE: Certain meta values depend on other subsystems and are
58 * only available if that subsystem is enabled in the kernel.
59 */
60
61#include <linux/config.h>
62#include <linux/module.h>
63#include <linux/types.h>
64#include <linux/kernel.h>
65#include <linux/sched.h>
66#include <linux/string.h>
67#include <linux/skbuff.h>
68#include <linux/random.h>
69#include <linux/tc_ematch/tc_em_meta.h>
70#include <net/dst.h>
71#include <net/route.h>
72#include <net/pkt_cls.h>
73
74struct meta_obj
75{
76 unsigned long value;
77 unsigned int len;
78};
79
80struct meta_value
81{
82 struct tcf_meta_val hdr;
83 unsigned long val;
84 unsigned int len;
85};
86
87struct meta_match
88{
89 struct meta_value lvalue;
90 struct meta_value rvalue;
91};
92
93static inline int meta_id(struct meta_value *v)
94{
95 return TCF_META_ID(v->hdr.kind);
96}
97
98static inline int meta_type(struct meta_value *v)
99{
100 return TCF_META_TYPE(v->hdr.kind);
101}
102
103#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \
104 struct tcf_pkt_info *info, struct meta_value *v, \
105 struct meta_obj *dst, int *err)
106
107/**************************************************************************
108 * System status & misc
109 **************************************************************************/
110
111META_COLLECTOR(int_random)
112{
113 get_random_bytes(&dst->value, sizeof(dst->value));
114}
115
116static inline unsigned long fixed_loadavg(int load)
117{
118 int rnd_load = load + (FIXED_1/200);
119 int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT;
120
121 return ((rnd_load >> FSHIFT) * 100) + rnd_frac;
122}
123
124META_COLLECTOR(int_loadavg_0)
125{
126 dst->value = fixed_loadavg(avenrun[0]);
127}
128
129META_COLLECTOR(int_loadavg_1)
130{
131 dst->value = fixed_loadavg(avenrun[1]);
132}
133
134META_COLLECTOR(int_loadavg_2)
135{
136 dst->value = fixed_loadavg(avenrun[2]);
137}
138
139/**************************************************************************
140 * Device names & indices
141 **************************************************************************/
142
143static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
144{
145 if (unlikely(dev == NULL))
146 return -1;
147
148 dst->value = dev->ifindex;
149 return 0;
150}
151
152static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
153{
154 if (unlikely(dev == NULL))
155 return -1;
156
157 dst->value = (unsigned long) dev->name;
158 dst->len = strlen(dev->name);
159 return 0;
160}
161
162META_COLLECTOR(int_dev)
163{
164 *err = int_dev(skb->dev, dst);
165}
166
167META_COLLECTOR(var_dev)
168{
169 *err = var_dev(skb->dev, dst);
170}
171
172META_COLLECTOR(int_indev)
173{
174 *err = int_dev(skb->input_dev, dst);
175}
176
177META_COLLECTOR(var_indev)
178{
179 *err = var_dev(skb->input_dev, dst);
180}
181
182META_COLLECTOR(int_realdev)
183{
184 *err = int_dev(skb->real_dev, dst);
185}
186
187META_COLLECTOR(var_realdev)
188{
189 *err = var_dev(skb->real_dev, dst);
190}
191
192/**************************************************************************
193 * skb attributes
194 **************************************************************************/
195
196META_COLLECTOR(int_priority)
197{
198 dst->value = skb->priority;
199}
200
201META_COLLECTOR(int_protocol)
202{
203 /* Let userspace take care of the byte ordering */
204 dst->value = skb->protocol;
205}
206
207META_COLLECTOR(int_security)
208{
209 dst->value = skb->security;
210}
211
212META_COLLECTOR(int_pkttype)
213{
214 dst->value = skb->pkt_type;
215}
216
217META_COLLECTOR(int_pktlen)
218{
219 dst->value = skb->len;
220}
221
222META_COLLECTOR(int_datalen)
223{
224 dst->value = skb->data_len;
225}
226
227META_COLLECTOR(int_maclen)
228{
229 dst->value = skb->mac_len;
230}
231
232/**************************************************************************
233 * Netfilter
234 **************************************************************************/
235
236#ifdef CONFIG_NETFILTER
237META_COLLECTOR(int_nfmark)
238{
239 dst->value = skb->nfmark;
240}
241#endif
242
243/**************************************************************************
244 * Traffic Control
245 **************************************************************************/
246
247META_COLLECTOR(int_tcindex)
248{
249 dst->value = skb->tc_index;
250}
251
252#ifdef CONFIG_NET_CLS_ACT
253META_COLLECTOR(int_tcverd)
254{
255 dst->value = skb->tc_verd;
256}
257
258META_COLLECTOR(int_tcclassid)
259{
260 dst->value = skb->tc_classid;
261}
262#endif
263
264/**************************************************************************
265 * Routing
266 **************************************************************************/
267
268#ifdef CONFIG_NET_CLS_ROUTE
269META_COLLECTOR(int_rtclassid)
270{
271 if (unlikely(skb->dst == NULL))
272 *err = -1;
273 else
274 dst->value = skb->dst->tclassid;
275}
276#endif
277
278META_COLLECTOR(int_rtiif)
279{
280 if (unlikely(skb->dst == NULL))
281 *err = -1;
282 else
283 dst->value = ((struct rtable*) skb->dst)->fl.iif;
284}
285
286/**************************************************************************
287 * Meta value collectors assignment table
288 **************************************************************************/
289
290struct meta_ops
291{
292 void (*get)(struct sk_buff *, struct tcf_pkt_info *,
293 struct meta_value *, struct meta_obj *, int *);
294};
295
296/* Meta value operations table listing all meta value collectors and
297 * assigning them to a type and meta id. */
298static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
299 [TCF_META_TYPE_VAR] = {
300 [TCF_META_ID_DEV] = { .get = meta_var_dev },
301 [TCF_META_ID_INDEV] = { .get = meta_var_indev },
302 [TCF_META_ID_REALDEV] = { .get = meta_var_realdev }
303 },
304 [TCF_META_TYPE_INT] = {
305 [TCF_META_ID_RANDOM] = { .get = meta_int_random },
306 [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 },
307 [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 },
308 [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 },
309 [TCF_META_ID_DEV] = { .get = meta_int_dev },
310 [TCF_META_ID_INDEV] = { .get = meta_int_indev },
311 [TCF_META_ID_REALDEV] = { .get = meta_int_realdev },
312 [TCF_META_ID_PRIORITY] = { .get = meta_int_priority },
313 [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol },
314 [TCF_META_ID_SECURITY] = { .get = meta_int_security },
315 [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype },
316 [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen },
317 [TCF_META_ID_DATALEN] = { .get = meta_int_datalen },
318 [TCF_META_ID_MACLEN] = { .get = meta_int_maclen },
319#ifdef CONFIG_NETFILTER
320 [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark },
321#endif
322 [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex },
323#ifdef CONFIG_NET_CLS_ACT
324 [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd },
325 [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid },
326#endif
327#ifdef CONFIG_NET_CLS_ROUTE
328 [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid },
329#endif
330 [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif }
331 }
332};
333
334static inline struct meta_ops * meta_ops(struct meta_value *val)
335{
336 return &__meta_ops[meta_type(val)][meta_id(val)];
337}
338
339/**************************************************************************
340 * Type specific operations for TCF_META_TYPE_VAR
341 **************************************************************************/
342
343static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
344{
345 int r = a->len - b->len;
346
347 if (r == 0)
348 r = memcmp((void *) a->value, (void *) b->value, a->len);
349
350 return r;
351}
352
353static int meta_var_change(struct meta_value *dst, struct rtattr *rta)
354{
355 int len = RTA_PAYLOAD(rta);
356
357 dst->val = (unsigned long) kmalloc(len, GFP_KERNEL);
358 if (dst->val == 0UL)
359 return -ENOMEM;
360 memcpy((void *) dst->val, RTA_DATA(rta), len);
361 dst->len = len;
362 return 0;
363}
364
365static void meta_var_destroy(struct meta_value *v)
366{
367 if (v->val)
368 kfree((void *) v->val);
369}
370
371static void meta_var_apply_extras(struct meta_value *v,
372 struct meta_obj *dst)
373{
374 int shift = v->hdr.shift;
375
376 if (shift && shift < dst->len)
377 dst->len -= shift;
378}
379
380static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
381{
382 if (v->val && v->len)
383 RTA_PUT(skb, tlv, v->len, (void *) v->val);
384 return 0;
385
386rtattr_failure:
387 return -1;
388}
389
390/**************************************************************************
391 * Type specific operations for TCF_META_TYPE_INT
392 **************************************************************************/
393
394static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
395{
396	/* Let gcc optimize it; the unlikely() is not backed by
397	 * measurements, but jump-free code for mismatches seems
398	 * more logical. */
399	if (unlikely(a->value == b->value))
400		return 0;
401	else if (a->value < b->value)
402 return -1;
403 else
404 return 1;
405}
406
407static int meta_int_change(struct meta_value *dst, struct rtattr *rta)
408{
409 if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) {
410 dst->val = *(unsigned long *) RTA_DATA(rta);
411 dst->len = sizeof(unsigned long);
412 } else if (RTA_PAYLOAD(rta) == sizeof(u32)) {
413 dst->val = *(u32 *) RTA_DATA(rta);
414 dst->len = sizeof(u32);
415 } else
416 return -EINVAL;
417
418 return 0;
419}
420
421static void meta_int_apply_extras(struct meta_value *v,
422 struct meta_obj *dst)
423{
424 if (v->hdr.shift)
425 dst->value >>= v->hdr.shift;
426
427 if (v->val)
428 dst->value &= v->val;
429}
430
431static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
432{
433 if (v->len == sizeof(unsigned long))
434 RTA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
435 else if (v->len == sizeof(u32)) {
436 u32 d = v->val;
437 RTA_PUT(skb, tlv, sizeof(d), &d);
438 }
439
440 return 0;
441
442rtattr_failure:
443 return -1;
444}
445
446/**************************************************************************
447 * Type specific operations table
448 **************************************************************************/
449
450struct meta_type_ops
451{
452 void (*destroy)(struct meta_value *);
453 int (*compare)(struct meta_obj *, struct meta_obj *);
454 int (*change)(struct meta_value *, struct rtattr *);
455 void (*apply_extras)(struct meta_value *, struct meta_obj *);
456 int (*dump)(struct sk_buff *, struct meta_value *, int);
457};
458
459static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
460 [TCF_META_TYPE_VAR] = {
461 .destroy = meta_var_destroy,
462 .compare = meta_var_compare,
463 .change = meta_var_change,
464 .apply_extras = meta_var_apply_extras,
465 .dump = meta_var_dump
466 },
467 [TCF_META_TYPE_INT] = {
468 .compare = meta_int_compare,
469 .change = meta_int_change,
470 .apply_extras = meta_int_apply_extras,
471 .dump = meta_int_dump
472 }
473};
474
475static inline struct meta_type_ops * meta_type_ops(struct meta_value *v)
476{
477 return &__meta_type_ops[meta_type(v)];
478}
479
480/**************************************************************************
481 * Core
482 **************************************************************************/
483
484static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
485 struct meta_value *v, struct meta_obj *dst)
486{
487 int err = 0;
488
489 if (meta_id(v) == TCF_META_ID_VALUE) {
490 dst->value = v->val;
491 dst->len = v->len;
492 return 0;
493 }
494
495 meta_ops(v)->get(skb, info, v, dst, &err);
496 if (err < 0)
497 return err;
498
499 if (meta_type_ops(v)->apply_extras)
500 meta_type_ops(v)->apply_extras(v, dst);
501
502 return 0;
503}
504
505static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
506 struct tcf_pkt_info *info)
507{
508 int r;
509 struct meta_match *meta = (struct meta_match *) m->data;
510 struct meta_obj l_value, r_value;
511
512 if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
513 meta_get(skb, info, &meta->rvalue, &r_value) < 0)
514 return 0;
515
516 r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
517
518 switch (meta->lvalue.hdr.op) {
519 case TCF_EM_OPND_EQ:
520 return !r;
521 case TCF_EM_OPND_LT:
522 return r < 0;
523 case TCF_EM_OPND_GT:
524 return r > 0;
525 }
526
527 return 0;
528}
529
530static inline void meta_delete(struct meta_match *meta)
531{
532 struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
533
534 if (ops && ops->destroy) {
535 ops->destroy(&meta->lvalue);
536 ops->destroy(&meta->rvalue);
537 }
538
539 kfree(meta);
540}
541
542static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta)
543{
544 if (rta) {
545 if (RTA_PAYLOAD(rta) == 0)
546 return -EINVAL;
547
548 return meta_type_ops(dst)->change(dst, rta);
549 }
550
551 return 0;
552}
553
554static inline int meta_is_supported(struct meta_value *val)
555{
556 return (!meta_id(val) || meta_ops(val)->get);
557}
558
559static int em_meta_change(struct tcf_proto *tp, void *data, int len,
560 struct tcf_ematch *m)
561{
562 int err = -EINVAL;
563 struct rtattr *tb[TCA_EM_META_MAX];
564 struct tcf_meta_hdr *hdr;
565 struct meta_match *meta = NULL;
566
567 if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0)
568 goto errout;
569
570 if (tb[TCA_EM_META_HDR-1] == NULL ||
571 RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr))
572 goto errout;
573 hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]);
574
575 if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
576 TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
577 TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
578 TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
579 goto errout;
580
581 meta = kmalloc(sizeof(*meta), GFP_KERNEL);
582 if (meta == NULL)
583 goto errout;
584 memset(meta, 0, sizeof(*meta));
585
586 memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
587 memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
588
589 if (!meta_is_supported(&meta->lvalue) ||
590 !meta_is_supported(&meta->rvalue)) {
591 err = -EOPNOTSUPP;
592 goto errout;
593 }
594
595 if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 ||
596 meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0)
597 goto errout;
598
599 m->datalen = sizeof(*meta);
600 m->data = (unsigned long) meta;
601
602 err = 0;
603errout:
604 if (err && meta)
605 meta_delete(meta);
606 return err;
607}
608
609static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
610{
611 if (m)
612 meta_delete((struct meta_match *) m->data);
613}
614
615static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
616{
617 struct meta_match *meta = (struct meta_match *) em->data;
618 struct tcf_meta_hdr hdr;
619 struct meta_type_ops *ops;
620
621 memset(&hdr, 0, sizeof(hdr));
622 memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
623 memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
624
625 RTA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr);
626
627 ops = meta_type_ops(&meta->lvalue);
628 if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
629 ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0)
630 goto rtattr_failure;
631
632 return 0;
633
634rtattr_failure:
635 return -1;
636}
637
638static struct tcf_ematch_ops em_meta_ops = {
639 .kind = TCF_EM_META,
640 .change = em_meta_change,
641 .match = em_meta_match,
642 .destroy = em_meta_destroy,
643 .dump = em_meta_dump,
644 .owner = THIS_MODULE,
645 .link = LIST_HEAD_INIT(em_meta_ops.link)
646};
647
648static int __init init_em_meta(void)
649{
650 return tcf_em_register(&em_meta_ops);
651}
652
653static void __exit exit_em_meta(void)
654{
655 tcf_em_unregister(&em_meta_ops);
656}
657
658MODULE_LICENSE("GPL");
659
660module_init(init_em_meta);
661module_exit(exit_em_meta);
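
Stripped of the collector and type tables, the flow in em_meta_match() above is: collect the left object at match time, take the right object from the configured constant, compare the two with the type's compare(), and let the operand decide. A hedged sketch in plain C, using hypothetical ex_* names and an integer "packet length" meta as the left value:

#include <stdio.h>

enum ex_opnd { EX_EQ, EX_LT, EX_GT };

struct ex_obj { unsigned long value; };

/* Stand-in for a META_COLLECTOR(): fills the left object from runtime
 * state (here just a plain packet length passed in). */
static void ex_collect_pktlen(unsigned long pktlen, struct ex_obj *dst)
{
	dst->value = pktlen;
}

/* Stand-in for meta_int_compare(): <0, 0 or >0 like memcmp. */
static int ex_int_compare(const struct ex_obj *a, const struct ex_obj *b)
{
	if (a->value == b->value)
		return 0;
	return a->value < b->value ? -1 : 1;
}

/* The shape of em_meta_match(): build both objects, compare them with the
 * type specific compare, then let the operand decide. */
static int ex_meta_match(unsigned long pktlen, unsigned long threshold,
			 enum ex_opnd opnd)
{
	struct ex_obj l, r = { .value = threshold };
	int c;

	ex_collect_pktlen(pktlen, &l);
	c = ex_int_compare(&l, &r);

	switch (opnd) {
	case EX_EQ: return c == 0;
	case EX_LT: return c < 0;
	case EX_GT: return c > 0;
	}
	return 0;
}

int main(void)
{
	/* Roughly "meta(pkt_len gt 1000)" in tc filter terms. */
	printf("%d %d\n", ex_meta_match(1500, 1000, EX_GT),
	       ex_meta_match(512, 1000, EX_GT));
	return 0;
}
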
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
new file mode 100644
index 000000000000..71ea926a9f09
--- /dev/null
+++ b/net/sched/em_nbyte.c
@@ -0,0 +1,82 @@
1/*
2 * net/sched/em_nbyte.c N-Byte ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/skbuff.h>
19#include <linux/tc_ematch/tc_em_nbyte.h>
20#include <net/pkt_cls.h>
21
22struct nbyte_data
23{
24 struct tcf_em_nbyte hdr;
25 char pattern[0];
26};
27
28static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len,
29 struct tcf_ematch *em)
30{
31 struct tcf_em_nbyte *nbyte = data;
32
33 if (data_len < sizeof(*nbyte) ||
34 data_len < (sizeof(*nbyte) + nbyte->len))
35 return -EINVAL;
36
37 em->datalen = sizeof(*nbyte) + nbyte->len;
38 em->data = (unsigned long) kmalloc(em->datalen, GFP_KERNEL);
39 if (em->data == 0UL)
40 return -ENOBUFS;
41
42 memcpy((void *) em->data, data, em->datalen);
43
44 return 0;
45}
46
47static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
48 struct tcf_pkt_info *info)
49{
50 struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
51 unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
52
53 ptr += nbyte->hdr.off;
54
55 if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
56 return 0;
57
58	return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
59}
60
61static struct tcf_ematch_ops em_nbyte_ops = {
62 .kind = TCF_EM_NBYTE,
63 .change = em_nbyte_change,
64 .match = em_nbyte_match,
65 .owner = THIS_MODULE,
66 .link = LIST_HEAD_INIT(em_nbyte_ops.link)
67};
68
69static int __init init_em_nbyte(void)
70{
71 return tcf_em_register(&em_nbyte_ops);
72}
73
74static void __exit exit_em_nbyte(void)
75{
76 tcf_em_unregister(&em_nbyte_ops);
77}
78
79MODULE_LICENSE("GPL");
80
81module_init(init_em_nbyte);
82module_exit(exit_em_nbyte);
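
The nbyte match above is, at its core, a bounds-checked memcmp() of a configured pattern against the packet at a fixed offset. A minimal userspace sketch with illustrative names only:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bounds check first (the kernel uses tcf_valid_offset() for this), then a
 * plain memcmp of the configured pattern against the packet bytes. */
static int ex_nbyte_match(const uint8_t *pkt, size_t pkt_len, size_t off,
			  const uint8_t *pattern, size_t len)
{
	if (off + len > pkt_len)
		return 0;
	return !memcmp(pkt + off, pattern, len);
}

int main(void)
{
	uint8_t payload[] = "GET /index.html";
	uint8_t pattern[] = { 'G', 'E', 'T', ' ' };

	printf("%d\n", ex_nbyte_match(payload, sizeof(payload) - 1, 0,
				      pattern, sizeof(pattern)));
	return 0;
}
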
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
new file mode 100644
index 000000000000..34e7e51e601e
--- /dev/null
+++ b/net/sched/em_u32.c
@@ -0,0 +1,63 @@
1/*
2 * net/sched/em_u32.c U32 Ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * Based on net/sched/cls_u32.c
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/skbuff.h>
20#include <net/pkt_cls.h>
21
22static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
23 struct tcf_pkt_info *info)
24{
25 struct tc_u32_key *key = (struct tc_u32_key *) em->data;
26 unsigned char *ptr = skb->nh.raw;
27
28 if (info) {
29 if (info->ptr)
30 ptr = info->ptr;
31 ptr += (info->nexthdr & key->offmask);
32 }
33
34 ptr += key->off;
35
36 if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
37 return 0;
38
39 return !(((*(u32*) ptr) ^ key->val) & key->mask);
40}
41
42static struct tcf_ematch_ops em_u32_ops = {
43 .kind = TCF_EM_U32,
44 .datalen = sizeof(struct tc_u32_key),
45 .match = em_u32_match,
46 .owner = THIS_MODULE,
47 .link = LIST_HEAD_INIT(em_u32_ops.link)
48};
49
50static int __init init_em_u32(void)
51{
52 return tcf_em_register(&em_u32_ops);
53}
54
55static void __exit exit_em_u32(void)
56{
57 tcf_em_unregister(&em_u32_ops);
58}
59
60MODULE_LICENSE("GPL");
61
62module_init(init_em_u32);
63module_exit(exit_em_u32);
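
The three ematch modules above all follow the same registration pattern, which net/sched/ematch.c below documents in its "How to write an ematch in 60 seconds" comment. As a hedged illustration only, a new ematch that matches packets of at least a configured length could look roughly like this; EX_EM_MINLEN and struct ex_minlen are hypothetical (a real ematch would use an allocated TCF_EM_* kind and a header under linux/tc_ematch/):

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>

#define EX_EM_MINLEN	0x7f00	/* hypothetical, unallocated ematch kind */

struct ex_minlen {
	u32 min_len;
};

/* Match packets whose total length is at least the configured minimum. */
static int em_minlen_match(struct sk_buff *skb, struct tcf_ematch *em,
			   struct tcf_pkt_info *info)
{
	struct ex_minlen *p = (struct ex_minlen *) em->data;

	return skb->len >= p->min_len;
}

static struct tcf_ematch_ops em_minlen_ops = {
	.kind		= EX_EM_MINLEN,
	.datalen	= sizeof(struct ex_minlen),
	.match		= em_minlen_match,
	.owner		= THIS_MODULE,
	.link		= LIST_HEAD_INIT(em_minlen_ops.link)
};

static int __init init_em_minlen(void)
{
	return tcf_em_register(&em_minlen_ops);
}

static void __exit exit_em_minlen(void)
{
	tcf_em_unregister(&em_minlen_ops);
}

MODULE_LICENSE("GPL");

module_init(init_em_minlen);
module_exit(exit_em_minlen);
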
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
new file mode 100644
index 000000000000..ebfe2e7d21bd
--- /dev/null
+++ b/net/sched/ematch.c
@@ -0,0 +1,524 @@
1/*
2 * net/sched/ematch.c Extended Match API
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * ==========================================================================
12 *
13 * An extended match (ematch) is a small classification tool not worth
14 * writing a full classifier for. Ematches can be interconnected to form
15 * a logic expression and get attached to classifiers to extend their
16 * functionality.
17 *
18 * The userspace part transforms the logic expressions into an array
19 * consisting of multiple sequences of interconnected ematches separated
20 * by markers. Precedence is implemented by a special ematch kind
21 * referencing a sequence beyond the marker of the current sequence
22 * causing the current position in the sequence to be pushed onto a stack
23 * to allow the current position to be overwritten by the position referenced
24 * in the special ematch. Matching continues in the new sequence until a
25 * marker is reached causing the position to be restored from the stack.
26 *
27 * Example:
28 * A AND (B1 OR B2) AND C AND D
29 *
30 * ------->-PUSH-------
31 * -->-- / -->-- \ -->--
32 * / \ / / \ \ / \
33 * +-------+-------+-------+-------+-------+--------+
34 * | A AND | B AND | C AND | D END | B1 OR | B2 END |
35 * +-------+-------+-------+-------+-------+--------+
36 * \ /
37 * --------<-POP---------
38 *
39 * where B is a virtual ematch referencing the sequence starting with B1.
40 *
41 * ==========================================================================
42 *
43 * How to write an ematch in 60 seconds
44 * ------------------------------------
45 *
46 * 1) Provide a matcher function:
47 * static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
48 * struct tcf_pkt_info *info)
49 * {
50 * struct mydata *d = (struct mydata *) m->data;
51 *
52 * if (...matching goes here...)
53 * return 1;
54 * else
55 * return 0;
56 * }
57 *
58 * 2) Fill out a struct tcf_ematch_ops:
59 * static struct tcf_ematch_ops my_ops = {
60 * .kind = unique id,
61 * .datalen = sizeof(struct mydata),
62 * .match = my_match,
63 * .owner = THIS_MODULE,
64 * };
65 *
66 * 3) Register/Unregister your ematch:
67 * static int __init init_my_ematch(void)
68 * {
69 * return tcf_em_register(&my_ops);
70 * }
71 *
72 * static void __exit exit_my_ematch(void)
73 * {
74 * return tcf_em_unregister(&my_ops);
75 * }
76 *
77 * module_init(init_my_ematch);
78 * module_exit(exit_my_ematch);
79 *
80 * 4) By now you should have two more seconds left, barely enough to
81 * open up a beer to watch the compilation going.
82 */
83
84#include <linux/config.h>
85#include <linux/module.h>
86#include <linux/types.h>
87#include <linux/kernel.h>
88#include <linux/sched.h>
89#include <linux/mm.h>
90#include <linux/errno.h>
91#include <linux/interrupt.h>
92#include <linux/rtnetlink.h>
93#include <linux/skbuff.h>
94#include <net/pkt_cls.h>
95#include <config/net/ematch/stack.h>
96
97static LIST_HEAD(ematch_ops);
98static DEFINE_RWLOCK(ematch_mod_lock);
99
100static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind)
101{
102 struct tcf_ematch_ops *e = NULL;
103
104 read_lock(&ematch_mod_lock);
105 list_for_each_entry(e, &ematch_ops, link) {
106 if (kind == e->kind) {
107 if (!try_module_get(e->owner))
108 e = NULL;
109 read_unlock(&ematch_mod_lock);
110 return e;
111 }
112 }
113 read_unlock(&ematch_mod_lock);
114
115 return NULL;
116}
117
118/**
119 * tcf_em_register - register an extended match
120 *
121 * @ops: ematch operations lookup table
122 *
123 * This function must be called by ematches to announce their presence.
124 * The given @ops must have kind set to a unique identifier and the
125 * callback match() must be implemented. All other callbacks are optional
126 * and a fallback implementation is used instead.
127 *
128 * Returns -EEXIST if an ematch of the same kind is already registered.
129 */
130int tcf_em_register(struct tcf_ematch_ops *ops)
131{
132 int err = -EEXIST;
133 struct tcf_ematch_ops *e;
134
135 if (ops->match == NULL)
136 return -EINVAL;
137
138 write_lock(&ematch_mod_lock);
139 list_for_each_entry(e, &ematch_ops, link)
140 if (ops->kind == e->kind)
141 goto errout;
142
143 list_add_tail(&ops->link, &ematch_ops);
144 err = 0;
145errout:
146 write_unlock(&ematch_mod_lock);
147 return err;
148}
149
150/**
151 * tcf_em_unregister - unregister an extended match
152 *
153 * @ops: ematch operations lookup table
154 *
155 * This function must be called by ematches to announce their disappearance
156 * for example when the module gets unloaded. The @ops parameter must be
157 * the same as the one used for registration.
158 *
159 * Returns -ENOENT if no matching ematch was found.
160 */
161int tcf_em_unregister(struct tcf_ematch_ops *ops)
162{
163 int err = 0;
164 struct tcf_ematch_ops *e;
165
166 write_lock(&ematch_mod_lock);
167 list_for_each_entry(e, &ematch_ops, link) {
168 if (e == ops) {
169 list_del(&e->link);
170 goto out;
171 }
172 }
173
174 err = -ENOENT;
175out:
176 write_unlock(&ematch_mod_lock);
177 return err;
178}
179
180static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree,
181 int index)
182{
183 return &tree->matches[index];
184}
185
186
187static int tcf_em_validate(struct tcf_proto *tp,
188 struct tcf_ematch_tree_hdr *tree_hdr,
189 struct tcf_ematch *em, struct rtattr *rta, int idx)
190{
191 int err = -EINVAL;
192 struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta);
193 int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr);
194 void *data = (void *) em_hdr + sizeof(*em_hdr);
195
196 if (!TCF_EM_REL_VALID(em_hdr->flags))
197 goto errout;
198
199 if (em_hdr->kind == TCF_EM_CONTAINER) {
200 /* Special ematch called "container", carries an index
201 * referencing an external ematch sequence. */
202 u32 ref;
203
204 if (data_len < sizeof(ref))
205 goto errout;
206 ref = *(u32 *) data;
207
208 if (ref >= tree_hdr->nmatches)
209 goto errout;
210
211 /* We do not allow backward jumps to avoid loops; jumps
212 * to our own position are of course illegal as well. */
213 if (ref <= idx)
214 goto errout;
215
216
217 em->data = ref;
218 } else {
219 /* Note: This lookup will increase the module refcnt
220 * of the ematch module referenced. In case of a failure,
221 * a destroy function is called by the underlying layer
222 * which automatically releases the reference again, therefore
223 * the module MUST not be given back under any circumstances
224 * here. Be aware, the destroy function assumes that the
225 * module is held if the ops field is non zero. */
226 em->ops = tcf_em_lookup(em_hdr->kind);
227
228 if (em->ops == NULL) {
229 err = -ENOENT;
230 goto errout;
231 }
232
233 /* ematch module provides expected length of data, so we
234 * can do a basic sanity check. */
235 if (em->ops->datalen && data_len < em->ops->datalen)
236 goto errout;
237
238 if (em->ops->change) {
239 err = em->ops->change(tp, data, data_len, em);
240 if (err < 0)
241 goto errout;
242 } else if (data_len > 0) {
243 /* ematch module doesn't provide its own change
244 * procedure and expects us to allocate and copy
245 * the ematch data.
246 *
247 * TCF_EM_SIMPLE may be specified stating that the
248 * data only consists of a u32 integer and the module
249 * does not expect a memory reference but rather
250 * the value carried. */
251 if (em_hdr->flags & TCF_EM_SIMPLE) {
252 if (data_len < sizeof(u32))
253 goto errout;
254 em->data = *(u32 *) data;
255 } else {
256 void *v = kmalloc(data_len, GFP_KERNEL);
257 if (v == NULL) {
258 err = -ENOBUFS;
259 goto errout;
260 }
261 memcpy(v, data, data_len);
262 em->data = (unsigned long) v;
263 }
264 }
265 }
266
267 em->matchid = em_hdr->matchid;
268 em->flags = em_hdr->flags;
269 em->datalen = data_len;
270
271 err = 0;
272errout:
273 return err;
274}
275
276/**
277 * tcf_em_tree_validate - validate ematch config TLV and build ematch tree
278 *
279 * @tp: classifier kind handle
280 * @rta: ematch tree configuration TLV
281 * @tree: destination ematch tree variable to store the resulting
282 * ematch tree.
283 *
284 * This function validates the given configuration TLV @rta and builds an
285 * ematch tree in @tree. The resulting tree must later be copied into
286 * the private classifier data using tcf_em_tree_change(). You MUST NOT
287 * provide the ematch tree variable of the private classifier data directly,
288 * as the changes would not be locked properly.
289 *
290 * Returns a negative error code if the configuration TLV contains errors.
291 */
292int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta,
293 struct tcf_ematch_tree *tree)
294{
295 int idx, list_len, matches_len, err = -EINVAL;
296 struct rtattr *tb[TCA_EMATCH_TREE_MAX];
297 struct rtattr *rt_match, *rt_hdr, *rt_list;
298 struct tcf_ematch_tree_hdr *tree_hdr;
299 struct tcf_ematch *em;
300
301 if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0)
302 goto errout;
303
304 rt_hdr = tb[TCA_EMATCH_TREE_HDR-1];
305 rt_list = tb[TCA_EMATCH_TREE_LIST-1];
306
307 if (rt_hdr == NULL || rt_list == NULL)
308 goto errout;
309
310 if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) ||
311 RTA_PAYLOAD(rt_list) < sizeof(*rt_match))
312 goto errout;
313
314 tree_hdr = RTA_DATA(rt_hdr);
315 memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));
316
317 rt_match = RTA_DATA(rt_list);
318 list_len = RTA_PAYLOAD(rt_list);
319 matches_len = tree_hdr->nmatches * sizeof(*em);
320
321 tree->matches = kmalloc(matches_len, GFP_KERNEL);
322 if (tree->matches == NULL)
323 goto errout;
324 memset(tree->matches, 0, matches_len);
325
326 /* We do not use rtattr_parse_nested here because the maximum
327 * number of attributes is unknown. This saves us the allocation
328 * for a tb buffer which would serve no purpose at all.
329 *
330 * The array of rt attributes is parsed in the order they are
331 * provided, and their types must increase incrementally from 1 to n.
332 * Even if this does not serve any real purpose, failing to stick
333 * to this policy will result in a parsing failure. */
334 for (idx = 0; RTA_OK(rt_match, list_len); idx++) {
335 err = -EINVAL;
336
337 if (rt_match->rta_type != (idx + 1))
338 goto errout_abort;
339
340 if (idx >= tree_hdr->nmatches)
341 goto errout_abort;
342
343 if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr))
344 goto errout_abort;
345
346 em = tcf_em_get_match(tree, idx);
347
348 err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx);
349 if (err < 0)
350 goto errout_abort;
351
352 rt_match = RTA_NEXT(rt_match, list_len);
353 }
354
355 /* Check if the number of matches provided by userspace actually
356 * complies with the array of matches. The number was used for
357 * the validation of references and a mismatch could lead to
358 * undefined references during the matching process. */
359 if (idx != tree_hdr->nmatches) {
360 err = -EINVAL;
361 goto errout_abort;
362 }
363
364 err = 0;
365errout:
366 return err;
367
368errout_abort:
369 tcf_em_tree_destroy(tp, tree);
370 return err;
371}
372
373/**
374 * tcf_em_tree_destroy - destroy an ematch tree
375 *
376 * @tp: classifier kind handle
377 * @tree: ematch tree to be deleted
378 *
379 * This function destroys an ematch tree previously created by
380 * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
381 * the ematch tree is not in use before calling this function.
382 */
383void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree)
384{
385 int i;
386
387 if (tree->matches == NULL)
388 return;
389
390 for (i = 0; i < tree->hdr.nmatches; i++) {
391 struct tcf_ematch *em = tcf_em_get_match(tree, i);
392
393 if (em->ops) {
394 if (em->ops->destroy)
395 em->ops->destroy(tp, em);
396 else if (!tcf_em_is_simple(em) && em->data)
397 kfree((void *) em->data);
398 module_put(em->ops->owner);
399 }
400 }
401
402 tree->hdr.nmatches = 0;
403 kfree(tree->matches);
404}
405
406/**
407 * tcf_em_tree_dump - dump an ematch tree into an rtnl message
408 *
409 * @skb: skb holding the rtnl message
410 * @tree: ematch tree to be dumped
411 * @tlv: TLV type to be used to encapsulate the tree
412 *
413 * This function dumps an ematch tree into an rtnl message. It is valid to
414 * call this function while the ematch tree is in use.
415 *
416 * Returns -1 if the skb tailroom is insufficient.
417 */
418int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
419{
420 int i;
421 struct rtattr * top_start = (struct rtattr*) skb->tail;
422 struct rtattr * list_start;
423
424 RTA_PUT(skb, tlv, 0, NULL);
425 RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr);
426
427 list_start = (struct rtattr *) skb->tail;
428 RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL);
429
430 for (i = 0; i < tree->hdr.nmatches; i++) {
431 struct rtattr *match_start = (struct rtattr*) skb->tail;
432 struct tcf_ematch *em = tcf_em_get_match(tree, i);
433 struct tcf_ematch_hdr em_hdr = {
434 .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
435 .matchid = em->matchid,
436 .flags = em->flags
437 };
438
439 RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr);
440
441 if (em->ops && em->ops->dump) {
442 if (em->ops->dump(skb, em) < 0)
443 goto rtattr_failure;
444 } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
445 u32 u = em->data;
446 RTA_PUT_NOHDR(skb, sizeof(u), &u);
447 } else if (em->datalen > 0)
448 RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data);
449
450 match_start->rta_len = skb->tail - (u8*) match_start;
451 }
452
453 list_start->rta_len = skb->tail - (u8 *) list_start;
454 top_start->rta_len = skb->tail - (u8 *) top_start;
455
456 return 0;
457
458rtattr_failure:
459 return -1;
460}
461
462static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
463 struct tcf_pkt_info *info)
464{
465 int r = em->ops->match(skb, em, info);
466 return tcf_em_is_inverted(em) ? !r : r;
467}
468
469/* Do not use this function directly, use tcf_em_tree_match instead */
470int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
471 struct tcf_pkt_info *info)
472{
473 int stackp = 0, match_idx = 0, res = 0;
474 struct tcf_ematch *cur_match;
475 int stack[CONFIG_NET_EMATCH_STACK];
476
477proceed:
478 while (match_idx < tree->hdr.nmatches) {
479 cur_match = tcf_em_get_match(tree, match_idx);
480
481 if (tcf_em_is_container(cur_match)) {
482 if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
483 goto stack_overflow;
484
485 stack[stackp++] = match_idx;
486 match_idx = cur_match->data;
487 goto proceed;
488 }
489
490 res = tcf_em_match(skb, cur_match, info);
491
492 if (tcf_em_early_end(cur_match, res))
493 break;
494
495 match_idx++;
496 }
497
498pop_stack:
499 if (stackp > 0) {
500 match_idx = stack[--stackp];
501 cur_match = tcf_em_get_match(tree, match_idx);
502
503 if (tcf_em_early_end(cur_match, res))
504 goto pop_stack;
505 else {
506 match_idx++;
507 goto proceed;
508 }
509 }
510
511 return res;
512
513stack_overflow:
514 if (net_ratelimit())
515 printk("Local stack overflow, increase NET_EMATCH_STACK\n");
516 return -1;
517}
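/*
 * A minimal usage sketch, assuming a hypothetical classifier whose
 * private struct foo_filter embeds a struct tcf_ematch_tree (the names
 * foo_* and struct foo_filter are illustrative only):
 *
 *	static int foo_set_ematches(struct tcf_proto *tp,
 *				    struct foo_filter *f, struct rtattr *rta)
 *	{
 *		struct tcf_ematch_tree t;
 *		int err = tcf_em_tree_validate(tp, rta, &t);
 *
 *		if (err < 0)
 *			return err;
 *		tcf_em_tree_change(tp, &f->ematches, &t);
 *		return 0;
 *	}
 *
 *	static int foo_classify(struct sk_buff *skb, struct foo_filter *f,
 *				struct tcf_pkt_info *info)
 *	{
 *		return tcf_em_tree_match(skb, &f->ematches, info);
 *	}
 *
 * tcf_em_tree_change() and tcf_em_tree_match() are the wrappers referred
 * to in the comments above; the latter ends up in __tcf_em_tree_match()
 * for trees that actually contain matches.
 */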
518
519EXPORT_SYMBOL(tcf_em_register);
520EXPORT_SYMBOL(tcf_em_unregister);
521EXPORT_SYMBOL(tcf_em_tree_validate);
522EXPORT_SYMBOL(tcf_em_tree_destroy);
523EXPORT_SYMBOL(tcf_em_tree_dump);
524EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/estimator.c b/net/sched/estimator.c
new file mode 100644
index 000000000000..5d3ae03e22a7
--- /dev/null
+++ b/net/sched/estimator.c
@@ -0,0 +1,197 @@
1/*
2 * net/sched/estimator.c Simple rate estimator.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <asm/uaccess.h>
13#include <asm/system.h>
14#include <linux/bitops.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/jiffies.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/netdevice.h>
27#include <linux/skbuff.h>
28#include <linux/rtnetlink.h>
29#include <linux/init.h>
30#include <net/sock.h>
31#include <net/pkt_sched.h>
32
33/*
34 This code is NOT intended to be used for statistics collection,
35 its purpose is to provide a base for statistical multiplexing
36 for controlled load service.
37 If you need only statistics, run a user level daemon which
38 periodically reads byte counters.
39
40 Unfortunately, rate estimation is not a very easy task.
41 F.e. I did not find a simple way to estimate the current peak rate
42 and even failed to formulate the problem 8)8)
43
44 So I preferred not to build an estimator into the scheduler,
45 but run this task separately.
46 Ideally, it should be kernel thread(s), but for now it runs
47 from timers, which puts an apparent upper bound on the number of rated
48 flows, has minimal overhead when this number is small, but is enough
49 to handle controlled load service and sets of aggregates.
50
51 We measure rate over A=(1<<interval) seconds and evaluate EWMA:
52
53 avrate = avrate*(1-W) + rate*W
54
55 where W is chosen as negative power of 2: W = 2^(-ewma_log)
56
57 The resulting time constant is:
58
59 T = A/(-ln(1-W))
60
61
62 NOTES.
63
64 * The stored value for avbps is scaled by 2^5, so that maximal
65 rate is ~1Gbit, avpps is scaled by 2^10.
66
67 * Minimal interval is HZ/4=250msec (it is the greatest common divisor
68 for HZ=100 and HZ=1024 8)), maximal interval
69 is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
70 are too expensive, longer ones can be implemented
71 at user level painlessly.
72 */
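/*
 * A worked example of the fixed point arithmetic used below (numbers are
 * illustrative only): with idx=2 the timer fires every 2^2/4 = 1 second.
 * If 125000 bytes arrived during that period, then
 *
 *	rate   = 125000 << (7 - 2) = 4000000	bytes/sec scaled by 2^5
 *	avbps += ((long)rate - (long)avbps) >> ewma_log;
 *	bps    = (avbps + 0xF) >> 5;		back to bytes/sec
 *
 * i.e. avbps moves towards the measured rate with weight W = 2^(-ewma_log)
 * every period, which is exactly the EWMA described above.
 */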
73
74#define EST_MAX_INTERVAL 5
75
76struct qdisc_estimator
77{
78 struct qdisc_estimator *next;
79 struct tc_stats *stats;
80 spinlock_t *stats_lock;
81 unsigned interval;
82 int ewma_log;
83 u64 last_bytes;
84 u32 last_packets;
85 u32 avpps;
86 u32 avbps;
87};
88
89struct qdisc_estimator_head
90{
91 struct timer_list timer;
92 struct qdisc_estimator *list;
93};
94
95static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];
96
97/* Estimator array lock */
98static DEFINE_RWLOCK(est_lock);
99
100static void est_timer(unsigned long arg)
101{
102 int idx = (int)arg;
103 struct qdisc_estimator *e;
104
105 read_lock(&est_lock);
106 for (e = elist[idx].list; e; e = e->next) {
107 struct tc_stats *st = e->stats;
108 u64 nbytes;
109 u32 npackets;
110 u32 rate;
111
112 spin_lock(e->stats_lock);
113 nbytes = st->bytes;
114 npackets = st->packets;
115 rate = (nbytes - e->last_bytes)<<(7 - idx);
116 e->last_bytes = nbytes;
117 e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
118 st->bps = (e->avbps+0xF)>>5;
119
120 rate = (npackets - e->last_packets)<<(12 - idx);
121 e->last_packets = npackets;
122 e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
123 e->stats->pps = (e->avpps+0x1FF)>>10;
124 spin_unlock(e->stats_lock);
125 }
126
127 mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
128 read_unlock(&est_lock);
129}
130
131int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt)
132{
133 struct qdisc_estimator *est;
134 struct tc_estimator *parm = RTA_DATA(opt);
135
136 if (RTA_PAYLOAD(opt) < sizeof(*parm))
137 return -EINVAL;
138
139 if (parm->interval < -2 || parm->interval > 3)
140 return -EINVAL;
141
142 est = kmalloc(sizeof(*est), GFP_KERNEL);
143 if (est == NULL)
144 return -ENOBUFS;
145
146 memset(est, 0, sizeof(*est));
147 est->interval = parm->interval + 2;
148 est->stats = stats;
149 est->stats_lock = stats_lock;
150 est->ewma_log = parm->ewma_log;
151 est->last_bytes = stats->bytes;
152 est->avbps = stats->bps<<5;
153 est->last_packets = stats->packets;
154 est->avpps = stats->pps<<10;
155
156 est->next = elist[est->interval].list;
157 if (est->next == NULL) {
158 init_timer(&elist[est->interval].timer);
159 elist[est->interval].timer.data = est->interval;
160 elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
161 elist[est->interval].timer.function = est_timer;
162 add_timer(&elist[est->interval].timer);
163 }
164 write_lock_bh(&est_lock);
165 elist[est->interval].list = est;
166 write_unlock_bh(&est_lock);
167 return 0;
168}
169
170void qdisc_kill_estimator(struct tc_stats *stats)
171{
172 int idx;
173 struct qdisc_estimator *est, **pest;
174
175 for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
176 int killed = 0;
177 pest = &elist[idx].list;
178 while ((est=*pest) != NULL) {
179 if (est->stats != stats) {
180 pest = &est->next;
181 continue;
182 }
183
184 write_lock_bh(&est_lock);
185 *pest = est->next;
186 write_unlock_bh(&est_lock);
187
188 kfree(est);
189 killed++;
190 }
191 if (killed && elist[idx].list == NULL)
192 del_timer(&elist[idx].timer);
193 }
194}
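/*
 * A minimal usage sketch (my_stats, my_lock and my_rate_attr are
 * hypothetical): a scheduler that wants its tc_stats rated attaches an
 * estimator with
 *
 *	err = qdisc_new_estimator(&my_stats, &my_lock, my_rate_attr);
 *
 * where my_rate_attr is an rtattr carrying a struct tc_estimator, and
 * detaches it again with qdisc_kill_estimator(&my_stats) before the
 * stats structure goes away.
 */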
195
196EXPORT_SYMBOL(qdisc_kill_estimator);
197EXPORT_SYMBOL(qdisc_new_estimator);
diff --git a/net/sched/gact.c b/net/sched/gact.c
new file mode 100644
index 000000000000..a811c89fef7f
--- /dev/null
+++ b/net/sched/gact.c
@@ -0,0 +1,231 @@
1/*
2 * net/sched/gact.c Generic actions
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * copyright Jamal Hadi Salim (2002-4)
10 *
11 */
12
13#include <asm/uaccess.h>
14#include <asm/system.h>
15#include <linux/bitops.h>
16#include <linux/config.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/netdevice.h>
28#include <linux/skbuff.h>
29#include <linux/rtnetlink.h>
30#include <linux/module.h>
31#include <linux/init.h>
32#include <linux/proc_fs.h>
33#include <net/sock.h>
34#include <net/pkt_sched.h>
35#include <linux/tc_act/tc_gact.h>
36#include <net/tc_act/tc_gact.h>
37
38/* use generic hash table */
39#define MY_TAB_SIZE 16
40#define MY_TAB_MASK 15
41
42static u32 idx_gen;
43static struct tcf_gact *tcf_gact_ht[MY_TAB_SIZE];
44static DEFINE_RWLOCK(gact_lock);
45
46/* override the defaults */
47#define tcf_st tcf_gact
48#define tc_st tc_gact
49#define tcf_t_lock gact_lock
50#define tcf_ht tcf_gact_ht
51
52#define CONFIG_NET_ACT_INIT 1
53#include <net/pkt_act.h>
54
55#ifdef CONFIG_GACT_PROB
56static int gact_net_rand(struct tcf_gact *p)
57{
58 if (net_random()%p->pval)
59 return p->action;
60 return p->paction;
61}
62
63static int gact_determ(struct tcf_gact *p)
64{
65 if (p->bstats.packets%p->pval)
66 return p->action;
67 return p->paction;
68}
69
70typedef int (*g_rand)(struct tcf_gact *p);
71static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ };
72#endif
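/*
 * In other words: with a probability type configured, p->action remains
 * the common case.  gact_net_rand() returns p->paction on average once
 * every p->pval packets (whenever net_random() % pval == 0), while
 * gact_determ() returns it whenever the packet counter is a multiple of
 * p->pval.  E.g. pval = 10 sends roughly 90% of the packets to p->action.
 */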
73
74static int tcf_gact_init(struct rtattr *rta, struct rtattr *est,
75 struct tc_action *a, int ovr, int bind)
76{
77 struct rtattr *tb[TCA_GACT_MAX];
78 struct tc_gact *parm;
79 struct tcf_gact *p;
80 int ret = 0;
81
82 if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0)
83 return -EINVAL;
84
85 if (tb[TCA_GACT_PARMS - 1] == NULL ||
86 RTA_PAYLOAD(tb[TCA_GACT_PARMS - 1]) < sizeof(*parm))
87 return -EINVAL;
88 parm = RTA_DATA(tb[TCA_GACT_PARMS - 1]);
89
90 if (tb[TCA_GACT_PROB-1] != NULL)
91#ifdef CONFIG_GACT_PROB
92 if (RTA_PAYLOAD(tb[TCA_GACT_PROB-1]) < sizeof(struct tc_gact_p))
93 return -EINVAL;
94#else
95 return -EOPNOTSUPP;
96#endif
97
98 p = tcf_hash_check(parm->index, a, ovr, bind);
99 if (p == NULL) {
100 p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
101 if (p == NULL)
102 return -ENOMEM;
103 ret = ACT_P_CREATED;
104 } else {
105 if (!ovr) {
106 tcf_hash_release(p, bind);
107 return -EEXIST;
108 }
109 }
110
111 spin_lock_bh(&p->lock);
112 p->action = parm->action;
113#ifdef CONFIG_GACT_PROB
114 if (tb[TCA_GACT_PROB-1] != NULL) {
115 struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]);
116 p->paction = p_parm->paction;
117 p->pval = p_parm->pval;
118 p->ptype = p_parm->ptype;
119 }
120#endif
121 spin_unlock_bh(&p->lock);
122 if (ret == ACT_P_CREATED)
123 tcf_hash_insert(p);
124 return ret;
125}
126
127static int
128tcf_gact_cleanup(struct tc_action *a, int bind)
129{
130 struct tcf_gact *p = PRIV(a, gact);
131
132 if (p != NULL)
133 return tcf_hash_release(p, bind);
134 return 0;
135}
136
137static int
138tcf_gact(struct sk_buff **pskb, struct tc_action *a)
139{
140 struct tcf_gact *p = PRIV(a, gact);
141 struct sk_buff *skb = *pskb;
142 int action = TC_ACT_SHOT;
143
144 spin_lock(&p->lock);
145#ifdef CONFIG_GACT_PROB
146 if (p->ptype && gact_rand[p->ptype] != NULL)
147 action = gact_rand[p->ptype](p);
148 else
149 action = p->action;
150#else
151 action = p->action;
152#endif
153 p->bstats.bytes += skb->len;
154 p->bstats.packets++;
155 if (action == TC_ACT_SHOT)
156 p->qstats.drops++;
157 p->tm.lastuse = jiffies;
158 spin_unlock(&p->lock);
159
160 return action;
161}
162
163static int
164tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
165{
166 unsigned char *b = skb->tail;
167 struct tc_gact opt;
168 struct tcf_gact *p = PRIV(a, gact);
169 struct tcf_t t;
170
171 opt.index = p->index;
172 opt.refcnt = p->refcnt - ref;
173 opt.bindcnt = p->bindcnt - bind;
174 opt.action = p->action;
175 RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
176#ifdef CONFIG_GACT_PROB
177 if (p->ptype) {
178 struct tc_gact_p p_opt;
179 p_opt.paction = p->paction;
180 p_opt.pval = p->pval;
181 p_opt.ptype = p->ptype;
182 RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
183 }
184#endif
185 t.install = jiffies_to_clock_t(jiffies - p->tm.install);
186 t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
187 t.expires = jiffies_to_clock_t(p->tm.expires);
188 RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t);
189 return skb->len;
190
191 rtattr_failure:
192 skb_trim(skb, b - skb->data);
193 return -1;
194}
195
196static struct tc_action_ops act_gact_ops = {
197 .kind = "gact",
198 .type = TCA_ACT_GACT,
199 .capab = TCA_CAP_NONE,
200 .owner = THIS_MODULE,
201 .act = tcf_gact,
202 .dump = tcf_gact_dump,
203 .cleanup = tcf_gact_cleanup,
204 .lookup = tcf_hash_search,
205 .init = tcf_gact_init,
206 .walk = tcf_generic_walker
207};
208
209MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
210MODULE_DESCRIPTION("Generic Classifier actions");
211MODULE_LICENSE("GPL");
212
213static int __init
214gact_init_module(void)
215{
216#ifdef CONFIG_GACT_PROB
217 printk("GACT probability on\n");
218#else
219 printk("GACT probability NOT on\n");
220#endif
221 return tcf_register_action(&act_gact_ops);
222}
223
224static void __exit
225gact_cleanup_module(void)
226{
227 tcf_unregister_action(&act_gact_ops);
228}
229
230module_init(gact_init_module);
231module_exit(gact_cleanup_module);
diff --git a/net/sched/ipt.c b/net/sched/ipt.c
new file mode 100644
index 000000000000..b114d994d523
--- /dev/null
+++ b/net/sched/ipt.c
@@ -0,0 +1,326 @@
1/*
2 * net/sched/ipt.c iptables target interface
3 *
4 * TODO: Add other tables. For now we only support the ipv4 table targets
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Copyright: Jamal Hadi Salim (2002-4)
12 */
13
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <asm/bitops.h>
17#include <linux/config.h>
18#include <linux/types.h>
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
30#include <linux/rtnetlink.h>
31#include <linux/module.h>
32#include <linux/init.h>
33#include <linux/proc_fs.h>
34#include <linux/kmod.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37#include <linux/tc_act/tc_ipt.h>
38#include <net/tc_act/tc_ipt.h>
39
40#include <linux/netfilter_ipv4/ip_tables.h>
41
42/* use generic hash table */
43#define MY_TAB_SIZE 16
44#define MY_TAB_MASK 15
45
46static u32 idx_gen;
47static struct tcf_ipt *tcf_ipt_ht[MY_TAB_SIZE];
48/* ipt hash table lock */
49static DEFINE_RWLOCK(ipt_lock);
50
51/* override the defaults */
52#define tcf_st tcf_ipt
53#define tcf_t_lock ipt_lock
54#define tcf_ht tcf_ipt_ht
55
56#define CONFIG_NET_ACT_INIT
57#include <net/pkt_act.h>
58
59static int
60ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
61{
62 struct ipt_target *target;
63 int ret = 0;
64
65 target = ipt_find_target(t->u.user.name, t->u.user.revision);
66 if (!target)
67 return -ENOENT;
68
69 DPRINTK("ipt_init_target: found %s\n", target->name);
70 t->u.kernel.target = target;
71
72 if (t->u.kernel.target->checkentry
73 && !t->u.kernel.target->checkentry(table, NULL, t->data,
74 t->u.target_size - sizeof(*t),
75 hook)) {
76 DPRINTK("ipt_init_target: check failed for `%s'.\n",
77 t->u.kernel.target->name);
78 module_put(t->u.kernel.target->me);
79 ret = -EINVAL;
80 }
81
82 return ret;
83}
84
85static void
86ipt_destroy_target(struct ipt_entry_target *t)
87{
88 if (t->u.kernel.target->destroy)
89 t->u.kernel.target->destroy(t->data,
90 t->u.target_size - sizeof(*t));
91 module_put(t->u.kernel.target->me);
92}
93
94static int
95tcf_ipt_release(struct tcf_ipt *p, int bind)
96{
97 int ret = 0;
98 if (p) {
99 if (bind)
100 p->bindcnt--;
101 p->refcnt--;
102 if (p->bindcnt <= 0 && p->refcnt <= 0) {
103 ipt_destroy_target(p->t);
104 kfree(p->tname);
105 kfree(p->t);
106 tcf_hash_destroy(p);
107 ret = ACT_P_DELETED;
108 }
109 }
110 return ret;
111}
112
113static int
114tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
115 int ovr, int bind)
116{
117 struct rtattr *tb[TCA_IPT_MAX];
118 struct tcf_ipt *p;
119 struct ipt_entry_target *td, *t;
120 char *tname;
121 int ret = 0, err;
122 u32 hook = 0;
123 u32 index = 0;
124
125 if (rta == NULL || rtattr_parse_nested(tb, TCA_IPT_MAX, rta) < 0)
126 return -EINVAL;
127
128 if (tb[TCA_IPT_HOOK-1] == NULL ||
129 RTA_PAYLOAD(tb[TCA_IPT_HOOK-1]) < sizeof(u32))
130 return -EINVAL;
131 if (tb[TCA_IPT_TARG-1] == NULL ||
132 RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < sizeof(*t))
133 return -EINVAL;
134 td = (struct ipt_entry_target *)RTA_DATA(tb[TCA_IPT_TARG-1]);
135 if (RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < td->u.target_size)
136 return -EINVAL;
137
138 if (tb[TCA_IPT_INDEX-1] != NULL &&
139 RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32))
140 index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]);
141
142 p = tcf_hash_check(index, a, ovr, bind);
143 if (p == NULL) {
144 p = tcf_hash_create(index, est, a, sizeof(*p), ovr, bind);
145 if (p == NULL)
146 return -ENOMEM;
147 ret = ACT_P_CREATED;
148 } else {
149 if (!ovr) {
150 tcf_ipt_release(p, bind);
151 return -EEXIST;
152 }
153 }
154
155 hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]);
156
157 err = -ENOMEM;
158 tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
159 if (tname == NULL)
160 goto err1;
161 if (tb[TCA_IPT_TABLE - 1] == NULL ||
162 rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ)
163 strcpy(tname, "mangle");
164
165 t = kmalloc(td->u.target_size, GFP_KERNEL);
166 if (t == NULL)
167 goto err2;
168 memcpy(t, td, td->u.target_size);
169
170 if ((err = ipt_init_target(t, tname, hook)) < 0)
171 goto err3;
172
173 spin_lock_bh(&p->lock);
174 if (ret != ACT_P_CREATED) {
175 ipt_destroy_target(p->t);
176 kfree(p->tname);
177 kfree(p->t);
178 }
179 p->tname = tname;
180 p->t = t;
181 p->hook = hook;
182 spin_unlock_bh(&p->lock);
183 if (ret == ACT_P_CREATED)
184 tcf_hash_insert(p);
185 return ret;
186
187err3:
188 kfree(t);
189err2:
190 kfree(tname);
191err1:
192 kfree(p);
193 return err;
194}
195
196static int
197tcf_ipt_cleanup(struct tc_action *a, int bind)
198{
199 struct tcf_ipt *p = PRIV(a, ipt);
200 return tcf_ipt_release(p, bind);
201}
202
203static int
204tcf_ipt(struct sk_buff **pskb, struct tc_action *a)
205{
206 int ret = 0, result = 0;
207 struct tcf_ipt *p = PRIV(a, ipt);
208 struct sk_buff *skb = *pskb;
209
210 if (skb_cloned(skb)) {
211 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
212 return TC_ACT_UNSPEC;
213 }
214
215 spin_lock(&p->lock);
216
217 p->tm.lastuse = jiffies;
218 p->bstats.bytes += skb->len;
219 p->bstats.packets++;
220
221 /* yes, we have to worry about both in and out dev
222 worry later - danger - this API seems to have changed
223 from earlier kernels */
224
225 ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL,
226 p->hook, p->t->data, NULL);
227 switch (ret) {
228 case NF_ACCEPT:
229 result = TC_ACT_OK;
230 break;
231 case NF_DROP:
232 result = TC_ACT_SHOT;
233 p->qstats.drops++;
234 break;
235 case IPT_CONTINUE:
236 result = TC_ACT_PIPE;
237 break;
238 default:
239 if (net_ratelimit())
240 printk("Bogus netfilter code %d assume ACCEPT\n", ret);
241 result = TC_POLICE_OK;
242 break;
243 }
244 spin_unlock(&p->lock);
245 return result;
246
247}
248
249static int
250tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
251{
252 struct ipt_entry_target *t;
253 struct tcf_t tm;
254 struct tc_cnt c;
255 unsigned char *b = skb->tail;
256 struct tcf_ipt *p = PRIV(a, ipt);
257
258 /* for simple targets kernel size == user size
259 ** user name = target name
260 ** to be foolproof you must not assume this
261 */
262
263 t = kmalloc(p->t->u.user.target_size, GFP_ATOMIC);
264 if (t == NULL)
265 goto rtattr_failure;
266
267 c.bindcnt = p->bindcnt - bind;
268 c.refcnt = p->refcnt - ref;
269 memcpy(t, p->t, p->t->u.user.target_size);
270 strcpy(t->u.user.name, p->t->u.kernel.target->name);
271
272 DPRINTK("\ttcf_ipt_dump tablename %s length %d\n", p->tname,
273 strlen(p->tname));
274 DPRINTK("\tdump target name %s size %d size user %d "
275 "data[0] %x data[1] %x\n", p->t->u.kernel.target->name,
276 p->t->u.target_size, p->t->u.user.target_size,
277 p->t->data[0], p->t->data[1]);
278 RTA_PUT(skb, TCA_IPT_TARG, p->t->u.user.target_size, t);
279 RTA_PUT(skb, TCA_IPT_INDEX, 4, &p->index);
280 RTA_PUT(skb, TCA_IPT_HOOK, 4, &p->hook);
281 RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c);
282 RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, p->tname);
283 tm.install = jiffies_to_clock_t(jiffies - p->tm.install);
284 tm.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
285 tm.expires = jiffies_to_clock_t(p->tm.expires);
286 RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm);
287 kfree(t);
288 return skb->len;
289
290 rtattr_failure:
291 skb_trim(skb, b - skb->data);
292 kfree(t);
293 return -1;
294}
295
296static struct tc_action_ops act_ipt_ops = {
297 .kind = "ipt",
298 .type = TCA_ACT_IPT,
299 .capab = TCA_CAP_NONE,
300 .owner = THIS_MODULE,
301 .act = tcf_ipt,
302 .dump = tcf_ipt_dump,
303 .cleanup = tcf_ipt_cleanup,
304 .lookup = tcf_hash_search,
305 .init = tcf_ipt_init,
306 .walk = tcf_generic_walker
307};
308
309MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
310MODULE_DESCRIPTION("Iptables target actions");
311MODULE_LICENSE("GPL");
312
313static int __init
314ipt_init_module(void)
315{
316 return tcf_register_action(&act_ipt_ops);
317}
318
319static void __exit
320ipt_cleanup_module(void)
321{
322 tcf_unregister_action(&act_ipt_ops);
323}
324
325module_init(ipt_init_module);
326module_exit(ipt_cleanup_module);
diff --git a/net/sched/mirred.c b/net/sched/mirred.c
new file mode 100644
index 000000000000..f309ce336803
--- /dev/null
+++ b/net/sched/mirred.c
@@ -0,0 +1,276 @@
1/*
2 * net/sched/mirred.c packet mirroring and redirect actions
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Jamal Hadi Salim (2002-4)
10 *
11 * TODO: Add ingress support (and socket redirect support)
12 *
13 */
14
15#include <asm/uaccess.h>
16#include <asm/system.h>
17#include <asm/bitops.h>
18#include <linux/config.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/netdevice.h>
30#include <linux/skbuff.h>
31#include <linux/rtnetlink.h>
32#include <linux/module.h>
33#include <linux/init.h>
34#include <linux/proc_fs.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37#include <linux/tc_act/tc_mirred.h>
38#include <net/tc_act/tc_mirred.h>
39
40#include <linux/etherdevice.h>
41#include <linux/if_arp.h>
42
43
44/* use generic hash table */
45#define MY_TAB_SIZE 8
46#define MY_TAB_MASK (MY_TAB_SIZE - 1)
47static u32 idx_gen;
48static struct tcf_mirred *tcf_mirred_ht[MY_TAB_SIZE];
49static DEFINE_RWLOCK(mirred_lock);
50
51/* override the defaults */
52#define tcf_st tcf_mirred
53#define tc_st tc_mirred
54#define tcf_t_lock mirred_lock
55#define tcf_ht tcf_mirred_ht
56
57#define CONFIG_NET_ACT_INIT 1
58#include <net/pkt_act.h>
59
60static inline int
61tcf_mirred_release(struct tcf_mirred *p, int bind)
62{
63 if (p) {
64 if (bind)
65 p->bindcnt--;
66 p->refcnt--;
67 if (!p->bindcnt && p->refcnt <= 0) {
68 dev_put(p->dev);
69 tcf_hash_destroy(p);
70 return 1;
71 }
72 }
73 return 0;
74}
75
76static int
77tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
78 int ovr, int bind)
79{
80 struct rtattr *tb[TCA_MIRRED_MAX];
81 struct tc_mirred *parm;
82 struct tcf_mirred *p;
83 struct net_device *dev = NULL;
84 int ret = 0;
85 int ok_push = 0;
86
87 if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0)
88 return -EINVAL;
89
90 if (tb[TCA_MIRRED_PARMS-1] == NULL ||
91 RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm))
92 return -EINVAL;
93 parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]);
94
95 if (parm->ifindex) {
96 dev = __dev_get_by_index(parm->ifindex);
97 if (dev == NULL)
98 return -ENODEV;
99 switch (dev->type) {
100 case ARPHRD_TUNNEL:
101 case ARPHRD_TUNNEL6:
102 case ARPHRD_SIT:
103 case ARPHRD_IPGRE:
104 case ARPHRD_VOID:
105 case ARPHRD_NONE:
106 ok_push = 0;
107 break;
108 default:
109 ok_push = 1;
110 break;
111 }
112 }
113
114 p = tcf_hash_check(parm->index, a, ovr, bind);
115 if (p == NULL) {
116 if (!parm->ifindex)
117 return -EINVAL;
118 p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
119 if (p == NULL)
120 return -ENOMEM;
121 ret = ACT_P_CREATED;
122 } else {
123 if (!ovr) {
124 tcf_mirred_release(p, bind);
125 return -EEXIST;
126 }
127 }
128
129 spin_lock_bh(&p->lock);
130 p->action = parm->action;
131 p->eaction = parm->eaction;
132 if (parm->ifindex) {
133 p->ifindex = parm->ifindex;
134 if (ret != ACT_P_CREATED)
135 dev_put(p->dev);
136 p->dev = dev;
137 dev_hold(dev);
138 p->ok_push = ok_push;
139 }
140 spin_unlock_bh(&p->lock);
141 if (ret == ACT_P_CREATED)
142 tcf_hash_insert(p);
143
144 DPRINTK("tcf_mirred_init index %d action %d eaction %d device %s "
145 "ifindex %d\n", parm->index, parm->action, parm->eaction,
146 dev->name, parm->ifindex);
147 return ret;
148}
149
150static int
151tcf_mirred_cleanup(struct tc_action *a, int bind)
152{
153 struct tcf_mirred *p = PRIV(a, mirred);
154
155 if (p != NULL)
156 return tcf_mirred_release(p, bind);
157 return 0;
158}
159
160static int
161tcf_mirred(struct sk_buff **pskb, struct tc_action *a)
162{
163 struct tcf_mirred *p = PRIV(a, mirred);
164 struct net_device *dev;
165 struct sk_buff *skb2 = NULL;
166 struct sk_buff *skb = *pskb;
167 u32 at = G_TC_AT(skb->tc_verd);
168
169 spin_lock(&p->lock);
170
171 dev = p->dev;
172 p->tm.lastuse = jiffies;
173
174 if (!(dev->flags&IFF_UP) ) {
175 if (net_ratelimit())
176 printk("mirred to Houston: device %s is gone!\n",
177 dev->name);
178bad_mirred:
179 if (skb2 != NULL)
180 kfree_skb(skb2);
181 p->qstats.overlimits++;
182 p->bstats.bytes += skb->len;
183 p->bstats.packets++;
184 spin_unlock(&p->lock);
185 /* should we be asking for packet to be dropped?
186 * may make sense for redirect case only
187 */
188 return TC_ACT_SHOT;
189 }
190
191 skb2 = skb_clone(skb, GFP_ATOMIC);
192 if (skb2 == NULL)
193 goto bad_mirred;
194 if (p->eaction != TCA_EGRESS_MIRROR && p->eaction != TCA_EGRESS_REDIR) {
195 if (net_ratelimit())
196 printk("tcf_mirred unknown action %d\n", p->eaction);
197 goto bad_mirred;
198 }
199
200 p->bstats.bytes += skb2->len;
201 p->bstats.packets++;
202 if (!(at & AT_EGRESS))
203 if (p->ok_push)
204 skb_push(skb2, skb2->dev->hard_header_len);
205
206 /* mirror is always swallowed */
207 if (p->eaction != TCA_EGRESS_MIRROR)
208 skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
209
210 skb2->dev = dev;
211 skb2->input_dev = skb->dev;
212 dev_queue_xmit(skb2);
213 spin_unlock(&p->lock);
214 return p->action;
215}
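/*
 * Note: both mirror and redirect transmit the clone via dev_queue_xmit();
 * the fate of the original packet is decided solely by the configured
 * p->action returned here (e.g. TC_ACT_PIPE to let it continue for a
 * mirror, or a consuming verdict such as TC_ACT_STOLEN for a redirect).
 */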
216
217static int
218tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
219{
220 unsigned char *b = skb->tail;
221 struct tc_mirred opt;
222 struct tcf_mirred *p = PRIV(a, mirred);
223 struct tcf_t t;
224
225 opt.index = p->index;
226 opt.action = p->action;
227 opt.refcnt = p->refcnt - ref;
228 opt.bindcnt = p->bindcnt - bind;
229 opt.eaction = p->eaction;
230 opt.ifindex = p->ifindex;
231 DPRINTK("tcf_mirred_dump index %d action %d eaction %d ifindex %d\n",
232 p->index, p->action, p->eaction, p->ifindex);
233 RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
234 t.install = jiffies_to_clock_t(jiffies - p->tm.install);
235 t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
236 t.expires = jiffies_to_clock_t(p->tm.expires);
237 RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t);
238 return skb->len;
239
240 rtattr_failure:
241 skb_trim(skb, b - skb->data);
242 return -1;
243}
244
245static struct tc_action_ops act_mirred_ops = {
246 .kind = "mirred",
247 .type = TCA_ACT_MIRRED,
248 .capab = TCA_CAP_NONE,
249 .owner = THIS_MODULE,
250 .act = tcf_mirred,
251 .dump = tcf_mirred_dump,
252 .cleanup = tcf_mirred_cleanup,
253 .lookup = tcf_hash_search,
254 .init = tcf_mirred_init,
255 .walk = tcf_generic_walker
256};
257
258MODULE_AUTHOR("Jamal Hadi Salim(2002)");
259MODULE_DESCRIPTION("Device Mirror/redirect actions");
260MODULE_LICENSE("GPL");
261
262static int __init
263mirred_init_module(void)
264{
265 printk("Mirror/redirect action on\n");
266 return tcf_register_action(&act_mirred_ops);
267}
268
269static void __exit
270mirred_cleanup_module(void)
271{
272 tcf_unregister_action(&act_mirred_ops);
273}
274
275module_init(mirred_init_module);
276module_exit(mirred_cleanup_module);
diff --git a/net/sched/pedit.c b/net/sched/pedit.c
new file mode 100644
index 000000000000..678be6a645fb
--- /dev/null
+++ b/net/sched/pedit.c
@@ -0,0 +1,288 @@
1/*
2 * net/sched/pedit.c Generic packet editor
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Jamal Hadi Salim (2002-4)
10 */
11
12#include <asm/uaccess.h>
13#include <asm/system.h>
14#include <asm/bitops.h>
15#include <linux/config.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/sched.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/netdevice.h>
27#include <linux/skbuff.h>
28#include <linux/rtnetlink.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/proc_fs.h>
32#include <net/sock.h>
33#include <net/pkt_sched.h>
34#include <linux/tc_act/tc_pedit.h>
35#include <net/tc_act/tc_pedit.h>
36
37
38#define PEDIT_DEB 1
39
40/* use generic hash table */
41#define MY_TAB_SIZE 16
42#define MY_TAB_MASK 15
43static u32 idx_gen;
44static struct tcf_pedit *tcf_pedit_ht[MY_TAB_SIZE];
45static DEFINE_RWLOCK(pedit_lock);
46
47#define tcf_st tcf_pedit
48#define tc_st tc_pedit
49#define tcf_t_lock pedit_lock
50#define tcf_ht tcf_pedit_ht
51
52#define CONFIG_NET_ACT_INIT 1
53#include <net/pkt_act.h>
54
55static int
56tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
57 int ovr, int bind)
58{
59 struct rtattr *tb[TCA_PEDIT_MAX];
60 struct tc_pedit *parm;
61 int ret = 0;
62 struct tcf_pedit *p;
63 struct tc_pedit_key *keys = NULL;
64 int ksize;
65
66 if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0)
67 return -EINVAL;
68
69 if (tb[TCA_PEDIT_PARMS - 1] == NULL ||
70 RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm))
71 return -EINVAL;
72 parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]);
73 ksize = parm->nkeys * sizeof(struct tc_pedit_key);
74 if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize)
75 return -EINVAL;
76
77 p = tcf_hash_check(parm->index, a, ovr, bind);
78 if (p == NULL) {
79 if (!parm->nkeys)
80 return -EINVAL;
81 p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
82 if (p == NULL)
83 return -ENOMEM;
84 keys = kmalloc(ksize, GFP_KERNEL);
85 if (keys == NULL) {
86 kfree(p);
87 return -ENOMEM;
88 }
89 ret = ACT_P_CREATED;
90 } else {
91 if (!ovr) {
92 tcf_hash_release(p, bind);
93 return -EEXIST;
94 }
95 if (p->nkeys && p->nkeys != parm->nkeys) {
96 keys = kmalloc(ksize, GFP_KERNEL);
97 if (keys == NULL)
98 return -ENOMEM;
99 }
100 }
101
102 spin_lock_bh(&p->lock);
103 p->flags = parm->flags;
104 p->action = parm->action;
105 if (keys) {
106 kfree(p->keys);
107 p->keys = keys;
108 p->nkeys = parm->nkeys;
109 }
110 memcpy(p->keys, parm->keys, ksize);
111 spin_unlock_bh(&p->lock);
112 if (ret == ACT_P_CREATED)
113 tcf_hash_insert(p);
114 return ret;
115}
116
117static int
118tcf_pedit_cleanup(struct tc_action *a, int bind)
119{
120 struct tcf_pedit *p = PRIV(a, pedit);
121
122 if (p != NULL) {
123 struct tc_pedit_key *keys = p->keys;
124 if (tcf_hash_release(p, bind)) {
125 kfree(keys);
126 return 1;
127 }
128 }
129 return 0;
130}
131
132static int
133tcf_pedit(struct sk_buff **pskb, struct tc_action *a)
134{
135 struct tcf_pedit *p = PRIV(a, pedit);
136 struct sk_buff *skb = *pskb;
137 int i, munged = 0;
138 u8 *pptr;
139
140 if (!(skb->tc_verd & TC_OK2MUNGE)) {
141 /* should we set skb->cloned? */
142 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
143 return p->action;
144 }
145 }
146
147 pptr = skb->nh.raw;
148
149 spin_lock(&p->lock);
150
151 p->tm.lastuse = jiffies;
152
153 if (p->nkeys > 0) {
154 struct tc_pedit_key *tkey = p->keys;
155
156 for (i = p->nkeys; i > 0; i--, tkey++) {
157 u32 *ptr;
158 int offset = tkey->off;
159
160 if (tkey->offmask) {
161 if (skb->len > tkey->at) {
162 char *j = pptr + tkey->at;
163 offset += ((*j & tkey->offmask) >>
164 tkey->shift);
165 } else {
166 goto bad;
167 }
168 }
169
170 if (offset % 4) {
171 printk("offset must be on 32 bit boundaries\n");
172 goto bad;
173 }
174 if (skb->len < 0 || (offset > 0 && offset > skb->len)) {
175 printk("offset %d cant exceed pkt length %d\n",
176 offset, skb->len);
177 goto bad;
178 }
179
180 ptr = (u32 *)(pptr+offset);
181 /* just do it, baby */
182 *ptr = ((*ptr & tkey->mask) ^ tkey->val);
183 munged++;
184 }
185
186 if (munged)
187 skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
188 goto done;
189 } else {
190 printk("pedit BUG: index %d\n",p->index);
191 }
192
193bad:
194 p->qstats.overlimits++;
195done:
196 p->bstats.bytes += skb->len;
197 p->bstats.packets++;
198 spin_unlock(&p->lock);
199 return p->action;
200}
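/*
 * A worked example of the key transformation above (offset, mask and val
 * are illustrative only): each key rewrites one 32 bit aligned word of
 * the packet as
 *
 *	*ptr = (*ptr & tkey->mask) ^ tkey->val;
 *
 * so forcing the low 16 bits of the word at offset 8 to 0x1234 while
 * preserving the upper 16 bits means
 *
 *	off  = 8
 *	mask = 0xffff0000	bits to keep
 *	val  = 0x00001234	bits to set
 *
 * The word is modified exactly as it sits in the packet; any byte order
 * handling is up to whoever builds the keys.
 */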
201
202static int
203tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,int bind, int ref)
204{
205 unsigned char *b = skb->tail;
206 struct tc_pedit *opt;
207 struct tcf_pedit *p = PRIV(a, pedit);
208 struct tcf_t t;
209 int s;
210
211 s = sizeof(*opt) + p->nkeys * sizeof(struct tc_pedit_key);
212
213 /* netlink spinlocks held above us - must use ATOMIC */
214 opt = kmalloc(s, GFP_ATOMIC);
215 if (opt == NULL)
216 return -ENOBUFS;
217 memset(opt, 0, s);
218
219 memcpy(opt->keys, p->keys, p->nkeys * sizeof(struct tc_pedit_key));
220 opt->index = p->index;
221 opt->nkeys = p->nkeys;
222 opt->flags = p->flags;
223 opt->action = p->action;
224 opt->refcnt = p->refcnt - ref;
225 opt->bindcnt = p->bindcnt - bind;
226
227
228#ifdef PEDIT_DEB
229 {
230 /* Debug - get rid of later */
231 int i;
232 struct tc_pedit_key *key = opt->keys;
233
234 for (i=0; i<opt->nkeys; i++, key++) {
235 printk( "\n key #%d",i);
236 printk( " at %d: val %08x mask %08x",
237 (unsigned int)key->off,
238 (unsigned int)key->val,
239 (unsigned int)key->mask);
240 }
241 }
242#endif
243
244 RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt);
245 t.install = jiffies_to_clock_t(jiffies - p->tm.install);
246 t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
247 t.expires = jiffies_to_clock_t(p->tm.expires);
248 RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t);
249 return skb->len;
250
251rtattr_failure:
252 skb_trim(skb, b - skb->data);
253 return -1;
254}
255
256static
257struct tc_action_ops act_pedit_ops = {
258 .kind = "pedit",
259 .type = TCA_ACT_PEDIT,
260 .capab = TCA_CAP_NONE,
261 .owner = THIS_MODULE,
262 .act = tcf_pedit,
263 .dump = tcf_pedit_dump,
264 .cleanup = tcf_pedit_cleanup,
265 .lookup = tcf_hash_search,
266 .init = tcf_pedit_init,
267 .walk = tcf_generic_walker
268};
269
270MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
271MODULE_DESCRIPTION("Generic Packet Editor actions");
272MODULE_LICENSE("GPL");
273
274static int __init
275pedit_init_module(void)
276{
277 return tcf_register_action(&act_pedit_ops);
278}
279
280static void __exit
281pedit_cleanup_module(void)
282{
283 tcf_unregister_action(&act_pedit_ops);
284}
285
286module_init(pedit_init_module);
287module_exit(pedit_cleanup_module);
288
diff --git a/net/sched/police.c b/net/sched/police.c
new file mode 100644
index 000000000000..c03545faf523
--- /dev/null
+++ b/net/sched/police.c
@@ -0,0 +1,612 @@
1/*
2 * net/sched/police.c Input police filter.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * J Hadi Salim (action changes)
11 */
12
13#include <asm/uaccess.h>
14#include <asm/system.h>
15#include <linux/bitops.h>
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/types.h>
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
30#include <linux/module.h>
31#include <linux/rtnetlink.h>
32#include <linux/init.h>
33#include <net/sock.h>
34#include <net/act_api.h>
35
36#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log])
37#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log])
38#define PRIV(a) ((struct tcf_police *) (a)->priv)
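/*
 * R_tab/P_tab are rate tables obtained via qdisc_get_rtab(): data[] maps
 * a packet length (in cells of 2^cell_log bytes) to the time needed to
 * transmit it at the configured rate, so L2T()/L2T_P() return the token
 * cost of an L byte packet.  The bucket arithmetic used further down then
 * boils down to (sketch only):
 *
 *	toks = min(p->toks + time_since_last_packet, p->burst);
 *	toks -= L2T(p, skb->len);
 *	conforming if toks >= 0, otherwise overlimit -> p->action
 */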
39
40/* use generic hash table */
41#define MY_TAB_SIZE 16
42#define MY_TAB_MASK 15
43static u32 idx_gen;
44static struct tcf_police *tcf_police_ht[MY_TAB_SIZE];
45/* Policer hash table lock */
46static DEFINE_RWLOCK(police_lock);
47
48/* Each policer is serialized by its individual spinlock */
49
50static __inline__ unsigned tcf_police_hash(u32 index)
51{
52 return index&0xF;
53}
54
55static __inline__ struct tcf_police * tcf_police_lookup(u32 index)
56{
57 struct tcf_police *p;
58
59 read_lock(&police_lock);
60 for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) {
61 if (p->index == index)
62 break;
63 }
64 read_unlock(&police_lock);
65 return p;
66}
67
68#ifdef CONFIG_NET_CLS_ACT
69static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
70 int type, struct tc_action *a)
71{
72 struct tcf_police *p;
73 int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
74 struct rtattr *r;
75
76 read_lock(&police_lock);
77
78 s_i = cb->args[0];
79
80 for (i = 0; i < MY_TAB_SIZE; i++) {
81 p = tcf_police_ht[tcf_police_hash(i)];
82
83 for (; p; p = p->next) {
84 index++;
85 if (index < s_i)
86 continue;
87 a->priv = p;
88 a->order = index;
89 r = (struct rtattr*) skb->tail;
90 RTA_PUT(skb, a->order, 0, NULL);
91 if (type == RTM_DELACTION)
92 err = tcf_action_dump_1(skb, a, 0, 1);
93 else
94 err = tcf_action_dump_1(skb, a, 0, 0);
95 if (err < 0) {
96 index--;
97 skb_trim(skb, (u8*)r - skb->data);
98 goto done;
99 }
100 r->rta_len = skb->tail - (u8*)r;
101 n_i++;
102 }
103 }
104done:
105 read_unlock(&police_lock);
106 if (n_i)
107 cb->args[0] += n_i;
108 return n_i;
109
110rtattr_failure:
111 skb_trim(skb, (u8*)r - skb->data);
112 goto done;
113}
114
115static inline int
116tcf_hash_search(struct tc_action *a, u32 index)
117{
118 struct tcf_police *p = tcf_police_lookup(index);
119
120 if (p != NULL) {
121 a->priv = p;
122 return 1;
123 } else {
124 return 0;
125 }
126}
127#endif
128
129static inline u32 tcf_police_new_index(void)
130{
131 do {
132 if (++idx_gen == 0)
133 idx_gen = 1;
134 } while (tcf_police_lookup(idx_gen));
135
136 return idx_gen;
137}
138
139void tcf_police_destroy(struct tcf_police *p)
140{
141 unsigned h = tcf_police_hash(p->index);
142 struct tcf_police **p1p;
143
144 for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) {
145 if (*p1p == p) {
146 write_lock_bh(&police_lock);
147 *p1p = p->next;
148 write_unlock_bh(&police_lock);
149#ifdef CONFIG_NET_ESTIMATOR
150 gen_kill_estimator(&p->bstats, &p->rate_est);
151#endif
152 if (p->R_tab)
153 qdisc_put_rtab(p->R_tab);
154 if (p->P_tab)
155 qdisc_put_rtab(p->P_tab);
156 kfree(p);
157 return;
158 }
159 }
160 BUG_TRAP(0);
161}
162
163#ifdef CONFIG_NET_CLS_ACT
164static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est,
165 struct tc_action *a, int ovr, int bind)
166{
167 unsigned h;
168 int ret = 0, err;
169 struct rtattr *tb[TCA_POLICE_MAX];
170 struct tc_police *parm;
171 struct tcf_police *p;
172 struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
173
174 if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0)
175 return -EINVAL;
176
177 if (tb[TCA_POLICE_TBF-1] == NULL ||
178 RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm))
179 return -EINVAL;
180 parm = RTA_DATA(tb[TCA_POLICE_TBF-1]);
181
182 if (tb[TCA_POLICE_RESULT-1] != NULL &&
183 RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))
184 return -EINVAL;
185 if (tb[TCA_POLICE_RESULT-1] != NULL &&
186 RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))
187 return -EINVAL;
188
189 if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) {
190 a->priv = p;
191 if (bind) {
192 p->bindcnt += 1;
193 p->refcnt += 1;
194 }
195 if (ovr)
196 goto override;
197 return ret;
198 }
199
200 p = kmalloc(sizeof(*p), GFP_KERNEL);
201 if (p == NULL)
202 return -ENOMEM;
203 memset(p, 0, sizeof(*p));
204
205 ret = ACT_P_CREATED;
206 p->refcnt = 1;
207 spin_lock_init(&p->lock);
208 p->stats_lock = &p->lock;
209 if (bind)
210 p->bindcnt = 1;
211override:
212 if (parm->rate.rate) {
213 err = -ENOMEM;
214 R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]);
215 if (R_tab == NULL)
216 goto failure;
217 if (parm->peakrate.rate) {
218 P_tab = qdisc_get_rtab(&parm->peakrate,
219 tb[TCA_POLICE_PEAKRATE-1]);
220 if (P_tab == NULL) {
221 qdisc_put_rtab(R_tab);
222 goto failure;
223 }
224 }
225 }
226 /* No failure allowed after this point */
227 spin_lock_bh(&p->lock);
228 if (R_tab != NULL) {
229 qdisc_put_rtab(p->R_tab);
230 p->R_tab = R_tab;
231 }
232 if (P_tab != NULL) {
233 qdisc_put_rtab(p->P_tab);
234 p->P_tab = P_tab;
235 }
236
237 if (tb[TCA_POLICE_RESULT-1])
238 p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
239 p->toks = p->burst = parm->burst;
240 p->mtu = parm->mtu;
241 if (p->mtu == 0) {
242 p->mtu = ~0;
243 if (p->R_tab)
244 p->mtu = 255<<p->R_tab->rate.cell_log;
245 }
246 if (p->P_tab)
247 p->ptoks = L2T_P(p, p->mtu);
248 p->action = parm->action;
249
250#ifdef CONFIG_NET_ESTIMATOR
251 if (tb[TCA_POLICE_AVRATE-1])
252 p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
253 if (est)
254 gen_replace_estimator(&p->bstats, &p->rate_est, p->stats_lock, est);
255#endif
256
257 spin_unlock_bh(&p->lock);
258 if (ret != ACT_P_CREATED)
259 return ret;
260
261 PSCHED_GET_TIME(p->t_c);
262 p->index = parm->index ? : tcf_police_new_index();
263 h = tcf_police_hash(p->index);
264 write_lock_bh(&police_lock);
265 p->next = tcf_police_ht[h];
266 tcf_police_ht[h] = p;
267 write_unlock_bh(&police_lock);
268
269 a->priv = p;
270 return ret;
271
272failure:
273 if (ret == ACT_P_CREATED)
274 kfree(p);
275 return err;
276}
277
278static int tcf_act_police_cleanup(struct tc_action *a, int bind)
279{
280 struct tcf_police *p = PRIV(a);
281
282 if (p != NULL)
283 return tcf_police_release(p, bind);
284 return 0;
285}
286
287static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a)
288{
289 psched_time_t now;
290 struct sk_buff *skb = *pskb;
291 struct tcf_police *p = PRIV(a);
292 long toks;
293 long ptoks = 0;
294
295 spin_lock(&p->lock);
296
297 p->bstats.bytes += skb->len;
298 p->bstats.packets++;
299
300#ifdef CONFIG_NET_ESTIMATOR
301 if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) {
302 p->qstats.overlimits++;
303 spin_unlock(&p->lock);
304 return p->action;
305 }
306#endif
307
308 if (skb->len <= p->mtu) {
309 if (p->R_tab == NULL) {
310 spin_unlock(&p->lock);
311 return p->result;
312 }
313
314 PSCHED_GET_TIME(now);
315
316 toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst);
317
318 if (p->P_tab) {
319 ptoks = toks + p->ptoks;
320 if (ptoks > (long)L2T_P(p, p->mtu))
321 ptoks = (long)L2T_P(p, p->mtu);
322 ptoks -= L2T_P(p, skb->len);
323 }
324 toks += p->toks;
325 if (toks > (long)p->burst)
326 toks = p->burst;
327 toks -= L2T(p, skb->len);
328
329 if ((toks|ptoks) >= 0) {
330 p->t_c = now;
331 p->toks = toks;
332 p->ptoks = ptoks;
333 spin_unlock(&p->lock);
334 return p->result;
335 }
336 }
337
338 p->qstats.overlimits++;
339 spin_unlock(&p->lock);
340 return p->action;
341}
342
343static int
344tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
345{
346 unsigned char *b = skb->tail;
347 struct tc_police opt;
348 struct tcf_police *p = PRIV(a);
349
350 opt.index = p->index;
351 opt.action = p->action;
352 opt.mtu = p->mtu;
353 opt.burst = p->burst;
354 opt.refcnt = p->refcnt - ref;
355 opt.bindcnt = p->bindcnt - bind;
356 if (p->R_tab)
357 opt.rate = p->R_tab->rate;
358 else
359 memset(&opt.rate, 0, sizeof(opt.rate));
360 if (p->P_tab)
361 opt.peakrate = p->P_tab->rate;
362 else
363 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
364 RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
365 if (p->result)
366 RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
367#ifdef CONFIG_NET_ESTIMATOR
368 if (p->ewma_rate)
369 RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
370#endif
371 return skb->len;
372
373rtattr_failure:
374 skb_trim(skb, b - skb->data);
375 return -1;
376}
377
378MODULE_AUTHOR("Alexey Kuznetsov");
379MODULE_DESCRIPTION("Policing actions");
380MODULE_LICENSE("GPL");
381
382static struct tc_action_ops act_police_ops = {
383 .kind = "police",
384 .type = TCA_ID_POLICE,
385 .capab = TCA_CAP_NONE,
386 .owner = THIS_MODULE,
387 .act = tcf_act_police,
388 .dump = tcf_act_police_dump,
389 .cleanup = tcf_act_police_cleanup,
390 .lookup = tcf_hash_search,
391 .init = tcf_act_police_locate,
392 .walk = tcf_generic_walker
393};
394
395static int __init
396police_init_module(void)
397{
398 return tcf_register_action(&act_police_ops);
399}
400
401static void __exit
402police_cleanup_module(void)
403{
404 tcf_unregister_action(&act_police_ops);
405}
406
407module_init(police_init_module);
408module_exit(police_cleanup_module);
409
410#endif
411
412struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est)
413{
414 unsigned h;
415 struct tcf_police *p;
416 struct rtattr *tb[TCA_POLICE_MAX];
417 struct tc_police *parm;
418
419 if (rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0)
420 return NULL;
421
422 if (tb[TCA_POLICE_TBF-1] == NULL ||
423 RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm))
424 return NULL;
425
426 parm = RTA_DATA(tb[TCA_POLICE_TBF-1]);
427
428 if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) {
429 p->refcnt++;
430 return p;
431 }
432
433 p = kmalloc(sizeof(*p), GFP_KERNEL);
434 if (p == NULL)
435 return NULL;
436
437 memset(p, 0, sizeof(*p));
438 p->refcnt = 1;
439 spin_lock_init(&p->lock);
440 p->stats_lock = &p->lock;
441 if (parm->rate.rate) {
442 p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]);
443 if (p->R_tab == NULL)
444 goto failure;
445 if (parm->peakrate.rate) {
446 p->P_tab = qdisc_get_rtab(&parm->peakrate,
447 tb[TCA_POLICE_PEAKRATE-1]);
448 if (p->P_tab == NULL)
449 goto failure;
450 }
451 }
452 if (tb[TCA_POLICE_RESULT-1]) {
453 if (RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))
454 goto failure;
455 p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
456 }
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tb[TCA_POLICE_AVRATE-1]) {
459 if (RTA_PAYLOAD(tb[TCA_POLICE_AVRATE-1]) != sizeof(u32))
460 goto failure;
461 p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
462 }
463#endif
464 p->toks = p->burst = parm->burst;
465 p->mtu = parm->mtu;
466 if (p->mtu == 0) {
467 p->mtu = ~0;
468 if (p->R_tab)
469 p->mtu = 255<<p->R_tab->rate.cell_log;
470 }
471 if (p->P_tab)
472 p->ptoks = L2T_P(p, p->mtu);
473 PSCHED_GET_TIME(p->t_c);
474 p->index = parm->index ? : tcf_police_new_index();
475 p->action = parm->action;
476#ifdef CONFIG_NET_ESTIMATOR
477 if (est)
478 gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est);
479#endif
480 h = tcf_police_hash(p->index);
481 write_lock_bh(&police_lock);
482 p->next = tcf_police_ht[h];
483 tcf_police_ht[h] = p;
484 write_unlock_bh(&police_lock);
485 return p;
486
487failure:
488 if (p->R_tab)
489 qdisc_put_rtab(p->R_tab);
490 kfree(p);
491 return NULL;
492}
493
494int tcf_police(struct sk_buff *skb, struct tcf_police *p)
495{
496 psched_time_t now;
497 long toks;
498 long ptoks = 0;
499
500 spin_lock(&p->lock);
501
502 p->bstats.bytes += skb->len;
503 p->bstats.packets++;
504
505#ifdef CONFIG_NET_ESTIMATOR
506 if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) {
507 p->qstats.overlimits++;
508 spin_unlock(&p->lock);
509 return p->action;
510 }
511#endif
512
513 if (skb->len <= p->mtu) {
514 if (p->R_tab == NULL) {
515 spin_unlock(&p->lock);
516 return p->result;
517 }
518
519 PSCHED_GET_TIME(now);
520
521 toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst);
522
523 if (p->P_tab) {
524 ptoks = toks + p->ptoks;
525 if (ptoks > (long)L2T_P(p, p->mtu))
526 ptoks = (long)L2T_P(p, p->mtu);
527 ptoks -= L2T_P(p, skb->len);
528 }
529 toks += p->toks;
530 if (toks > (long)p->burst)
531 toks = p->burst;
532 toks -= L2T(p, skb->len);
533
534 if ((toks|ptoks) >= 0) {
535 p->t_c = now;
536 p->toks = toks;
537 p->ptoks = ptoks;
538 spin_unlock(&p->lock);
539 return p->result;
540 }
541 }
542
543 p->qstats.overlimits++;
544 spin_unlock(&p->lock);
545 return p->action;
546}
547
548int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p)
549{
550 unsigned char *b = skb->tail;
551 struct tc_police opt;
552
553 opt.index = p->index;
554 opt.action = p->action;
555 opt.mtu = p->mtu;
556 opt.burst = p->burst;
557 if (p->R_tab)
558 opt.rate = p->R_tab->rate;
559 else
560 memset(&opt.rate, 0, sizeof(opt.rate));
561 if (p->P_tab)
562 opt.peakrate = p->P_tab->rate;
563 else
564 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
565 RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
566 if (p->result)
567 RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
568#ifdef CONFIG_NET_ESTIMATOR
569 if (p->ewma_rate)
570 RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
571#endif
572 return skb->len;
573
574rtattr_failure:
575 skb_trim(skb, b - skb->data);
576 return -1;
577}
578
579int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *p)
580{
581 struct gnet_dump d;
582
583 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
584 TCA_XSTATS, p->stats_lock, &d) < 0)
585 goto errout;
586
587 if (gnet_stats_copy_basic(&d, &p->bstats) < 0 ||
588#ifdef CONFIG_NET_ESTIMATOR
589 gnet_stats_copy_rate_est(&d, &p->rate_est) < 0 ||
590#endif
591 gnet_stats_copy_queue(&d, &p->qstats) < 0)
592 goto errout;
593
594 if (gnet_stats_finish_copy(&d) < 0)
595 goto errout;
596
597 return 0;
598
599errout:
600 return -1;
601}
602
603
604EXPORT_SYMBOL(tcf_police);
605EXPORT_SYMBOL(tcf_police_destroy);
606EXPORT_SYMBOL(tcf_police_dump);
607EXPORT_SYMBOL(tcf_police_dump_stats);
608EXPORT_SYMBOL(tcf_police_hash);
609EXPORT_SYMBOL(tcf_police_ht);
610EXPORT_SYMBOL(tcf_police_locate);
611EXPORT_SYMBOL(tcf_police_lookup);
612EXPORT_SYMBOL(tcf_police_new_index);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
new file mode 100644
index 000000000000..4323a74eea30
--- /dev/null
+++ b/net/sched/sch_api.c
@@ -0,0 +1,1296 @@
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/sched.h>
23#include <linux/string.h>
24#include <linux/mm.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/in.h>
28#include <linux/errno.h>
29#include <linux/interrupt.h>
30#include <linux/netdevice.h>
31#include <linux/skbuff.h>
32#include <linux/rtnetlink.h>
33#include <linux/init.h>
34#include <linux/proc_fs.h>
35#include <linux/seq_file.h>
36#include <linux/kmod.h>
37#include <linux/list.h>
38#include <linux/bitops.h>
39
40#include <net/sock.h>
41#include <net/pkt_sched.h>
42
43#include <asm/processor.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46
47static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
48 struct Qdisc *old, struct Qdisc *new);
49static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
50 struct Qdisc *q, unsigned long cl, int event);
51
52/*
53
54 Short review.
55 -------------
56
57 This file consists of two interrelated parts:
58
59 1. queueing disciplines manager frontend.
60 2. traffic classes manager frontend.
61
62 Generally, a queueing discipline ("qdisc") is a black box,
63 which is able to enqueue packets and to dequeue them (when
64 the device is ready to send something) in an order and at times
65 determined by the algorithm hidden inside it.
66
67 qdiscs are divided into two categories:
68 - "queues", which have no internal structure visible from outside.
69 - "schedulers", which split all the packets into "traffic classes",
70 using "packet classifiers" (look at cls_api.c)
71
72 In turn, classes may have child qdiscs (as a rule, queues)
73 attached to them, and so on recursively.
74
75 The goal of the routines in this file is to translate
76 the information supplied by the user in the form of handles
77 into a form more intelligible to the kernel, to perform the
78 sanity checks and the bookkeeping common to all qdiscs,
79 and to provide rtnetlink notifications.
80
81 All real intelligent work is done inside qdisc modules.
82
83
84
85 Every discipline has two major routines: enqueue and dequeue.
86
87 ---dequeue
88
89 dequeue usually returns an skb to send. It is allowed to return NULL,
90 but that does not mean that the queue is empty; it just means that
91 the discipline does not want to send anything right now.
92 The queue is really empty only if q->q.qlen == 0.
93 For complicated disciplines with multiple queues, q->q is not the
94 real packet queue, but q->q.qlen must still be valid.
95
96 ---enqueue
97
98 enqueue returns 0 if the packet was enqueued successfully.
99 If a packet (this one or another one) was dropped, it returns
100 a non-zero error code:
101 NET_XMIT_DROP - this packet was dropped.
102 Expected action: do not back off, but wait until the queue clears.
103 NET_XMIT_CN - this packet was probably enqueued, but another one was dropped.
104 Expected action: back off or ignore.
105 NET_XMIT_POLICED - dropped by the policer.
106 Expected action: back off or report an error to real-time apps.
107
108 Auxiliary routines:
109
110 ---requeue
111
112 requeues a packet that has already been dequeued. It is used for
113 non-standard or just plain buggy devices, which can defer output even if dev->tbusy=0.
114
115 ---reset
116
117 returns the qdisc to its initial state: purges all buffers, clears all
118 timers, counters (except for statistics), etc.
119
120 ---init
121
122 initializes newly created qdisc.
123
124 ---destroy
125
126 destroys resources allocated by init and during the lifetime of the qdisc.
127
128 ---change
129
130 changes qdisc parameters.
131 */
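/*
 * Editor's sketch (not part of the original patch): a minimal qdisc
 * against the interface described above, to make the contract concrete.
 * All example_* names are hypothetical; locking, init/reset/destroy and
 * the classful hooks are omitted.
 */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (sch->q.qlen >= 128) {		/* arbitrary hard limit */
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_DROP;		/* caller should not back off */
	}
	__skb_queue_tail(&sch->q, skb);		/* also maintains sch->q.qlen */
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;				/* enqueued successfully */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* NULL only means "nothing to send right now"; the queue is
	 * really empty when sch->q.qlen == 0. */
	return __skb_dequeue(&sch->q);
}

static struct Qdisc_ops example_qdisc_ops = {
	.id		= "example",
	.priv_size	= 0,
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.owner		= THIS_MODULE,
	/* the missing requeue hook is filled in by register_qdisc() below */
};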
132
133/* Protects list of registered TC modules. It is pure SMP lock. */
134static DEFINE_RWLOCK(qdisc_mod_lock);
135
136
137/************************************************
138 * Queueing disciplines manipulation. *
139 ************************************************/
140
141
142/* The list of all installed queueing disciplines. */
143
144static struct Qdisc_ops *qdisc_base;
145
146/* Register/unregister queueing discipline */
147
148int register_qdisc(struct Qdisc_ops *qops)
149{
150 struct Qdisc_ops *q, **qp;
151 int rc = -EEXIST;
152
153 write_lock(&qdisc_mod_lock);
154 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
155 if (!strcmp(qops->id, q->id))
156 goto out;
157
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
164
165 qops->next = NULL;
166 *qp = qops;
167 rc = 0;
168out:
169 write_unlock(&qdisc_mod_lock);
170 return rc;
171}
172
173int unregister_qdisc(struct Qdisc_ops *qops)
174{
175 struct Qdisc_ops *q, **qp;
176 int err = -ENOENT;
177
178 write_lock(&qdisc_mod_lock);
179 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
180 if (q == qops)
181 break;
182 if (q) {
183 *qp = q->next;
184 q->next = NULL;
185 err = 0;
186 }
187 write_unlock(&qdisc_mod_lock);
188 return err;
189}
190
191/* We know the handle. Find the qdisc among all qdiscs attached to the
192 device (root qdisc, all its children, children of children, etc.)
193 */
194
195struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
196{
197 struct Qdisc *q;
198
199 read_lock_bh(&qdisc_tree_lock);
200 list_for_each_entry(q, &dev->qdisc_list, list) {
201 if (q->handle == handle) {
202 read_unlock_bh(&qdisc_tree_lock);
203 return q;
204 }
205 }
206 read_unlock_bh(&qdisc_tree_lock);
207 return NULL;
208}
209
210static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
211{
212 unsigned long cl;
213 struct Qdisc *leaf;
214 struct Qdisc_class_ops *cops = p->ops->cl_ops;
215
216 if (cops == NULL)
217 return NULL;
218 cl = cops->get(p, classid);
219
220 if (cl == 0)
221 return NULL;
222 leaf = cops->leaf(p, cl);
223 cops->put(p, cl);
224 return leaf;
225}
226
227/* Find queueing discipline by name */
228
229static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
230{
231 struct Qdisc_ops *q = NULL;
232
233 if (kind) {
234 read_lock(&qdisc_mod_lock);
235 for (q = qdisc_base; q; q = q->next) {
236 if (rtattr_strcmp(kind, q->id) == 0) {
237 if (!try_module_get(q->owner))
238 q = NULL;
239 break;
240 }
241 }
242 read_unlock(&qdisc_mod_lock);
243 }
244 return q;
245}
246
247static struct qdisc_rate_table *qdisc_rtab_list;
248
249struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
250{
251 struct qdisc_rate_table *rtab;
252
253 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
254 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
255 rtab->refcnt++;
256 return rtab;
257 }
258 }
259
260 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
261 return NULL;
262
263 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
264 if (rtab) {
265 rtab->rate = *r;
266 rtab->refcnt = 1;
267 memcpy(rtab->data, RTA_DATA(tab), 1024);
268 rtab->next = qdisc_rtab_list;
269 qdisc_rtab_list = rtab;
270 }
271 return rtab;
272}
273
274void qdisc_put_rtab(struct qdisc_rate_table *tab)
275{
276 struct qdisc_rate_table *rtab, **rtabp;
277
278 if (!tab || --tab->refcnt)
279 return;
280
281 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
282 if (rtab == tab) {
283 *rtabp = rtab->next;
284 kfree(rtab);
285 return;
286 }
287 }
288}
289
290
291/* Allocate a unique handle from the space managed by the kernel */
292
293static u32 qdisc_alloc_handle(struct net_device *dev)
294{
295 int i = 0x10000;
296 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
297
298 do {
299 autohandle += TC_H_MAKE(0x10000U, 0);
300 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
301 autohandle = TC_H_MAKE(0x80000000U, 0);
302 } while (qdisc_lookup(dev, autohandle) && --i > 0);
303
304 return i>0 ? autohandle : 0;
305}
306
307/* Attach toplevel qdisc to device dev */
308
309static struct Qdisc *
310dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
311{
312 struct Qdisc *oqdisc;
313
314 if (dev->flags & IFF_UP)
315 dev_deactivate(dev);
316
317 qdisc_lock_tree(dev);
318 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
319 oqdisc = dev->qdisc_ingress;
320 /* Prune old scheduler */
321 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
322 /* delete */
323 qdisc_reset(oqdisc);
324 dev->qdisc_ingress = NULL;
325 } else { /* new */
326 dev->qdisc_ingress = qdisc;
327 }
328
329 } else {
330
331 oqdisc = dev->qdisc_sleeping;
332
333 /* Prune old scheduler */
334 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
335 qdisc_reset(oqdisc);
336
337 /* ... and graft new one */
338 if (qdisc == NULL)
339 qdisc = &noop_qdisc;
340 dev->qdisc_sleeping = qdisc;
341 dev->qdisc = &noop_qdisc;
342 }
343
344 qdisc_unlock_tree(dev);
345
346 if (dev->flags & IFF_UP)
347 dev_activate(dev);
348
349 return oqdisc;
350}
351
352
353/* Graft qdisc "new" to class "classid" of qdisc "parent" or
354 to device "dev".
355
356 The old qdisc is not destroyed but returned in *old.
357 */
358
359static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
360 u32 classid,
361 struct Qdisc *new, struct Qdisc **old)
362{
363 int err = 0;
364 struct Qdisc *q = *old;
365
366
367 if (parent == NULL) {
368 if (q && q->flags&TCQ_F_INGRESS) {
369 *old = dev_graft_qdisc(dev, q);
370 } else {
371 *old = dev_graft_qdisc(dev, new);
372 }
373 } else {
374 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
375
376 err = -EINVAL;
377
378 if (cops) {
379 unsigned long cl = cops->get(parent, classid);
380 if (cl) {
381 err = cops->graft(parent, cl, new, old);
382 if (new)
383 new->parent = classid;
384 cops->put(parent, cl);
385 }
386 }
387 }
388 return err;
389}
390
391/*
392 Allocate and initialize new qdisc.
393
394 Parameters are passed via opt.
395 */
396
397static struct Qdisc *
398qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{
400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch;
404 struct Qdisc_ops *ops;
405 int size;
406
407 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD
409 if (ops == NULL && kind != NULL) {
410 char name[IFNAMSIZ];
411 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
412 /* We dropped the RTNL semaphore in order to
413 * perform the module load. So, even if we
414 * succeeded in loading the module we have to
415 * tell the caller to replay the request. We
416 * indicate this using -EAGAIN.
417 * We replay the request because the device may
418 * go away in the mean time.
419 */
420 rtnl_unlock();
421 request_module("sch_%s", name);
422 rtnl_lock();
423 ops = qdisc_lookup_ops(kind);
424 if (ops != NULL) {
425 /* We will try again qdisc_lookup_ops,
426 * so don't keep a reference.
427 */
428 module_put(ops->owner);
429 err = -EAGAIN;
430 goto err_out;
431 }
432 }
433 }
434#endif
435
436 err = -EINVAL;
437 if (ops == NULL)
438 goto err_out;
439
440 /* ensure that the Qdisc and the private data are 32-byte aligned */
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
442 size += ops->priv_size + QDISC_ALIGN_CONST;
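	/* Editor's note: assuming QDISC_ALIGN_CONST is the alignment minus one
	 * (31 for 32-byte alignment), (x + 31) & ~31 rounds x up to the next
	 * multiple of 32, and the extra QDISC_ALIGN_CONST slack bytes leave
	 * room to re-align the kmalloc()ed pointer by hand below. */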
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2;
448 memset(p, 0, size);
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455
456 if (handle == TC_H_INGRESS)
457 sch->flags |= TCQ_F_INGRESS;
458
459 sch->ops = ops;
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM;
469 if (handle == 0)
470 goto err_out3;
471 }
472
473 if (handle == TC_H_INGRESS)
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
479 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev);
482
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch;
489 }
490err_out3:
491 dev_put(dev);
492err_out2:
493 module_put(ops->owner);
494err_out:
495 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL;
499}
500
501static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
502{
503 if (tca[TCA_OPTIONS-1]) {
504 int err;
505
506 if (sch->ops->change == NULL)
507 return -EINVAL;
508 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
509 if (err)
510 return err;
511 }
512#ifdef CONFIG_NET_ESTIMATOR
513 if (tca[TCA_RATE-1])
514 gen_replace_estimator(&sch->bstats, &sch->rate_est,
515 sch->stats_lock, tca[TCA_RATE-1]);
516#endif
517 return 0;
518}
519
520struct check_loop_arg
521{
522 struct qdisc_walker w;
523 struct Qdisc *p;
524 int depth;
525};
526
527static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
528
529static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
530{
531 struct check_loop_arg arg;
532
533 if (q->ops->cl_ops == NULL)
534 return 0;
535
536 arg.w.stop = arg.w.skip = arg.w.count = 0;
537 arg.w.fn = check_loop_fn;
538 arg.depth = depth;
539 arg.p = p;
540 q->ops->cl_ops->walk(q, &arg.w);
541 return arg.w.stop ? -ELOOP : 0;
542}
543
544static int
545check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
546{
547 struct Qdisc *leaf;
548 struct Qdisc_class_ops *cops = q->ops->cl_ops;
549 struct check_loop_arg *arg = (struct check_loop_arg *)w;
550
551 leaf = cops->leaf(q, cl);
552 if (leaf) {
553 if (leaf == arg->p || arg->depth > 7)
554 return -ELOOP;
555 return check_loop(leaf, arg->p, arg->depth + 1);
556 }
557 return 0;
558}
559
560/*
561 * Delete/get qdisc.
562 */
563
564static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
565{
566 struct tcmsg *tcm = NLMSG_DATA(n);
567 struct rtattr **tca = arg;
568 struct net_device *dev;
569 u32 clid = tcm->tcm_parent;
570 struct Qdisc *q = NULL;
571 struct Qdisc *p = NULL;
572 int err;
573
574 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
575 return -ENODEV;
576
577 if (clid) {
578 if (clid != TC_H_ROOT) {
579 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
580 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
581 return -ENOENT;
582 q = qdisc_leaf(p, clid);
583 } else { /* ingress */
584 q = dev->qdisc_ingress;
585 }
586 } else {
587 q = dev->qdisc_sleeping;
588 }
589 if (!q)
590 return -ENOENT;
591
592 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
593 return -EINVAL;
594 } else {
595 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
596 return -ENOENT;
597 }
598
599 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
600 return -EINVAL;
601
602 if (n->nlmsg_type == RTM_DELQDISC) {
603 if (!clid)
604 return -EINVAL;
605 if (q->handle == 0)
606 return -ENOENT;
607 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
608 return err;
609 if (q) {
610 qdisc_notify(skb, n, clid, q, NULL);
611 spin_lock_bh(&dev->queue_lock);
612 qdisc_destroy(q);
613 spin_unlock_bh(&dev->queue_lock);
614 }
615 } else {
616 qdisc_notify(skb, n, clid, NULL, q);
617 }
618 return 0;
619}
620
621/*
622 Create/change qdisc.
623 */
624
625static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
626{
627 struct tcmsg *tcm;
628 struct rtattr **tca;
629 struct net_device *dev;
630 u32 clid;
631 struct Qdisc *q, *p;
632 int err;
633
634replay:
635 /* Reinit, just in case something touches this. */
636 tcm = NLMSG_DATA(n);
637 tca = arg;
638 clid = tcm->tcm_parent;
639 q = p = NULL;
640
641 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
642 return -ENODEV;
643
644 if (clid) {
645 if (clid != TC_H_ROOT) {
646 if (clid != TC_H_INGRESS) {
647 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
648 return -ENOENT;
649 q = qdisc_leaf(p, clid);
650 } else { /*ingress */
651 q = dev->qdisc_ingress;
652 }
653 } else {
654 q = dev->qdisc_sleeping;
655 }
656
657 /* It may be the default qdisc; ignore it */
658 if (q && q->handle == 0)
659 q = NULL;
660
661 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
662 if (tcm->tcm_handle) {
663 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
664 return -EEXIST;
665 if (TC_H_MIN(tcm->tcm_handle))
666 return -EINVAL;
667 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
668 goto create_n_graft;
669 if (n->nlmsg_flags&NLM_F_EXCL)
670 return -EEXIST;
671 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
672 return -EINVAL;
673 if (q == p ||
674 (p && check_loop(q, p, 0)))
675 return -ELOOP;
676 atomic_inc(&q->refcnt);
677 goto graft;
678 } else {
679 if (q == NULL)
680 goto create_n_graft;
681
682 /* This magic test requires explanation.
683 *
684 * We know that some child q is already
685 * attached to this parent and we have a choice:
686 * either to change it or to create/graft a new one.
687 *
688 * 1. We are allowed to create/graft only
689 * if the CREATE and REPLACE flags are set.
690 *
691 * 2. If EXCL is set, the requestor wanted to say
692 * that qdisc tcm_handle is not expected
693 * to exist, so we choose create/graft too.
694 *
695 * 3. The last case is when no flags are set.
696 * Alas, it is sort of a hole in the API; we
697 * cannot decide what to do unambiguously.
698 * For now we select create/graft if the
699 * user gave a KIND which does not match the existing one.
700 */
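				/*
				 * Editor's note (an assumption about the
				 * userspace tool, not verified against this
				 * tree): the iproute2 "tc" of roughly the same
				 * era sends NLM_F_CREATE|NLM_F_EXCL for
				 * "tc qdisc add", NLM_F_CREATE|NLM_F_REPLACE
				 * for "tc qdisc replace", and no flags for
				 * "tc qdisc change", which lines up with the
				 * three cases above.
				 */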
701 if ((n->nlmsg_flags&NLM_F_CREATE) &&
702 (n->nlmsg_flags&NLM_F_REPLACE) &&
703 ((n->nlmsg_flags&NLM_F_EXCL) ||
704 (tca[TCA_KIND-1] &&
705 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
706 goto create_n_graft;
707 }
708 }
709 } else {
710 if (!tcm->tcm_handle)
711 return -EINVAL;
712 q = qdisc_lookup(dev, tcm->tcm_handle);
713 }
714
715 /* Change qdisc parameters */
716 if (q == NULL)
717 return -ENOENT;
718 if (n->nlmsg_flags&NLM_F_EXCL)
719 return -EEXIST;
720 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
721 return -EINVAL;
722 err = qdisc_change(q, tca);
723 if (err == 0)
724 qdisc_notify(skb, n, clid, NULL, q);
725 return err;
726
727create_n_graft:
728 if (!(n->nlmsg_flags&NLM_F_CREATE))
729 return -ENOENT;
730 if (clid == TC_H_INGRESS)
731 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
732 else
733 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
734 if (q == NULL) {
735 if (err == -EAGAIN)
736 goto replay;
737 return err;
738 }
739
740graft:
741 if (1) {
742 struct Qdisc *old_q = NULL;
743 err = qdisc_graft(dev, p, clid, q, &old_q);
744 if (err) {
745 if (q) {
746 spin_lock_bh(&dev->queue_lock);
747 qdisc_destroy(q);
748 spin_unlock_bh(&dev->queue_lock);
749 }
750 return err;
751 }
752 qdisc_notify(skb, n, clid, old_q, q);
753 if (old_q) {
754 spin_lock_bh(&dev->queue_lock);
755 qdisc_destroy(old_q);
756 spin_unlock_bh(&dev->queue_lock);
757 }
758 }
759 return 0;
760}
761
762static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
763 u32 pid, u32 seq, unsigned flags, int event)
764{
765 struct tcmsg *tcm;
766 struct nlmsghdr *nlh;
767 unsigned char *b = skb->tail;
768 struct gnet_dump d;
769
770 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
771 nlh->nlmsg_flags = flags;
772 tcm = NLMSG_DATA(nlh);
773 tcm->tcm_family = AF_UNSPEC;
774 tcm->tcm_ifindex = q->dev->ifindex;
775 tcm->tcm_parent = clid;
776 tcm->tcm_handle = q->handle;
777 tcm->tcm_info = atomic_read(&q->refcnt);
778 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
779 if (q->ops->dump && q->ops->dump(q, skb) < 0)
780 goto rtattr_failure;
781 q->qstats.qlen = q->q.qlen;
782
783 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
784 TCA_XSTATS, q->stats_lock, &d) < 0)
785 goto rtattr_failure;
786
787 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
788 goto rtattr_failure;
789
790 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
791#ifdef CONFIG_NET_ESTIMATOR
792 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
793#endif
794 gnet_stats_copy_queue(&d, &q->qstats) < 0)
795 goto rtattr_failure;
796
797 if (gnet_stats_finish_copy(&d) < 0)
798 goto rtattr_failure;
799
800 nlh->nlmsg_len = skb->tail - b;
801 return skb->len;
802
803nlmsg_failure:
804rtattr_failure:
805 skb_trim(skb, b - skb->data);
806 return -1;
807}
808
809static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
810 u32 clid, struct Qdisc *old, struct Qdisc *new)
811{
812 struct sk_buff *skb;
813 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
814
815 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
816 if (!skb)
817 return -ENOBUFS;
818
819 if (old && old->handle) {
820 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
821 goto err_out;
822 }
823 if (new) {
824 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
825 goto err_out;
826 }
827
828 if (skb->len)
829 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
830
831err_out:
832 kfree_skb(skb);
833 return -EINVAL;
834}
835
836static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
837{
838 int idx, q_idx;
839 int s_idx, s_q_idx;
840 struct net_device *dev;
841 struct Qdisc *q;
842
843 s_idx = cb->args[0];
844 s_q_idx = q_idx = cb->args[1];
845 read_lock(&dev_base_lock);
846 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
847 if (idx < s_idx)
848 continue;
849 if (idx > s_idx)
850 s_q_idx = 0;
851 read_lock_bh(&qdisc_tree_lock);
852 q_idx = 0;
853 list_for_each_entry(q, &dev->qdisc_list, list) {
854 if (q_idx < s_q_idx) {
855 q_idx++;
856 continue;
857 }
858 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
859 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
860 read_unlock_bh(&qdisc_tree_lock);
861 goto done;
862 }
863 q_idx++;
864 }
865 read_unlock_bh(&qdisc_tree_lock);
866 }
867
868done:
869 read_unlock(&dev_base_lock);
870
871 cb->args[0] = idx;
872 cb->args[1] = q_idx;
873
874 return skb->len;
875}
876
877
878
879/************************************************
880 * Traffic classes manipulation. *
881 ************************************************/
882
883
884
885static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
886{
887 struct tcmsg *tcm = NLMSG_DATA(n);
888 struct rtattr **tca = arg;
889 struct net_device *dev;
890 struct Qdisc *q = NULL;
891 struct Qdisc_class_ops *cops;
892 unsigned long cl = 0;
893 unsigned long new_cl;
894 u32 pid = tcm->tcm_parent;
895 u32 clid = tcm->tcm_handle;
896 u32 qid = TC_H_MAJ(clid);
897 int err;
898
899 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
900 return -ENODEV;
901
902 /*
903 parent == TC_H_UNSPEC - unspecified parent.
904 parent == TC_H_ROOT - class is root, which has no parent.
905 parent == X:0 - parent is root class.
906 parent == X:Y - parent is a node in hierarchy.
907 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
908
909 handle == 0:0 - generate handle from kernel pool.
910 handle == 0:Y - class is X:Y, where X:0 is qdisc.
911 handle == X:Y - clear.
912 handle == X:0 - root class.
913 */
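	/*
	 * Editor's note (not in the original patch): a worked example of the
	 * handle layout used above.  Handles are 32-bit "major:minor" values,
	 * e.g. major 0x0001, minor 0x0010 ("1:10" in tc's hexadecimal notation):
	 *
	 *	qid  = TC_H_MAKE(0x10000, 0)  = 0x00010000   (qdisc 1:0)
	 *	clid = TC_H_MAKE(qid, 0x10)   = 0x00010010   (class 1:10)
	 *	TC_H_MAJ(clid) = 0x00010000,  TC_H_MIN(clid) = 0x0010
	 *
	 * so a "0:Y" parent or handle simply has its major half filled in
	 * from qid, which is exactly what the steps below do.
	 */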
914
915 /* Step 1. Determine qdisc handle X:0 */
916
917 if (pid != TC_H_ROOT) {
918 u32 qid1 = TC_H_MAJ(pid);
919
920 if (qid && qid1) {
921 /* If both majors are known, they must be identical. */
922 if (qid != qid1)
923 return -EINVAL;
924 } else if (qid1) {
925 qid = qid1;
926 } else if (qid == 0)
927 qid = dev->qdisc_sleeping->handle;
928
929 /* Now qid is a genuine qdisc handle consistent
930 with both parent and child.
931
932 TC_H_MAJ(pid) may still be unspecified; complete it now.
933 */
934 if (pid)
935 pid = TC_H_MAKE(qid, pid);
936 } else {
937 if (qid == 0)
938 qid = dev->qdisc_sleeping->handle;
939 }
940
941 /* OK. Locate qdisc */
942 if ((q = qdisc_lookup(dev, qid)) == NULL)
943 return -ENOENT;
944
945 /* And check that it supports classes */
946 cops = q->ops->cl_ops;
947 if (cops == NULL)
948 return -EINVAL;
949
950 /* Now try to get class */
951 if (clid == 0) {
952 if (pid == TC_H_ROOT)
953 clid = qid;
954 } else
955 clid = TC_H_MAKE(qid, clid);
956
957 if (clid)
958 cl = cops->get(q, clid);
959
960 if (cl == 0) {
961 err = -ENOENT;
962 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
963 goto out;
964 } else {
965 switch (n->nlmsg_type) {
966 case RTM_NEWTCLASS:
967 err = -EEXIST;
968 if (n->nlmsg_flags&NLM_F_EXCL)
969 goto out;
970 break;
971 case RTM_DELTCLASS:
972 err = cops->delete(q, cl);
973 if (err == 0)
974 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
975 goto out;
976 case RTM_GETTCLASS:
977 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
978 goto out;
979 default:
980 err = -EINVAL;
981 goto out;
982 }
983 }
984
985 new_cl = cl;
986 err = cops->change(q, clid, pid, tca, &new_cl);
987 if (err == 0)
988 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
989
990out:
991 if (cl)
992 cops->put(q, cl);
993
994 return err;
995}
996
997
998static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
999 unsigned long cl,
1000 u32 pid, u32 seq, unsigned flags, int event)
1001{
1002 struct tcmsg *tcm;
1003 struct nlmsghdr *nlh;
1004 unsigned char *b = skb->tail;
1005 struct gnet_dump d;
1006 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1007
1008 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
1009 nlh->nlmsg_flags = flags;
1010 tcm = NLMSG_DATA(nlh);
1011 tcm->tcm_family = AF_UNSPEC;
1012 tcm->tcm_ifindex = q->dev->ifindex;
1013 tcm->tcm_parent = q->handle;
1014 tcm->tcm_handle = q->handle;
1015 tcm->tcm_info = 0;
1016 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1017 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1018 goto rtattr_failure;
1019
1020 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1021 TCA_XSTATS, q->stats_lock, &d) < 0)
1022 goto rtattr_failure;
1023
1024 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1025 goto rtattr_failure;
1026
1027 if (gnet_stats_finish_copy(&d) < 0)
1028 goto rtattr_failure;
1029
1030 nlh->nlmsg_len = skb->tail - b;
1031 return skb->len;
1032
1033nlmsg_failure:
1034rtattr_failure:
1035 skb_trim(skb, b - skb->data);
1036 return -1;
1037}
1038
1039static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1040 struct Qdisc *q, unsigned long cl, int event)
1041{
1042 struct sk_buff *skb;
1043 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1044
1045 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1046 if (!skb)
1047 return -ENOBUFS;
1048
1049 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1050 kfree_skb(skb);
1051 return -EINVAL;
1052 }
1053
1054 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1055}
1056
1057struct qdisc_dump_args
1058{
1059 struct qdisc_walker w;
1060 struct sk_buff *skb;
1061 struct netlink_callback *cb;
1062};
1063
1064static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1065{
1066 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1067
1068 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1069 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1070}
1071
1072static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1073{
1074 int t;
1075 int s_t;
1076 struct net_device *dev;
1077 struct Qdisc *q;
1078 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1079 struct qdisc_dump_args arg;
1080
1081 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1082 return 0;
1083 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1084 return 0;
1085
1086 s_t = cb->args[0];
1087 t = 0;
1088
1089 read_lock_bh(&qdisc_tree_lock);
1090 list_for_each_entry(q, &dev->qdisc_list, list) {
1091 if (t < s_t || !q->ops->cl_ops ||
1092 (tcm->tcm_parent &&
1093 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1094 t++;
1095 continue;
1096 }
1097 if (t > s_t)
1098 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1099 arg.w.fn = qdisc_class_dump;
1100 arg.skb = skb;
1101 arg.cb = cb;
1102 arg.w.stop = 0;
1103 arg.w.skip = cb->args[1];
1104 arg.w.count = 0;
1105 q->ops->cl_ops->walk(q, &arg.w);
1106 cb->args[1] = arg.w.count;
1107 if (arg.w.stop)
1108 break;
1109 t++;
1110 }
1111 read_unlock_bh(&qdisc_tree_lock);
1112
1113 cb->args[0] = t;
1114
1115 dev_put(dev);
1116 return skb->len;
1117}
1118
1119/* Main classifier routine: scans the classifier chain attached
1120 to this qdisc, (optionally) tests for the protocol and asks the
1121 specific classifiers.
1122 */
1123int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1124 struct tcf_result *res)
1125{
1126 int err = 0;
1127 u32 protocol = skb->protocol;
1128#ifdef CONFIG_NET_CLS_ACT
1129 struct tcf_proto *otp = tp;
1130reclassify:
1131#endif
1132 protocol = skb->protocol;
1133
1134 for ( ; tp; tp = tp->next) {
1135 if ((tp->protocol == protocol ||
1136 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1137 (err = tp->classify(skb, tp, res)) >= 0) {
1138#ifdef CONFIG_NET_CLS_ACT
1139 if ( TC_ACT_RECLASSIFY == err) {
1140 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1141 tp = otp;
1142
1143 if (MAX_REC_LOOP < verd++) {
1144 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1145 tp->prio&0xffff, ntohs(tp->protocol));
1146 return TC_ACT_SHOT;
1147 }
1148 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1149 goto reclassify;
1150 } else {
1151 if (skb->tc_verd)
1152 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1153 return err;
1154 }
1155#else
1156
1157 return err;
1158#endif
1159 }
1160
1161 }
1162 return -1;
1163}
1164
1165static int psched_us_per_tick = 1;
1166static int psched_tick_per_us = 1;
1167
1168#ifdef CONFIG_PROC_FS
1169static int psched_show(struct seq_file *seq, void *v)
1170{
1171 seq_printf(seq, "%08x %08x %08x %08x\n",
1172 psched_tick_per_us, psched_us_per_tick,
1173 1000000, HZ);
1174
1175 return 0;
1176}
1177
1178static int psched_open(struct inode *inode, struct file *file)
1179{
1180 return single_open(file, psched_show, PDE(inode)->data);
1181}
1182
1183static struct file_operations psched_fops = {
1184 .owner = THIS_MODULE,
1185 .open = psched_open,
1186 .read = seq_read,
1187 .llseek = seq_lseek,
1188 .release = single_release,
1189};
1190#endif
1191
1192#ifdef CONFIG_NET_SCH_CLK_CPU
1193psched_tdiff_t psched_clock_per_hz;
1194int psched_clock_scale;
1195EXPORT_SYMBOL(psched_clock_per_hz);
1196EXPORT_SYMBOL(psched_clock_scale);
1197
1198psched_time_t psched_time_base;
1199cycles_t psched_time_mark;
1200EXPORT_SYMBOL(psched_time_mark);
1201EXPORT_SYMBOL(psched_time_base);
1202
1203/*
1204 * Periodically adjust psched_time_base to avoid overflow with 32-bit
1205 * get_cycles(): 2^32 cycles at 4GHz wrap in about 1.07s, and this timer re-arms every second.
1206 */
1207static void psched_tick(unsigned long);
1208static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);
1209
1210static void psched_tick(unsigned long dummy)
1211{
1212 if (sizeof(cycles_t) == sizeof(u32)) {
1213 psched_time_t dummy_stamp;
1214 PSCHED_GET_TIME(dummy_stamp);
1215 psched_timer.expires = jiffies + 1*HZ;
1216 add_timer(&psched_timer);
1217 }
1218}
1219
1220int __init psched_calibrate_clock(void)
1221{
1222 psched_time_t stamp, stamp1;
1223 struct timeval tv, tv1;
1224 psched_tdiff_t delay;
1225 long rdelay;
1226 unsigned long stop;
1227
1228 psched_tick(0);
1229 stop = jiffies + HZ/10;
1230 PSCHED_GET_TIME(stamp);
1231 do_gettimeofday(&tv);
1232 while (time_before(jiffies, stop)) {
1233 barrier();
1234 cpu_relax();
1235 }
1236 PSCHED_GET_TIME(stamp1);
1237 do_gettimeofday(&tv1);
1238
1239 delay = PSCHED_TDIFF(stamp1, stamp);
1240 rdelay = tv1.tv_usec - tv.tv_usec;
1241 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1242 if (rdelay > delay)
1243 return -1;
1244 delay /= rdelay;
1245 psched_tick_per_us = delay;
1246 while ((delay>>=1) != 0)
1247 psched_clock_scale++;
1248 psched_us_per_tick = 1<<psched_clock_scale;
1249 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1250 return 0;
1251}
1252#endif
1253
1254static int __init pktsched_init(void)
1255{
1256 struct rtnetlink_link *link_p;
1257
1258#ifdef CONFIG_NET_SCH_CLK_CPU
1259 if (psched_calibrate_clock() < 0)
1260 return -1;
1261#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1262 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1263 psched_us_per_tick = 1000000;
1264#endif
1265
1266 link_p = rtnetlink_links[PF_UNSPEC];
1267
1268 /* Set up the rtnetlink links. It is done here to avoid
1269 exporting a large number of public symbols.
1270 */
1271
1272 if (link_p) {
1273 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1274 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1275 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1276 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1277 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1278 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1279 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1280 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1281 }
1282
1283 register_qdisc(&pfifo_qdisc_ops);
1284 register_qdisc(&bfifo_qdisc_ops);
1285 proc_net_fops_create("psched", 0, &psched_fops);
1286
1287 return 0;
1288}
1289
1290subsys_initcall(pktsched_init);
1291
1292EXPORT_SYMBOL(qdisc_get_rtab);
1293EXPORT_SYMBOL(qdisc_put_rtab);
1294EXPORT_SYMBOL(register_qdisc);
1295EXPORT_SYMBOL(unregister_qdisc);
1296EXPORT_SYMBOL(tc_classify);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
new file mode 100644
index 000000000000..93ebce40acac
--- /dev/null
+++ b/net/sched/sch_atm.c
@@ -0,0 +1,735 @@
1/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
2
3/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
4
5
6#include <linux/config.h>
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/string.h>
10#include <linux/errno.h>
11#include <linux/skbuff.h>
12#include <linux/interrupt.h>
13#include <linux/atmdev.h>
14#include <linux/atmclip.h>
15#include <linux/netdevice.h>
16#include <linux/rtnetlink.h>
17#include <linux/file.h> /* for fput */
18#include <net/pkt_sched.h>
19#include <net/sock.h>
20
21
22extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */
23
24#if 0 /* control */
25#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
26#else
27#define DPRINTK(format,args...)
28#endif
29
30#if 0 /* data */
31#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
32#else
33#define D2PRINTK(format,args...)
34#endif
35
36
37/*
38 * The ATM queuing discipline provides a framework for invoking classifiers
39 * (aka "filters"), which in turn select classes of this queuing discipline.
40 * Each class maps the flow(s) it is handling to a given VC. Multiple classes
41 * may share the same VC.
42 *
43 * When creating a class, VCs are specified by passing the number of the open
44 * socket descriptor by which the calling process references the VC. The kernel
45 * keeps the VC open at least until all classes using it are removed.
46 *
47 * In this file, most functions are named atm_tc_* to avoid confusion with all
48 * the atm_* in net/atm. This naming convention differs from what's used in the
49 * rest of net/sched.
50 *
51 * Known bugs:
52 * - sometimes messes up the IP stack
53 * - any manipulations besides the few operations described in the README are
54 * untested and likely to crash the system
55 * - should lock the flow while there is data in the queue (?)
56 */
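/*
 * Editor's summary (derived from atm_tc_change() below, not part of the
 * original patch): a class-creation request carries TCA_ATM_FD (the open
 * socket descriptor of the VC, mandatory), optionally TCA_ATM_HDR (a raw
 * link-level header to prepend, defaulting to the LLC/SNAP encapsulation
 * for IP) and optionally TCA_ATM_EXCESS (the classid of a flow to receive
 * excess traffic; if absent, excess traffic stays on the same VC with the
 * CLP bit set).
 */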
57
58
59#define PRIV(sch) qdisc_priv(sch)
60#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
61
62
63struct atm_flow_data {
64 struct Qdisc *q; /* FIFO, TBF, etc. */
65 struct tcf_proto *filter_list;
66 struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
67 void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */
68 struct atm_qdisc_data *parent; /* parent qdisc */
69 struct socket *sock; /* for closing */
70 u32 classid; /* x:y type ID */
71 int ref; /* reference count */
72 struct gnet_stats_basic bstats;
73 struct gnet_stats_queue qstats;
74 spinlock_t *stats_lock;
75 struct atm_flow_data *next;
76 struct atm_flow_data *excess; /* flow for excess traffic;
77 NULL to set CLP instead */
78 int hdr_len;
79 unsigned char hdr[0]; /* header data; MUST BE LAST */
80};
81
82struct atm_qdisc_data {
83 struct atm_flow_data link; /* unclassified skbs go here */
84 struct atm_flow_data *flows; /* NB: "link" is also on this
85 list */
86 struct tasklet_struct task; /* requeue tasklet */
87};
88
89
90/* ------------------------- Class/flow operations ------------------------- */
91
92
93static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow)
94{
95 struct atm_flow_data *walk;
96
97 DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow);
98 for (walk = qdisc->flows; walk; walk = walk->next)
99 if (walk == flow) return 1;
100 DPRINTK("find_flow: not found\n");
101 return 0;
102}
103
104
105static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch,
106 u32 classid)
107{
108 struct atm_qdisc_data *p = PRIV(sch);
109 struct atm_flow_data *flow;
110
111 for (flow = p->flows; flow; flow = flow->next)
112 if (flow->classid == classid) break;
113 return flow;
114}
115
116
117static int atm_tc_graft(struct Qdisc *sch,unsigned long arg,
118 struct Qdisc *new,struct Qdisc **old)
119{
120 struct atm_qdisc_data *p = PRIV(sch);
121 struct atm_flow_data *flow = (struct atm_flow_data *) arg;
122
123 DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch,
124 p,flow,new,old);
125 if (!find_flow(p,flow)) return -EINVAL;
126 if (!new) new = &noop_qdisc;
127 *old = xchg(&flow->q,new);
128 if (*old) qdisc_reset(*old);
129 return 0;
130}
131
132
133static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl)
134{
135 struct atm_flow_data *flow = (struct atm_flow_data *) cl;
136
137 DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow);
138 return flow ? flow->q : NULL;
139}
140
141
142static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid)
143{
144 struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch);
145 struct atm_flow_data *flow;
146
147 DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid);
148 flow = lookup_flow(sch,classid);
149 if (flow) flow->ref++;
150 DPRINTK("atm_tc_get: flow %p\n",flow);
151 return (unsigned long) flow;
152}
153
154
155static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
156 unsigned long parent, u32 classid)
157{
158 return atm_tc_get(sch,classid);
159}
160
161
162static void destroy_filters(struct atm_flow_data *flow)
163{
164 struct tcf_proto *filter;
165
166 while ((filter = flow->filter_list)) {
167 DPRINTK("destroy_filters: destroying filter %p\n",filter);
168 flow->filter_list = filter->next;
169 tcf_destroy(filter);
170 }
171}
172
173
174/*
175 * atm_tc_put handles all destructions, including the ones that are explicitly
176 * requested (atm_tc_destroy, etc.). The assumption here is that we never drop
177 * anything that still seems to be in use.
178 */
179
180static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
181{
182 struct atm_qdisc_data *p = PRIV(sch);
183 struct atm_flow_data *flow = (struct atm_flow_data *) cl;
184 struct atm_flow_data **prev;
185
186 DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
187 if (--flow->ref) return;
188 DPRINTK("atm_tc_put: destroying\n");
189 for (prev = &p->flows; *prev; prev = &(*prev)->next)
190 if (*prev == flow) break;
191 if (!*prev) {
192 printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow);
193 return;
194 }
195 *prev = flow->next;
196 DPRINTK("atm_tc_put: qdisc %p\n",flow->q);
197 qdisc_destroy(flow->q);
198 destroy_filters(flow);
199 if (flow->sock) {
200 DPRINTK("atm_tc_put: f_count %d\n",
201 file_count(flow->sock->file));
202 flow->vcc->pop = flow->old_pop;
203 sockfd_put(flow->sock);
204 }
205 if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess);
206 if (flow != &p->link) kfree(flow);
207 /*
208 * If flow == &p->link, the qdisc no longer works at this point and
209 * needs to be removed. (By the caller of atm_tc_put.)
210 */
211}
212
213
214static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb)
215{
216 struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
217
218 D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p);
219 VCC2FLOW(vcc)->old_pop(vcc,skb);
220 tasklet_schedule(&p->task);
221}
222
223static const u8 llc_oui_ip[] = {
224 0xaa, /* DSAP: non-ISO */
225 0xaa, /* SSAP: non-ISO */
226 0x03, /* Ctrl: Unnumbered Information Command PDU */
227 0x00, /* OUI: EtherType */
228 0x00, 0x00,
229 0x08, 0x00 }; /* Ethertype IP (0800) */
230
231static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
232 struct rtattr **tca, unsigned long *arg)
233{
234 struct atm_qdisc_data *p = PRIV(sch);
235 struct atm_flow_data *flow = (struct atm_flow_data *) *arg;
236 struct atm_flow_data *excess = NULL;
237 struct rtattr *opt = tca[TCA_OPTIONS-1];
238 struct rtattr *tb[TCA_ATM_MAX];
239 struct socket *sock;
240 int fd,error,hdr_len;
241 void *hdr;
242
243 DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
244 "flow %p,opt %p)\n",sch,p,classid,parent,flow,opt);
245 /*
246 * The concept of parents doesn't apply for this qdisc.
247 */
248 if (parent && parent != TC_H_ROOT && parent != sch->handle)
249 return -EINVAL;
250 /*
251 * ATM classes cannot be changed. In order to change properties of the
252 * ATM connection, that socket needs to be modified directly (via the
253 * native ATM API. In order to send a flow to a different VC, the old
254 * class needs to be removed and a new one added. (This may be changed
255 * later.)
256 */
257 if (flow) return -EBUSY;
258 if (opt == NULL || rtattr_parse_nested(tb, TCA_ATM_MAX, opt))
259 return -EINVAL;
260 if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd))
261 return -EINVAL;
262 fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]);
263 DPRINTK("atm_tc_change: fd %d\n",fd);
264 if (tb[TCA_ATM_HDR-1]) {
265 hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]);
266 hdr = RTA_DATA(tb[TCA_ATM_HDR-1]);
267 }
268 else {
269 hdr_len = RFC1483LLC_LEN;
270 hdr = NULL; /* default LLC/SNAP for IP */
271 }
272 if (!tb[TCA_ATM_EXCESS-1]) excess = NULL;
273 else {
274 if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32))
275 return -EINVAL;
276 excess = (struct atm_flow_data *) atm_tc_get(sch,
277 *(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1]));
278 if (!excess) return -ENOENT;
279 }
280 DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n",
281 opt->rta_type,RTA_PAYLOAD(opt),hdr_len);
282 if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */
283 DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file));
284 if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
285 error = -EPROTOTYPE;
286 goto err_out;
287 }
288 /* @@@ should check if the socket is really operational or we'll crash
289 on vcc->send */
290 if (classid) {
291 if (TC_H_MAJ(classid ^ sch->handle)) {
292 DPRINTK("atm_tc_change: classid mismatch\n");
293 error = -EINVAL;
294 goto err_out;
295 }
296 if (find_flow(p,flow)) {
297 error = -EEXIST;
298 goto err_out;
299 }
300 }
301 else {
302 int i;
303 unsigned long cl;
304
305 for (i = 1; i < 0x8000; i++) {
306 classid = TC_H_MAKE(sch->handle,0x8000 | i);
307 if (!(cl = atm_tc_get(sch,classid))) break;
308 atm_tc_put(sch,cl);
309 }
310 }
311 DPRINTK("atm_tc_change: new id %x\n",classid);
312 flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL);
313 DPRINTK("atm_tc_change: flow %p\n",flow);
314 if (!flow) {
315 error = -ENOBUFS;
316 goto err_out;
317 }
318 memset(flow,0,sizeof(*flow));
319 flow->filter_list = NULL;
320 if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops)))
321 flow->q = &noop_qdisc;
322 DPRINTK("atm_tc_change: qdisc %p\n",flow->q);
323 flow->sock = sock;
324 flow->vcc = ATM_SD(sock); /* speedup */
325 flow->vcc->user_back = flow;
326 DPRINTK("atm_tc_change: vcc %p\n",flow->vcc);
327 flow->old_pop = flow->vcc->pop;
328 flow->parent = p;
329 flow->vcc->pop = sch_atm_pop;
330 flow->classid = classid;
331 flow->ref = 1;
332 flow->excess = excess;
333 flow->next = p->link.next;
334 p->link.next = flow;
335 flow->hdr_len = hdr_len;
336 if (hdr)
337 memcpy(flow->hdr,hdr,hdr_len);
338 else
339 memcpy(flow->hdr,llc_oui_ip,sizeof(llc_oui_ip));
340 *arg = (unsigned long) flow;
341 return 0;
342err_out:
343 if (excess) atm_tc_put(sch,(unsigned long) excess);
344 sockfd_put(sock);
345 return error;
346}
347
348
349static int atm_tc_delete(struct Qdisc *sch,unsigned long arg)
350{
351 struct atm_qdisc_data *p = PRIV(sch);
352 struct atm_flow_data *flow = (struct atm_flow_data *) arg;
353
354 DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
355 if (!find_flow(PRIV(sch),flow)) return -EINVAL;
356 if (flow->filter_list || flow == &p->link) return -EBUSY;
357 /*
358 * Reference count must be 2: one for "keepalive" (set at class
359 * creation), and one for the reference held when calling delete.
360 */
361 if (flow->ref < 2) {
362 printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref);
363 return -EINVAL;
364 }
365 if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/
366 atm_tc_put(sch,arg);
367 return 0;
368}
369
370
371static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker)
372{
373 struct atm_qdisc_data *p = PRIV(sch);
374 struct atm_flow_data *flow;
375
376 DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker);
377 if (walker->stop) return;
378 for (flow = p->flows; flow; flow = flow->next) {
379 if (walker->count >= walker->skip)
380 if (walker->fn(sch,(unsigned long) flow,walker) < 0) {
381 walker->stop = 1;
382 break;
383 }
384 walker->count++;
385 }
386}
387
388
389static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl)
390{
391 struct atm_qdisc_data *p = PRIV(sch);
392 struct atm_flow_data *flow = (struct atm_flow_data *) cl;
393
394 DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow);
395 return flow ? &flow->filter_list : &p->link.filter_list;
396}
397
398
399/* --------------------------- Qdisc operations ---------------------------- */
400
401
402static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch)
403{
404 struct atm_qdisc_data *p = PRIV(sch);
405 struct atm_flow_data *flow = NULL ; /* @@@ */
406 struct tcf_result res;
407 int result;
408 int ret = NET_XMIT_POLICED;
409
410 D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
411 result = TC_POLICE_OK; /* be nice to gcc */
412 if (TC_H_MAJ(skb->priority) != sch->handle ||
413 !(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority)))
414 for (flow = p->flows; flow; flow = flow->next)
415 if (flow->filter_list) {
416 result = tc_classify(skb,flow->filter_list,
417 &res);
418 if (result < 0) continue;
419 flow = (struct atm_flow_data *) res.class;
420 if (!flow) flow = lookup_flow(sch,res.classid);
421 break;
422 }
423 if (!flow) flow = &p->link;
424 else {
425 if (flow->vcc)
426 ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
427 /*@@@ looks good ... but it's not supposed to work :-)*/
428#ifdef CONFIG_NET_CLS_POLICE
429 switch (result) {
430 case TC_POLICE_SHOT:
431 kfree_skb(skb);
432 break;
433 case TC_POLICE_RECLASSIFY:
434 if (flow->excess) flow = flow->excess;
435 else {
436 ATM_SKB(skb)->atm_options |=
437 ATM_ATMOPT_CLP;
438 break;
439 }
440 /* fall through */
441 case TC_POLICE_OK:
442 /* fall through */
443 default:
444 break;
445 }
446#endif
447 }
448 if (
449#ifdef CONFIG_NET_CLS_POLICE
450 result == TC_POLICE_SHOT ||
451#endif
452 (ret = flow->q->enqueue(skb,flow->q)) != 0) {
453 sch->qstats.drops++;
454 if (flow) flow->qstats.drops++;
455 return ret;
456 }
457 sch->bstats.bytes += skb->len;
458 sch->bstats.packets++;
459 flow->bstats.bytes += skb->len;
460 flow->bstats.packets++;
461 /*
462 * Okay, this may seem weird. We pretend we've dropped the packet if
463 * it goes via ATM. The reason for this is that the outer qdisc
464 * expects to be able to q->dequeue the packet later on if we return
465 * success at this place. Also, sch->q.qlen needs to reflect whether
466 * there is a packet eligible for dequeuing or not. Note that the
467 * statistics of the outer qdisc are necessarily wrong because of all
468 * this. There's currently no correct solution for this.
469 */
470 if (flow == &p->link) {
471 sch->q.qlen++;
472 return 0;
473 }
474 tasklet_schedule(&p->task);
475 return NET_XMIT_BYPASS;
476}
477
478
479/*
480 * Dequeue packets and send them over ATM. Note that we quite deliberately
481 * avoid checking net_device's flow control here, simply because sch_atm
482 * uses its own channels, which have nothing to do with any CLIP/LANE/or
483 * non-ATM interfaces.
484 */
485
486
487static void sch_atm_dequeue(unsigned long data)
488{
489 struct Qdisc *sch = (struct Qdisc *) data;
490 struct atm_qdisc_data *p = PRIV(sch);
491 struct atm_flow_data *flow;
492 struct sk_buff *skb;
493
494 D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p);
495 for (flow = p->link.next; flow; flow = flow->next)
496 /*
497 * If traffic is properly shaped, this won't generate nasty
498 * little bursts. Otherwise, it may ... (but that's okay)
499 */
500 while ((skb = flow->q->dequeue(flow->q))) {
501 if (!atm_may_send(flow->vcc,skb->truesize)) {
502 (void) flow->q->ops->requeue(skb,flow->q);
503 break;
504 }
505 D2PRINTK("atm_tc_dequeue: sending on class %p\n",flow);
506 /* remove any LL header somebody else has attached */
507 skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data);
508 if (skb_headroom(skb) < flow->hdr_len) {
509 struct sk_buff *new;
510
511 new = skb_realloc_headroom(skb,flow->hdr_len);
512 dev_kfree_skb(skb);
513 if (!new) continue;
514 skb = new;
515 }
516 D2PRINTK("sch_atm_dequeue: ip %p, data %p\n",
517 skb->nh.iph,skb->data);
518 ATM_SKB(skb)->vcc = flow->vcc;
519 memcpy(skb_push(skb,flow->hdr_len),flow->hdr,
520 flow->hdr_len);
521 atomic_add(skb->truesize,
522 &sk_atm(flow->vcc)->sk_wmem_alloc);
523 /* atm.atm_options are already set by atm_tc_enqueue */
524 (void) flow->vcc->send(flow->vcc,skb);
525 }
526}
527
528
529static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
530{
531 struct atm_qdisc_data *p = PRIV(sch);
532 struct sk_buff *skb;
533
534 D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p);
535 tasklet_schedule(&p->task);
536 skb = p->link.q->dequeue(p->link.q);
537 if (skb) sch->q.qlen--;
538 return skb;
539}
540
541
542static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch)
543{
544 struct atm_qdisc_data *p = PRIV(sch);
545 int ret;
546
547 D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
548 ret = p->link.q->ops->requeue(skb,p->link.q);
549 if (!ret) {
550 sch->q.qlen++;
551 sch->qstats.requeues++;
552 } else {
553 sch->qstats.drops++;
554 p->link.qstats.drops++;
555 }
556 return ret;
557}
558
559
560static unsigned int atm_tc_drop(struct Qdisc *sch)
561{
562 struct atm_qdisc_data *p = PRIV(sch);
563 struct atm_flow_data *flow;
564 unsigned int len;
565
566 DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p);
567 for (flow = p->flows; flow; flow = flow->next)
568 if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
569 return len;
570 return 0;
571}
572
573
574static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt)
575{
576 struct atm_qdisc_data *p = PRIV(sch);
577
578 DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
579 p->flows = &p->link;
580 if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops)))
581 p->link.q = &noop_qdisc;
582 DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q);
583 p->link.filter_list = NULL;
584 p->link.vcc = NULL;
585 p->link.sock = NULL;
586 p->link.classid = sch->handle;
587 p->link.ref = 1;
588 p->link.next = NULL;
589 tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch);
590 return 0;
591}
592
593
594static void atm_tc_reset(struct Qdisc *sch)
595{
596 struct atm_qdisc_data *p = PRIV(sch);
597 struct atm_flow_data *flow;
598
599 DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p);
600 for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q);
601 sch->q.qlen = 0;
602}
603
604
605static void atm_tc_destroy(struct Qdisc *sch)
606{
607 struct atm_qdisc_data *p = PRIV(sch);
608 struct atm_flow_data *flow;
609
610 DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p);
611 /* races ? */
612 while ((flow = p->flows)) {
613 destroy_filters(flow);
614 if (flow->ref > 1)
615 printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow,
616 flow->ref);
617 atm_tc_put(sch,(unsigned long) flow);
618 if (p->flows == flow) {
619 printk(KERN_ERR "atm_destroy: putting flow %p didn't "
620 "kill it\n",flow);
621 p->flows = flow->next; /* brute force */
622 break;
623 }
624 }
625 tasklet_kill(&p->task);
626}
627
628
629static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
630 struct sk_buff *skb, struct tcmsg *tcm)
631{
632 struct atm_qdisc_data *p = PRIV(sch);
633 struct atm_flow_data *flow = (struct atm_flow_data *) cl;
634 unsigned char *b = skb->tail;
635 struct rtattr *rta;
636
637 DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
638 sch,p,flow,skb,tcm);
639 if (!find_flow(p,flow)) return -EINVAL;
640 tcm->tcm_handle = flow->classid;
641 rta = (struct rtattr *) b;
642 RTA_PUT(skb,TCA_OPTIONS,0,NULL);
643 RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr);
644 if (flow->vcc) {
645 struct sockaddr_atmpvc pvc;
646 int state;
647
648 pvc.sap_family = AF_ATMPVC;
649 pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
650 pvc.sap_addr.vpi = flow->vcc->vpi;
651 pvc.sap_addr.vci = flow->vcc->vci;
652 RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc);
653 state = ATM_VF2VS(flow->vcc->flags);
654 RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state);
655 }
656 if (flow->excess)
657 RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid);
658 else {
659 static u32 zero;
660
661 RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero);
662 }
663 rta->rta_len = skb->tail-b;
664 return skb->len;
665
666rtattr_failure:
667 skb_trim(skb,b-skb->data);
668 return -1;
669}
670static int
671atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
672 struct gnet_dump *d)
673{
674 struct atm_flow_data *flow = (struct atm_flow_data *) arg;
675
676 flow->qstats.qlen = flow->q->q.qlen;
677
678 if (gnet_stats_copy_basic(d, &flow->bstats) < 0 ||
679 gnet_stats_copy_queue(d, &flow->qstats) < 0)
680 return -1;
681
682 return 0;
683}
684
685static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
686{
687 return 0;
688}
689
690static struct Qdisc_class_ops atm_class_ops = {
691 .graft = atm_tc_graft,
692 .leaf = atm_tc_leaf,
693 .get = atm_tc_get,
694 .put = atm_tc_put,
695 .change = atm_tc_change,
696 .delete = atm_tc_delete,
697 .walk = atm_tc_walk,
698 .tcf_chain = atm_tc_find_tcf,
699 .bind_tcf = atm_tc_bind_filter,
700 .unbind_tcf = atm_tc_put,
701 .dump = atm_tc_dump_class,
702 .dump_stats = atm_tc_dump_class_stats,
703};
704
705static struct Qdisc_ops atm_qdisc_ops = {
706 .next = NULL,
707 .cl_ops = &atm_class_ops,
708 .id = "atm",
709 .priv_size = sizeof(struct atm_qdisc_data),
710 .enqueue = atm_tc_enqueue,
711 .dequeue = atm_tc_dequeue,
712 .requeue = atm_tc_requeue,
713 .drop = atm_tc_drop,
714 .init = atm_tc_init,
715 .reset = atm_tc_reset,
716 .destroy = atm_tc_destroy,
717 .change = NULL,
718 .dump = atm_tc_dump,
719 .owner = THIS_MODULE,
720};
721
722
723static int __init atm_init(void)
724{
725 return register_qdisc(&atm_qdisc_ops);
726}
727
728static void __exit atm_exit(void)
729{
730 unregister_qdisc(&atm_qdisc_ops);
731}
732
733module_init(atm_init)
734module_exit(atm_exit)
735MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
new file mode 100644
index 000000000000..d43e3b8cbf6a
--- /dev/null
+++ b/net/sched/sch_cbq.c
@@ -0,0 +1,2124 @@
1/*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <asm/uaccess.h>
16#include <asm/system.h>
17#include <linux/bitops.h>
18#include <linux/types.h>
19#include <linux/kernel.h>
20#include <linux/sched.h>
21#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/if_ether.h>
29#include <linux/inet.h>
30#include <linux/netdevice.h>
31#include <linux/etherdevice.h>
32#include <linux/notifier.h>
33#include <net/ip.h>
34#include <net/route.h>
35#include <linux/skbuff.h>
36#include <net/sock.h>
37#include <net/pkt_sched.h>
38
39
40/* Class-Based Queueing (CBQ) algorithm.
41 =======================================
42
43 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
44 Management Models for Packet Networks",
45 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
46
47 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
48
49 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
50 Parameters", 1996
51
52 [4] Sally Floyd and Michael Speer, "Experimental Results
53 for Class-Based Queueing", 1998, not published.
54
55 -----------------------------------------------------------------------
56
57 Algorithm skeleton was taken from NS simulator cbq.cc.
58 If someone wants to check this code against the LBL version,
59 he should take into account that ONLY the skeleton was borrowed,
60 the implementation is different. Particularly:
61
62 --- The WRR algorithm is different. Our version looks more
63 reasonable (I hope) and works when quanta are allowed to be
64    less than MTU, which is always the case when real-time classes
65    have small rates. Note that the statement of [3] is
66    incomplete: delay may actually be estimated even if the class
67    per-round allotment is less than MTU. Namely, if per-round
68 allotment is W*r_i, and r_1+...+r_k = r < 1
69
70 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
71
72 In the worst case we have IntServ estimate with D = W*r+k*MTU
73 and C = MTU*r. The proof (if correct at all) is trivial.
74
75
76 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
77    interpret some places that look like wrong translations
78    from NS. Anyone is advised to find these differences
79    and explain to me why I am wrong 8).
80
81 --- Linux has no EOI event, so we cannot estimate true class
82    idle time. The workaround is to treat the next dequeue event
83    as a sign that the previous packet has finished. This is wrong because of
84    internal device queueing, but on a permanently loaded link it is true.
85    Moreover, combined with the clock integrator, this scheme looks
86    very close to an ideal solution. */
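
To make the delay bound above concrete, the following stand-alone sketch evaluates delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B for one class, reading [x] as ceil(x); every parameter value is made up purely for illustration (build with -lm):

    /* Worst-case delay bound for class i from the comment above.
     * All numbers are illustrative, not measured values.
     */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            double B   = 125e6;   /* link bandwidth, bytes/s (1 Gbit/s)      */
            double MTU = 1500.0;  /* bytes                                   */
            double W   = 3000.0;  /* WRR scale: per-round allotment is W*r_i */
            double r_i = 0.05;    /* bandwidth share of class i              */
            double r   = 0.6;     /* r_1 + ... + r_k                         */
            int    k   = 8;       /* number of classes                       */

            double delay = (ceil(MTU / (W * r_i)) * W * r + W * r + k * MTU) / B;

            printf("delay_i bound ~ %.6f s\n", delay);  /* ~0.00025 s here */
            return 0;
    }
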
87
88struct cbq_sched_data;
89
90
91struct cbq_class
92{
93 struct cbq_class *next; /* hash table link */
94 struct cbq_class *next_alive; /* next class with backlog in this priority band */
95
96/* Parameters */
97 u32 classid;
98 unsigned char priority; /* class priority */
99 unsigned char priority2; /* priority to be used after overlimit */
100 unsigned char ewma_log; /* time constant for idle time calculation */
101 unsigned char ovl_strategy;
102#ifdef CONFIG_NET_CLS_POLICE
103 unsigned char police;
104#endif
105
106 u32 defmap;
107
108 /* Link-sharing scheduler parameters */
109 long maxidle; /* Class parameters: see below. */
110 long offtime;
111 long minidle;
112 u32 avpkt;
113 struct qdisc_rate_table *R_tab;
114
115 /* Overlimit strategy parameters */
116 void (*overlimit)(struct cbq_class *cl);
117 long penalty;
118
119 /* General scheduler (WRR) parameters */
120 long allot;
121 long quantum; /* Allotment per WRR round */
122 long weight; /* Relative allotment: see below */
123
124 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
125 struct cbq_class *split; /* Ptr to split node */
126 struct cbq_class *share; /* Ptr to LS parent in the class tree */
127 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
128 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
129 parent otherwise */
130 struct cbq_class *sibling; /* Sibling chain */
131 struct cbq_class *children; /* Pointer to children chain */
132
133 struct Qdisc *q; /* Elementary queueing discipline */
134
135
136/* Variables */
137 unsigned char cpriority; /* Effective priority */
138 unsigned char delayed;
139 unsigned char level; /* level of the class in hierarchy:
140 0 for leaf classes, and maximal
141 level of children + 1 for nodes.
142 */
143
144 psched_time_t last; /* Last end of service */
145 psched_time_t undertime;
146 long avgidle;
147 long deficit; /* Saved deficit for WRR */
148 unsigned long penalized;
149 struct gnet_stats_basic bstats;
150 struct gnet_stats_queue qstats;
151 struct gnet_stats_rate_est rate_est;
152 spinlock_t *stats_lock;
153 struct tc_cbq_xstats xstats;
154
155 struct tcf_proto *filter_list;
156
157 int refcnt;
158 int filters;
159
160 struct cbq_class *defaults[TC_PRIO_MAX+1];
161};
162
163struct cbq_sched_data
164{
165 struct cbq_class *classes[16]; /* Hash table of all classes */
166 int nclasses[TC_CBQ_MAXPRIO+1];
167 unsigned quanta[TC_CBQ_MAXPRIO+1];
168
169 struct cbq_class link;
170
171 unsigned activemask;
172 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
173 with backlog */
174
175#ifdef CONFIG_NET_CLS_POLICE
176 struct cbq_class *rx_class;
177#endif
178 struct cbq_class *tx_class;
179 struct cbq_class *tx_borrowed;
180 int tx_len;
181 psched_time_t now; /* Cached timestamp */
182 psched_time_t now_rt; /* Cached real time */
183 unsigned pmask;
184
185 struct timer_list delay_timer;
186 struct timer_list wd_timer; /* Watchdog timer,
187 started when CBQ has
188 backlog, but cannot
189 transmit just now */
190 long wd_expires;
191 int toplevel;
192 u32 hgenerator;
193};
194
195
196#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
197
198
199static __inline__ unsigned cbq_hash(u32 h)
200{
201 h ^= h>>8;
202 h ^= h>>4;
203 return h&0xF;
204}
205
206static __inline__ struct cbq_class *
207cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
208{
209 struct cbq_class *cl;
210
211 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
212 if (cl->classid == classid)
213 return cl;
214 return NULL;
215}
216
217#ifdef CONFIG_NET_CLS_POLICE
218
219static struct cbq_class *
220cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
221{
222 struct cbq_class *cl, *new;
223
224 for (cl = this->tparent; cl; cl = cl->tparent)
225 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
226 return new;
227
228 return NULL;
229}
230
231#endif
232
233/* Classify packet. The procedure is pretty complicated, but
234 it allows us to combine link sharing and priority scheduling
235 transparently.
236
237    Namely, you can put link-sharing rules (e.g. route-based) at the root of CBQ,
238 so that it resolves to split nodes. Then packets are classified
239 by logical priority, or a more specific classifier may be attached
240 to the split node.
241 */
242
243static struct cbq_class *
244cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
245{
246 struct cbq_sched_data *q = qdisc_priv(sch);
247 struct cbq_class *head = &q->link;
248 struct cbq_class **defmap;
249 struct cbq_class *cl = NULL;
250 u32 prio = skb->priority;
251 struct tcf_result res;
252
253 /*
254 * Step 1. If skb->priority points to one of our classes, use it.
255 */
256 if (TC_H_MAJ(prio^sch->handle) == 0 &&
257 (cl = cbq_class_lookup(q, prio)) != NULL)
258 return cl;
259
260 *qerr = NET_XMIT_DROP;
261 for (;;) {
262 int result = 0;
263 defmap = head->defaults;
264
265 /*
266 * Step 2+n. Apply classifier.
267 */
268 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
269 goto fallback;
270
271 if ((cl = (void*)res.class) == NULL) {
272 if (TC_H_MAJ(res.classid))
273 cl = cbq_class_lookup(q, res.classid);
274 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
275 cl = defmap[TC_PRIO_BESTEFFORT];
276
277 if (cl == NULL || cl->level >= head->level)
278 goto fallback;
279 }
280
281#ifdef CONFIG_NET_CLS_ACT
282 switch (result) {
283 case TC_ACT_QUEUED:
284 case TC_ACT_STOLEN:
285 *qerr = NET_XMIT_SUCCESS;
286 case TC_ACT_SHOT:
287 return NULL;
288 }
289#elif defined(CONFIG_NET_CLS_POLICE)
290 switch (result) {
291 case TC_POLICE_RECLASSIFY:
292 return cbq_reclassify(skb, cl);
293 case TC_POLICE_SHOT:
294 return NULL;
295 default:
296 break;
297 }
298#endif
299 if (cl->level == 0)
300 return cl;
301
302 /*
303		 * Step 3+n. If the classifier selected a link-sharing class,
304		 * apply the agency-specific classifier.
305		 * Repeat this procedure until we hit a leaf node.
306 */
307 head = cl;
308 }
309
310fallback:
311 cl = head;
312
313 /*
314 * Step 4. No success...
315 */
316 if (TC_H_MAJ(prio) == 0 &&
317 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
318 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
319 return head;
320
321 return cl;
322}
323
324/*
325   A packet has just been enqueued on an empty class.
326   cbq_activate_class adds the class to the tail of the active class list
327   of its priority band.
328 */
329
330static __inline__ void cbq_activate_class(struct cbq_class *cl)
331{
332 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
333 int prio = cl->cpriority;
334 struct cbq_class *cl_tail;
335
336 cl_tail = q->active[prio];
337 q->active[prio] = cl;
338
339 if (cl_tail != NULL) {
340 cl->next_alive = cl_tail->next_alive;
341 cl_tail->next_alive = cl;
342 } else {
343 cl->next_alive = cl;
344 q->activemask |= (1<<prio);
345 }
346}
347
348/*
349   Unlink a class from the active chain.
350   Note that the same unlinking is also done directly in cbq_dequeue*
351   during the round-robin procedure.
352 */
353
354static void cbq_deactivate_class(struct cbq_class *this)
355{
356 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
357 int prio = this->cpriority;
358 struct cbq_class *cl;
359 struct cbq_class *cl_prev = q->active[prio];
360
361 do {
362 cl = cl_prev->next_alive;
363 if (cl == this) {
364 cl_prev->next_alive = cl->next_alive;
365 cl->next_alive = NULL;
366
367 if (cl == q->active[prio]) {
368 q->active[prio] = cl_prev;
369 if (cl == q->active[prio]) {
370 q->active[prio] = NULL;
371 q->activemask &= ~(1<<prio);
372 return;
373 }
374 }
375
376 cl = cl_prev->next_alive;
377 return;
378 }
379 } while ((cl_prev = cl) != q->active[prio]);
380}
381
382static void
383cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
384{
385 int toplevel = q->toplevel;
386
387 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
388 psched_time_t now;
389 psched_tdiff_t incr;
390
391 PSCHED_GET_TIME(now);
392 incr = PSCHED_TDIFF(now, q->now_rt);
393 PSCHED_TADD2(q->now, incr, now);
394
395 do {
396 if (PSCHED_TLESS(cl->undertime, now)) {
397 q->toplevel = cl->level;
398 return;
399 }
400 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
401 }
402}
403
404static int
405cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
406{
407 struct cbq_sched_data *q = qdisc_priv(sch);
408 int len = skb->len;
409 int ret;
410 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
411
412#ifdef CONFIG_NET_CLS_POLICE
413 q->rx_class = cl;
414#endif
415 if (cl == NULL) {
416 if (ret == NET_XMIT_DROP)
417 sch->qstats.drops++;
418 kfree_skb(skb);
419 return ret;
420 }
421
422#ifdef CONFIG_NET_CLS_POLICE
423 cl->q->__parent = sch;
424#endif
425 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
426 sch->q.qlen++;
427 sch->bstats.packets++;
428 sch->bstats.bytes+=len;
429 cbq_mark_toplevel(q, cl);
430 if (!cl->next_alive)
431 cbq_activate_class(cl);
432 return ret;
433 }
434
435 sch->qstats.drops++;
436 cbq_mark_toplevel(q, cl);
437 cl->qstats.drops++;
438 return ret;
439}
440
441static int
442cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
443{
444 struct cbq_sched_data *q = qdisc_priv(sch);
445 struct cbq_class *cl;
446 int ret;
447
448 if ((cl = q->tx_class) == NULL) {
449 kfree_skb(skb);
450 sch->qstats.drops++;
451 return NET_XMIT_CN;
452 }
453 q->tx_class = NULL;
454
455 cbq_mark_toplevel(q, cl);
456
457#ifdef CONFIG_NET_CLS_POLICE
458 q->rx_class = cl;
459 cl->q->__parent = sch;
460#endif
461 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
462 sch->q.qlen++;
463 sch->qstats.requeues++;
464 if (!cl->next_alive)
465 cbq_activate_class(cl);
466 return 0;
467 }
468 sch->qstats.drops++;
469 cl->qstats.drops++;
470 return ret;
471}
472
473/* Overlimit actions */
474
475/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
476
477static void cbq_ovl_classic(struct cbq_class *cl)
478{
479 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
480 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
481
482 if (!cl->delayed) {
483 delay += cl->offtime;
484
485 /*
486		   The class goes to sleep, so it will have no
487		   chance to work off its avgidle. Let's forgive it 8)
488
489		   BTW cbq-2.0 has a bug in this
490		   place; apparently they forgot to shift it by cl->ewma_log.
491 */
492 if (cl->avgidle < 0)
493 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
494 if (cl->avgidle < cl->minidle)
495 cl->avgidle = cl->minidle;
496 if (delay <= 0)
497 delay = 1;
498 PSCHED_TADD2(q->now, delay, cl->undertime);
499
500 cl->xstats.overactions++;
501 cl->delayed = 1;
502 }
503 if (q->wd_expires == 0 || q->wd_expires > delay)
504 q->wd_expires = delay;
505
506 /* Dirty work! We must schedule wakeups based on
507 real available rate, rather than leaf rate,
508 which may be tiny (even zero).
509 */
510 if (q->toplevel == TC_CBQ_MAXLEVEL) {
511 struct cbq_class *b;
512 psched_tdiff_t base_delay = q->wd_expires;
513
514 for (b = cl->borrow; b; b = b->borrow) {
515 delay = PSCHED_TDIFF(b->undertime, q->now);
516 if (delay < base_delay) {
517 if (delay <= 0)
518 delay = 1;
519 base_delay = delay;
520 }
521 }
522
523 q->wd_expires = base_delay;
524 }
525}
526
527/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
528   they go overlimit
529 */
530
531static void cbq_ovl_rclassic(struct cbq_class *cl)
532{
533 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
534 struct cbq_class *this = cl;
535
536 do {
537 if (cl->level > q->toplevel) {
538 cl = NULL;
539 break;
540 }
541 } while ((cl = cl->borrow) != NULL);
542
543 if (cl == NULL)
544 cl = this;
545 cbq_ovl_classic(cl);
546}
547
548/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
549
550static void cbq_ovl_delay(struct cbq_class *cl)
551{
552 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
553 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
554
555 if (!cl->delayed) {
556 unsigned long sched = jiffies;
557
558 delay += cl->offtime;
559 if (cl->avgidle < 0)
560 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
561 if (cl->avgidle < cl->minidle)
562 cl->avgidle = cl->minidle;
563 PSCHED_TADD2(q->now, delay, cl->undertime);
564
565 if (delay > 0) {
566 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
567 cl->penalized = sched;
568 cl->cpriority = TC_CBQ_MAXPRIO;
569 q->pmask |= (1<<TC_CBQ_MAXPRIO);
570 if (del_timer(&q->delay_timer) &&
571 (long)(q->delay_timer.expires - sched) > 0)
572 q->delay_timer.expires = sched;
573 add_timer(&q->delay_timer);
574 cl->delayed = 1;
575 cl->xstats.overactions++;
576 return;
577 }
578 delay = 1;
579 }
580 if (q->wd_expires == 0 || q->wd_expires > delay)
581 q->wd_expires = delay;
582}
583
584/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
585
586static void cbq_ovl_lowprio(struct cbq_class *cl)
587{
588 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
589
590 cl->penalized = jiffies + cl->penalty;
591
592 if (cl->cpriority != cl->priority2) {
593 cl->cpriority = cl->priority2;
594 q->pmask |= (1<<cl->cpriority);
595 cl->xstats.overactions++;
596 }
597 cbq_ovl_classic(cl);
598}
599
600/* TC_CBQ_OVL_DROP: penalize class by dropping */
601
602static void cbq_ovl_drop(struct cbq_class *cl)
603{
604 if (cl->q->ops->drop)
605 if (cl->q->ops->drop(cl->q))
606 cl->qdisc->q.qlen--;
607 cl->xstats.overactions++;
608 cbq_ovl_classic(cl);
609}
610
611static void cbq_watchdog(unsigned long arg)
612{
613 struct Qdisc *sch = (struct Qdisc*)arg;
614
615 sch->flags &= ~TCQ_F_THROTTLED;
616 netif_schedule(sch->dev);
617}
618
619static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
620{
621 struct cbq_class *cl;
622 struct cbq_class *cl_prev = q->active[prio];
623 unsigned long now = jiffies;
624 unsigned long sched = now;
625
626 if (cl_prev == NULL)
627 return now;
628
629 do {
630 cl = cl_prev->next_alive;
631 if ((long)(now - cl->penalized) > 0) {
632 cl_prev->next_alive = cl->next_alive;
633 cl->next_alive = NULL;
634 cl->cpriority = cl->priority;
635 cl->delayed = 0;
636 cbq_activate_class(cl);
637
638 if (cl == q->active[prio]) {
639 q->active[prio] = cl_prev;
640 if (cl == q->active[prio]) {
641 q->active[prio] = NULL;
642 return 0;
643 }
644 }
645
646 cl = cl_prev->next_alive;
647 } else if ((long)(sched - cl->penalized) > 0)
648 sched = cl->penalized;
649 } while ((cl_prev = cl) != q->active[prio]);
650
651 return (long)(sched - now);
652}
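
The (long)(now - cl->penalized) > 0 test above (and the similar check on the delay timer in cbq_ovl_delay) is the usual wraparound-safe way of asking whether a jiffies deadline lies in the past. A minimal sketch of the same idiom, with arbitrary counter values:

    /* Wraparound-safe "has this deadline passed?" check, the same signed
     * subtraction idiom as (long)(now - cl->penalized) > 0 above.
     */
    #include <stdio.h>

    static int time_passed(unsigned long now, unsigned long deadline)
    {
            return (long)(now - deadline) > 0;
    }

    int main(void)
    {
            unsigned long deadline = (unsigned long)-5;  /* just before wrap */
            unsigned long now = 10;                      /* just after wrap  */

            /* A plain "now > deadline" compare would get this wrong. */
            printf("passed: %d\n", time_passed(now, deadline));  /* prints 1 */
            return 0;
    }
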
653
654static void cbq_undelay(unsigned long arg)
655{
656 struct Qdisc *sch = (struct Qdisc*)arg;
657 struct cbq_sched_data *q = qdisc_priv(sch);
658 long delay = 0;
659 unsigned pmask;
660
661 pmask = q->pmask;
662 q->pmask = 0;
663
664 while (pmask) {
665 int prio = ffz(~pmask);
666 long tmp;
667
668 pmask &= ~(1<<prio);
669
670 tmp = cbq_undelay_prio(q, prio);
671 if (tmp > 0) {
672 q->pmask |= 1<<prio;
673 if (tmp < delay || delay == 0)
674 delay = tmp;
675 }
676 }
677
678 if (delay) {
679 q->delay_timer.expires = jiffies + delay;
680 add_timer(&q->delay_timer);
681 }
682
683 sch->flags &= ~TCQ_F_THROTTLED;
684 netif_schedule(sch->dev);
685}
686
687
688#ifdef CONFIG_NET_CLS_POLICE
689
690static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
691{
692 int len = skb->len;
693 struct Qdisc *sch = child->__parent;
694 struct cbq_sched_data *q = qdisc_priv(sch);
695 struct cbq_class *cl = q->rx_class;
696
697 q->rx_class = NULL;
698
699 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
700
701 cbq_mark_toplevel(q, cl);
702
703 q->rx_class = cl;
704 cl->q->__parent = sch;
705
706 if (cl->q->enqueue(skb, cl->q) == 0) {
707 sch->q.qlen++;
708 sch->bstats.packets++;
709 sch->bstats.bytes+=len;
710 if (!cl->next_alive)
711 cbq_activate_class(cl);
712 return 0;
713 }
714 sch->qstats.drops++;
715 return 0;
716 }
717
718 sch->qstats.drops++;
719 return -1;
720}
721#endif
722
723/*
724   This is a mission-critical procedure.
725
726   We "regenerate" the toplevel cutoff if the transmitting class
727   has backlog and is not regulated. This is not part of the
728   original CBQ description, but it looks more reasonable.
729   It may be wrong; this question needs further investigation.
730*/
731
732static __inline__ void
733cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
734 struct cbq_class *borrowed)
735{
736 if (cl && q->toplevel >= borrowed->level) {
737 if (cl->q->q.qlen > 1) {
738 do {
739 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
740 q->toplevel = borrowed->level;
741 return;
742 }
743 } while ((borrowed=borrowed->borrow) != NULL);
744 }
745#if 0
746 /* It is not necessary now. Uncommenting it
747 will save CPU cycles, but decrease fairness.
748 */
749 q->toplevel = TC_CBQ_MAXLEVEL;
750#endif
751 }
752}
753
754static void
755cbq_update(struct cbq_sched_data *q)
756{
757 struct cbq_class *this = q->tx_class;
758 struct cbq_class *cl = this;
759 int len = q->tx_len;
760
761 q->tx_class = NULL;
762
763 for ( ; cl; cl = cl->share) {
764 long avgidle = cl->avgidle;
765 long idle;
766
767 cl->bstats.packets++;
768 cl->bstats.bytes += len;
769
770 /*
771		   (now - last) is the total time between packet right edges.
772 (last_pktlen/rate) is "virtual" busy time, so that
773
774 idle = (now - last) - last_pktlen/rate
775 */
776
777 idle = PSCHED_TDIFF(q->now, cl->last);
778 if ((unsigned long)idle > 128*1024*1024) {
779 avgidle = cl->maxidle;
780 } else {
781 idle -= L2T(cl, len);
782
783 /* true_avgidle := (1-W)*true_avgidle + W*idle,
784 where W=2^{-ewma_log}. But cl->avgidle is scaled:
785 cl->avgidle == true_avgidle/W,
786 hence:
787 */
788 avgidle += idle - (avgidle>>cl->ewma_log);
789 }
790
791 if (avgidle <= 0) {
792 /* Overlimit or at-limit */
793
794 if (avgidle < cl->minidle)
795 avgidle = cl->minidle;
796
797 cl->avgidle = avgidle;
798
799			/* Calculate the expected time when this class
800			   will be allowed to send.
801			   It will occur when:
802 (1-W)*true_avgidle + W*delay = 0, i.e.
803 idle = (1/W - 1)*(-true_avgidle)
804 or
805 idle = (1 - W)*(-cl->avgidle);
806 */
807 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
808
809 /*
810			   That is not all.
811			   To maintain the rate allocated to the class,
812			   we add to undertime the virtual time
813			   necessary to complete the transmitted packet.
814			   (len/phys_bandwidth has already passed
815			   by the time cbq_update runs.)
816 */
817
818 idle -= L2T(&q->link, len);
819 idle += L2T(cl, len);
820
821 PSCHED_AUDIT_TDIFF(idle);
822
823 PSCHED_TADD2(q->now, idle, cl->undertime);
824 } else {
825 /* Underlimit */
826
827 PSCHED_SET_PASTPERFECT(cl->undertime);
828 if (avgidle > cl->maxidle)
829 cl->avgidle = cl->maxidle;
830 else
831 cl->avgidle = avgidle;
832 }
833 cl->last = q->now;
834 }
835
836 cbq_update_toplevel(q, this, q->tx_borrowed);
837}
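
The scaled EWMA update in cbq_update can be checked in isolation: with W = 2^-ewma_log and cl->avgidle == true_avgidle/W, the increment avgidle += idle - (avgidle >> ewma_log) tracks true_avgidle := (1-W)*true_avgidle + W*idle. A small user-space sketch with made-up idle samples:

    /* Shows that the scaled update  avg += idle - (avg >> ewma_log)
     * tracks the unscaled EWMA  true = (1-W)*true + W*idle  when
     * avg == true/W and W == 2^-ewma_log (integer rounding aside).
     */
    #include <stdio.h>

    int main(void)
    {
            const int ewma_log = 5;            /* W = 1/32                 */
            const double W = 1.0 / 32.0;
            long avg = 0;                      /* scaled, like cl->avgidle */
            double true_avg = 0.0;             /* unscaled reference       */
            long idle_samples[] = { 400, -120, 250, 80 };
            int i;

            for (i = 0; i < 4; i++) {
                    long idle = idle_samples[i];

                    avg += idle - (avg >> ewma_log);
                    true_avg = (1.0 - W) * true_avg + W * idle;
                    printf("step %d: scaled=%ld  true/W=%.1f\n",
                           i, avg, true_avg / W);
            }
            return 0;
    }
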
838
839static __inline__ struct cbq_class *
840cbq_under_limit(struct cbq_class *cl)
841{
842 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
843 struct cbq_class *this_cl = cl;
844
845 if (cl->tparent == NULL)
846 return cl;
847
848 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
849 !PSCHED_TLESS(q->now, cl->undertime)) {
850 cl->delayed = 0;
851 return cl;
852 }
853
854 do {
855		/* This is a very suspicious place. Currently the overlimit
856		   action is generated for non-bounded classes
857		   only if the link is completely congested.
858		   Though this agrees with the ancestor-only paradigm,
859		   it looks very stupid. In particular,
860		   it means that this chunk of code will either
861		   never be called or result in strong amplification
862		   of burstiness. Dangerous and silly, but
863		   no other solution exists.
864 */
865 if ((cl = cl->borrow) == NULL) {
866 this_cl->qstats.overlimits++;
867 this_cl->overlimit(this_cl);
868 return NULL;
869 }
870 if (cl->level > q->toplevel)
871 return NULL;
872 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
873 PSCHED_TLESS(q->now, cl->undertime));
874
875 cl->delayed = 0;
876 return cl;
877}
878
879static __inline__ struct sk_buff *
880cbq_dequeue_prio(struct Qdisc *sch, int prio)
881{
882 struct cbq_sched_data *q = qdisc_priv(sch);
883 struct cbq_class *cl_tail, *cl_prev, *cl;
884 struct sk_buff *skb;
885 int deficit;
886
887 cl_tail = cl_prev = q->active[prio];
888 cl = cl_prev->next_alive;
889
890 do {
891 deficit = 0;
892
893 /* Start round */
894 do {
895 struct cbq_class *borrow = cl;
896
897 if (cl->q->q.qlen &&
898 (borrow = cbq_under_limit(cl)) == NULL)
899 goto skip_class;
900
901 if (cl->deficit <= 0) {
902 /* Class exhausted its allotment per
903 this round. Switch to the next one.
904 */
905 deficit = 1;
906 cl->deficit += cl->quantum;
907 goto next_class;
908 }
909
910 skb = cl->q->dequeue(cl->q);
911
912			/* The class did not give us any skb :-(
913			   This can occur even if cl->q->q.qlen != 0,
914			   e.g. if cl->q == "tbf"
915 */
916 if (skb == NULL)
917 goto skip_class;
918
919 cl->deficit -= skb->len;
920 q->tx_class = cl;
921 q->tx_borrowed = borrow;
922 if (borrow != cl) {
923#ifndef CBQ_XSTATS_BORROWS_BYTES
924 borrow->xstats.borrows++;
925 cl->xstats.borrows++;
926#else
927 borrow->xstats.borrows += skb->len;
928 cl->xstats.borrows += skb->len;
929#endif
930 }
931 q->tx_len = skb->len;
932
933 if (cl->deficit <= 0) {
934 q->active[prio] = cl;
935 cl = cl->next_alive;
936 cl->deficit += cl->quantum;
937 }
938 return skb;
939
940skip_class:
941 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
942 /* Class is empty or penalized.
943 Unlink it from active chain.
944 */
945 cl_prev->next_alive = cl->next_alive;
946 cl->next_alive = NULL;
947
948 /* Did cl_tail point to it? */
949 if (cl == cl_tail) {
950 /* Repair it! */
951 cl_tail = cl_prev;
952
953 /* Was it the last class in this band? */
954 if (cl == cl_tail) {
955 /* Kill the band! */
956 q->active[prio] = NULL;
957 q->activemask &= ~(1<<prio);
958 if (cl->q->q.qlen)
959 cbq_activate_class(cl);
960 return NULL;
961 }
962
963 q->active[prio] = cl_tail;
964 }
965 if (cl->q->q.qlen)
966 cbq_activate_class(cl);
967
968 cl = cl_prev;
969 }
970
971next_class:
972 cl_prev = cl;
973 cl = cl->next_alive;
974 } while (cl_prev != cl_tail);
975 } while (deficit);
976
977 q->active[prio] = cl_prev;
978
979 return NULL;
980}
981
982static __inline__ struct sk_buff *
983cbq_dequeue_1(struct Qdisc *sch)
984{
985 struct cbq_sched_data *q = qdisc_priv(sch);
986 struct sk_buff *skb;
987 unsigned activemask;
988
989 activemask = q->activemask&0xFF;
990 while (activemask) {
991 int prio = ffz(~activemask);
992 activemask &= ~(1<<prio);
993 skb = cbq_dequeue_prio(sch, prio);
994 if (skb)
995 return skb;
996 }
997 return NULL;
998}
999
1000static struct sk_buff *
1001cbq_dequeue(struct Qdisc *sch)
1002{
1003 struct sk_buff *skb;
1004 struct cbq_sched_data *q = qdisc_priv(sch);
1005 psched_time_t now;
1006 psched_tdiff_t incr;
1007
1008 PSCHED_GET_TIME(now);
1009 incr = PSCHED_TDIFF(now, q->now_rt);
1010
1011 if (q->tx_class) {
1012 psched_tdiff_t incr2;
1013		/* Time integrator. We calculate the EOS time
1014		   by adding the expected packet transmission time.
1015		   If real time is greater, we warp the artificial clock,
1016		   so that:
1017
1018 cbq_time = max(real_time, work);
1019 */
1020 incr2 = L2T(&q->link, q->tx_len);
1021 PSCHED_TADD(q->now, incr2);
1022 cbq_update(q);
1023 if ((incr -= incr2) < 0)
1024 incr = 0;
1025 }
1026 PSCHED_TADD(q->now, incr);
1027 q->now_rt = now;
1028
1029 for (;;) {
1030 q->wd_expires = 0;
1031
1032 skb = cbq_dequeue_1(sch);
1033 if (skb) {
1034 sch->q.qlen--;
1035 sch->flags &= ~TCQ_F_THROTTLED;
1036 return skb;
1037 }
1038
1039 /* All the classes are overlimit.
1040
1041		   It is possible if:
1042
1043		   1. The scheduler is empty.
1044		   2. The toplevel cutoff inhibited borrowing.
1045		   3. The root class is overlimit.
1046
1047		   Reset the 2nd and 3rd conditions and retry.
1048
1049		   Note that NS and cbq-2.0 are buggy: peeking
1050		   an arbitrary class is appropriate for ancestor-only
1051		   sharing, but not for the toplevel algorithm.
1052
1053		   Our version is better but slower, because it requires
1054		   two passes; this is unavoidable with top-level sharing.
1055 */
1056
1057 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1058 PSCHED_IS_PASTPERFECT(q->link.undertime))
1059 break;
1060
1061 q->toplevel = TC_CBQ_MAXLEVEL;
1062 PSCHED_SET_PASTPERFECT(q->link.undertime);
1063 }
1064
1065	/* No packets in the scheduler, or nobody wants to give them to us :-(
1066	   Sigh... start the watchdog timer in the latter case. */
1067
1068 if (sch->q.qlen) {
1069 sch->qstats.overlimits++;
1070 if (q->wd_expires) {
1071 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1072 if (delay <= 0)
1073 delay = 1;
1074 mod_timer(&q->wd_timer, jiffies + delay);
1075 sch->flags |= TCQ_F_THROTTLED;
1076 }
1077 }
1078 return NULL;
1079}
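
A stand-alone sketch of the time integrator used at the top of cbq_dequeue: on each pass the logical clock advances by max(expected transmission time of the last packet, real elapsed time), so it never falls behind real time. The timestamps are arbitrary microsecond values:

    /* Time-integrator sketch: per step the logical clock advances by
     * max(expected tx time, real time elapsed), mirroring the
     * incr/incr2 bookkeeping in cbq_dequeue.  Values are made up.
     */
    #include <stdio.h>

    int main(void)
    {
            long now = 1000;               /* logical clock (q->now)       */
            long now_rt = 1000;            /* last real-time stamp         */
            long real[] = { 1040, 1300 };  /* new real-time readings       */
            long tx_time = 100;            /* expected tx time of last pkt */
            int i;

            for (i = 0; i < 2; i++) {
                    long incr = real[i] - now_rt;  /* real time elapsed    */
                    long incr2 = tx_time;          /* expected work done   */

                    now += incr2;
                    if ((incr -= incr2) < 0)       /* work outran realtime */
                            incr = 0;
                    now += incr;                   /* warp up to real time */
                    now_rt = real[i];
                    printf("real=%ld  logical=%ld\n", real[i], now);
            }
            return 0;                      /* prints 1100, then 1360       */
    }
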
1080
1081/* CBQ class maintenance routines */
1082
1083static void cbq_adjust_levels(struct cbq_class *this)
1084{
1085 if (this == NULL)
1086 return;
1087
1088 do {
1089 int level = 0;
1090 struct cbq_class *cl;
1091
1092 if ((cl = this->children) != NULL) {
1093 do {
1094 if (cl->level > level)
1095 level = cl->level;
1096 } while ((cl = cl->sibling) != this->children);
1097 }
1098 this->level = level+1;
1099 } while ((this = this->tparent) != NULL);
1100}
1101
1102static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1103{
1104 struct cbq_class *cl;
1105 unsigned h;
1106
1107 if (q->quanta[prio] == 0)
1108 return;
1109
1110 for (h=0; h<16; h++) {
1111 for (cl = q->classes[h]; cl; cl = cl->next) {
1112			/* BUGGGG... Beware! This expression suffers from
1113			   arithmetic overflow!
1114 */
1115 if (cl->priority == prio) {
1116 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1117 q->quanta[prio];
1118 }
1119 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1120 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1121 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1122 }
1123 }
1124 }
1125}
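
The overflow warned about in cbq_normalize_quanta is easy to reproduce: weight*allot*nclasses is a 32-bit product whose factors can each be large (weight defaults to the rate in bytes per second). The sketch below contrasts the wrapped 32-bit result with a 64-bit intermediate; it only illustrates the hazard with made-up operands and is not the kernel's fix:

    /* Same arithmetic as cl->quantum = weight*allot*nclasses/quanta,
     * once in wrapping 32-bit and once with a 64-bit intermediate.
     * Operand values are arbitrary examples.
     */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t weight   = 100000;   /* e.g. a rate in bytes/s      */
            uint32_t allot    = 60000;
            uint32_t nclasses = 4;
            uint32_t quanta   = 400000;   /* sum of weights in this prio */

            uint32_t q32 = weight * allot * nclasses / quanta;           /* wraps */
            uint64_t q64 = (uint64_t)weight * allot * nclasses / quanta;

            printf("32-bit: %u   64-bit: %llu\n",
                   (unsigned)q32, (unsigned long long)q64);  /* 6312 vs 60000 */
            return 0;
    }
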
1126
1127static void cbq_sync_defmap(struct cbq_class *cl)
1128{
1129 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1130 struct cbq_class *split = cl->split;
1131 unsigned h;
1132 int i;
1133
1134 if (split == NULL)
1135 return;
1136
1137 for (i=0; i<=TC_PRIO_MAX; i++) {
1138 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1139 split->defaults[i] = NULL;
1140 }
1141
1142 for (i=0; i<=TC_PRIO_MAX; i++) {
1143 int level = split->level;
1144
1145 if (split->defaults[i])
1146 continue;
1147
1148 for (h=0; h<16; h++) {
1149 struct cbq_class *c;
1150
1151 for (c = q->classes[h]; c; c = c->next) {
1152 if (c->split == split && c->level < level &&
1153 c->defmap&(1<<i)) {
1154 split->defaults[i] = c;
1155 level = c->level;
1156 }
1157 }
1158 }
1159 }
1160}
1161
1162static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1163{
1164 struct cbq_class *split = NULL;
1165
1166 if (splitid == 0) {
1167 if ((split = cl->split) == NULL)
1168 return;
1169 splitid = split->classid;
1170 }
1171
1172 if (split == NULL || split->classid != splitid) {
1173 for (split = cl->tparent; split; split = split->tparent)
1174 if (split->classid == splitid)
1175 break;
1176 }
1177
1178 if (split == NULL)
1179 return;
1180
1181 if (cl->split != split) {
1182 cl->defmap = 0;
1183 cbq_sync_defmap(cl);
1184 cl->split = split;
1185 cl->defmap = def&mask;
1186 } else
1187 cl->defmap = (cl->defmap&~mask)|(def&mask);
1188
1189 cbq_sync_defmap(cl);
1190}
1191
1192static void cbq_unlink_class(struct cbq_class *this)
1193{
1194 struct cbq_class *cl, **clp;
1195 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1196
1197 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1198 if (cl == this) {
1199 *clp = cl->next;
1200 cl->next = NULL;
1201 break;
1202 }
1203 }
1204
1205 if (this->tparent) {
1206 clp=&this->sibling;
1207 cl = *clp;
1208 do {
1209 if (cl == this) {
1210 *clp = cl->sibling;
1211 break;
1212 }
1213 clp = &cl->sibling;
1214 } while ((cl = *clp) != this->sibling);
1215
1216 if (this->tparent->children == this) {
1217 this->tparent->children = this->sibling;
1218 if (this->sibling == this)
1219 this->tparent->children = NULL;
1220 }
1221 } else {
1222 BUG_TRAP(this->sibling == this);
1223 }
1224}
1225
1226static void cbq_link_class(struct cbq_class *this)
1227{
1228 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1229 unsigned h = cbq_hash(this->classid);
1230 struct cbq_class *parent = this->tparent;
1231
1232 this->sibling = this;
1233 this->next = q->classes[h];
1234 q->classes[h] = this;
1235
1236 if (parent == NULL)
1237 return;
1238
1239 if (parent->children == NULL) {
1240 parent->children = this;
1241 } else {
1242 this->sibling = parent->children->sibling;
1243 parent->children->sibling = this;
1244 }
1245}
1246
1247static unsigned int cbq_drop(struct Qdisc* sch)
1248{
1249 struct cbq_sched_data *q = qdisc_priv(sch);
1250 struct cbq_class *cl, *cl_head;
1251 int prio;
1252 unsigned int len;
1253
1254 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1255 if ((cl_head = q->active[prio]) == NULL)
1256 continue;
1257
1258 cl = cl_head;
1259 do {
1260 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1261 sch->q.qlen--;
1262 return len;
1263 }
1264 } while ((cl = cl->next_alive) != cl_head);
1265 }
1266 return 0;
1267}
1268
1269static void
1270cbq_reset(struct Qdisc* sch)
1271{
1272 struct cbq_sched_data *q = qdisc_priv(sch);
1273 struct cbq_class *cl;
1274 int prio;
1275 unsigned h;
1276
1277 q->activemask = 0;
1278 q->pmask = 0;
1279 q->tx_class = NULL;
1280 q->tx_borrowed = NULL;
1281 del_timer(&q->wd_timer);
1282 del_timer(&q->delay_timer);
1283 q->toplevel = TC_CBQ_MAXLEVEL;
1284 PSCHED_GET_TIME(q->now);
1285 q->now_rt = q->now;
1286
1287 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1288 q->active[prio] = NULL;
1289
1290 for (h = 0; h < 16; h++) {
1291 for (cl = q->classes[h]; cl; cl = cl->next) {
1292 qdisc_reset(cl->q);
1293
1294 cl->next_alive = NULL;
1295 PSCHED_SET_PASTPERFECT(cl->undertime);
1296 cl->avgidle = cl->maxidle;
1297 cl->deficit = cl->quantum;
1298 cl->cpriority = cl->priority;
1299 }
1300 }
1301 sch->q.qlen = 0;
1302}
1303
1304
1305static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1306{
1307 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1308 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1309 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1310 }
1311 if (lss->change&TCF_CBQ_LSS_EWMA)
1312 cl->ewma_log = lss->ewma_log;
1313 if (lss->change&TCF_CBQ_LSS_AVPKT)
1314 cl->avpkt = lss->avpkt;
1315 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1316 cl->minidle = -(long)lss->minidle;
1317 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1318 cl->maxidle = lss->maxidle;
1319 cl->avgidle = lss->maxidle;
1320 }
1321 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1322 cl->offtime = lss->offtime;
1323 return 0;
1324}
1325
1326static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1327{
1328 q->nclasses[cl->priority]--;
1329 q->quanta[cl->priority] -= cl->weight;
1330 cbq_normalize_quanta(q, cl->priority);
1331}
1332
1333static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1334{
1335 q->nclasses[cl->priority]++;
1336 q->quanta[cl->priority] += cl->weight;
1337 cbq_normalize_quanta(q, cl->priority);
1338}
1339
1340static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1341{
1342 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1343
1344 if (wrr->allot)
1345 cl->allot = wrr->allot;
1346 if (wrr->weight)
1347 cl->weight = wrr->weight;
1348 if (wrr->priority) {
1349 cl->priority = wrr->priority-1;
1350 cl->cpriority = cl->priority;
1351 if (cl->priority >= cl->priority2)
1352 cl->priority2 = TC_CBQ_MAXPRIO-1;
1353 }
1354
1355 cbq_addprio(q, cl);
1356 return 0;
1357}
1358
1359static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1360{
1361 switch (ovl->strategy) {
1362 case TC_CBQ_OVL_CLASSIC:
1363 cl->overlimit = cbq_ovl_classic;
1364 break;
1365 case TC_CBQ_OVL_DELAY:
1366 cl->overlimit = cbq_ovl_delay;
1367 break;
1368 case TC_CBQ_OVL_LOWPRIO:
1369 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1370 ovl->priority2-1 <= cl->priority)
1371 return -EINVAL;
1372 cl->priority2 = ovl->priority2-1;
1373 cl->overlimit = cbq_ovl_lowprio;
1374 break;
1375 case TC_CBQ_OVL_DROP:
1376 cl->overlimit = cbq_ovl_drop;
1377 break;
1378 case TC_CBQ_OVL_RCLASSIC:
1379 cl->overlimit = cbq_ovl_rclassic;
1380 break;
1381 default:
1382 return -EINVAL;
1383 }
1384 cl->penalty = (ovl->penalty*HZ)/1000;
1385 return 0;
1386}
1387
1388#ifdef CONFIG_NET_CLS_POLICE
1389static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1390{
1391 cl->police = p->police;
1392
1393 if (cl->q->handle) {
1394 if (p->police == TC_POLICE_RECLASSIFY)
1395 cl->q->reshape_fail = cbq_reshape_fail;
1396 else
1397 cl->q->reshape_fail = NULL;
1398 }
1399 return 0;
1400}
1401#endif
1402
1403static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1404{
1405 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1406 return 0;
1407}
1408
1409static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1410{
1411 struct cbq_sched_data *q = qdisc_priv(sch);
1412 struct rtattr *tb[TCA_CBQ_MAX];
1413 struct tc_ratespec *r;
1414
1415 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1416 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1417 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1418 return -EINVAL;
1419
1420 if (tb[TCA_CBQ_LSSOPT-1] &&
1421 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1422 return -EINVAL;
1423
1424 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1425
1426 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1427 return -EINVAL;
1428
1429 q->link.refcnt = 1;
1430 q->link.sibling = &q->link;
1431 q->link.classid = sch->handle;
1432 q->link.qdisc = sch;
1433 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1434 q->link.q = &noop_qdisc;
1435
1436 q->link.priority = TC_CBQ_MAXPRIO-1;
1437 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1438 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1439 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1440 q->link.overlimit = cbq_ovl_classic;
1441 q->link.allot = psched_mtu(sch->dev);
1442 q->link.quantum = q->link.allot;
1443 q->link.weight = q->link.R_tab->rate.rate;
1444
1445 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1446 q->link.avpkt = q->link.allot/2;
1447 q->link.minidle = -0x7FFFFFFF;
1448 q->link.stats_lock = &sch->dev->queue_lock;
1449
1450 init_timer(&q->wd_timer);
1451 q->wd_timer.data = (unsigned long)sch;
1452 q->wd_timer.function = cbq_watchdog;
1453 init_timer(&q->delay_timer);
1454 q->delay_timer.data = (unsigned long)sch;
1455 q->delay_timer.function = cbq_undelay;
1456 q->toplevel = TC_CBQ_MAXLEVEL;
1457 PSCHED_GET_TIME(q->now);
1458 q->now_rt = q->now;
1459
1460 cbq_link_class(&q->link);
1461
1462 if (tb[TCA_CBQ_LSSOPT-1])
1463 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1464
1465 cbq_addprio(q, &q->link);
1466 return 0;
1467}
1468
1469static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1470{
1471 unsigned char *b = skb->tail;
1472
1473 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1474 return skb->len;
1475
1476rtattr_failure:
1477 skb_trim(skb, b - skb->data);
1478 return -1;
1479}
1480
1481static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1482{
1483 unsigned char *b = skb->tail;
1484 struct tc_cbq_lssopt opt;
1485
1486 opt.flags = 0;
1487 if (cl->borrow == NULL)
1488 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1489 if (cl->share == NULL)
1490 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1491 opt.ewma_log = cl->ewma_log;
1492 opt.level = cl->level;
1493 opt.avpkt = cl->avpkt;
1494 opt.maxidle = cl->maxidle;
1495 opt.minidle = (u32)(-cl->minidle);
1496 opt.offtime = cl->offtime;
1497 opt.change = ~0;
1498 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1499 return skb->len;
1500
1501rtattr_failure:
1502 skb_trim(skb, b - skb->data);
1503 return -1;
1504}
1505
1506static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1507{
1508 unsigned char *b = skb->tail;
1509 struct tc_cbq_wrropt opt;
1510
1511 opt.flags = 0;
1512 opt.allot = cl->allot;
1513 opt.priority = cl->priority+1;
1514 opt.cpriority = cl->cpriority+1;
1515 opt.weight = cl->weight;
1516 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1517 return skb->len;
1518
1519rtattr_failure:
1520 skb_trim(skb, b - skb->data);
1521 return -1;
1522}
1523
1524static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1525{
1526 unsigned char *b = skb->tail;
1527 struct tc_cbq_ovl opt;
1528
1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1;
1531 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len;
1534
1535rtattr_failure:
1536 skb_trim(skb, b - skb->data);
1537 return -1;
1538}
1539
1540static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1541{
1542 unsigned char *b = skb->tail;
1543 struct tc_cbq_fopt opt;
1544
1545 if (cl->split || cl->defmap) {
1546 opt.split = cl->split ? cl->split->classid : 0;
1547 opt.defmap = cl->defmap;
1548 opt.defchange = ~0;
1549 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1550 }
1551 return skb->len;
1552
1553rtattr_failure:
1554 skb_trim(skb, b - skb->data);
1555 return -1;
1556}
1557
1558#ifdef CONFIG_NET_CLS_POLICE
1559static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1560{
1561 unsigned char *b = skb->tail;
1562 struct tc_cbq_police opt;
1563
1564 if (cl->police) {
1565 opt.police = cl->police;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 }
1568 return skb->len;
1569
1570rtattr_failure:
1571 skb_trim(skb, b - skb->data);
1572 return -1;
1573}
1574#endif
1575
1576static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1577{
1578 if (cbq_dump_lss(skb, cl) < 0 ||
1579 cbq_dump_rate(skb, cl) < 0 ||
1580 cbq_dump_wrr(skb, cl) < 0 ||
1581 cbq_dump_ovl(skb, cl) < 0 ||
1582#ifdef CONFIG_NET_CLS_POLICE
1583 cbq_dump_police(skb, cl) < 0 ||
1584#endif
1585 cbq_dump_fopt(skb, cl) < 0)
1586 return -1;
1587 return 0;
1588}
1589
1590static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1591{
1592 struct cbq_sched_data *q = qdisc_priv(sch);
1593 unsigned char *b = skb->tail;
1594 struct rtattr *rta;
1595
1596 rta = (struct rtattr*)b;
1597 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1598 if (cbq_dump_attr(skb, &q->link) < 0)
1599 goto rtattr_failure;
1600 rta->rta_len = skb->tail - b;
1601 return skb->len;
1602
1603rtattr_failure:
1604 skb_trim(skb, b - skb->data);
1605 return -1;
1606}
1607
1608static int
1609cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1610{
1611 struct cbq_sched_data *q = qdisc_priv(sch);
1612
1613 q->link.xstats.avgidle = q->link.avgidle;
1614 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1615}
1616
1617static int
1618cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1619 struct sk_buff *skb, struct tcmsg *tcm)
1620{
1621 struct cbq_class *cl = (struct cbq_class*)arg;
1622 unsigned char *b = skb->tail;
1623 struct rtattr *rta;
1624
1625 if (cl->tparent)
1626 tcm->tcm_parent = cl->tparent->classid;
1627 else
1628 tcm->tcm_parent = TC_H_ROOT;
1629 tcm->tcm_handle = cl->classid;
1630 tcm->tcm_info = cl->q->handle;
1631
1632 rta = (struct rtattr*)b;
1633 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1634 if (cbq_dump_attr(skb, cl) < 0)
1635 goto rtattr_failure;
1636 rta->rta_len = skb->tail - b;
1637 return skb->len;
1638
1639rtattr_failure:
1640 skb_trim(skb, b - skb->data);
1641 return -1;
1642}
1643
1644static int
1645cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1646 struct gnet_dump *d)
1647{
1648 struct cbq_sched_data *q = qdisc_priv(sch);
1649 struct cbq_class *cl = (struct cbq_class*)arg;
1650
1651 cl->qstats.qlen = cl->q->q.qlen;
1652 cl->xstats.avgidle = cl->avgidle;
1653 cl->xstats.undertime = 0;
1654
1655 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1656 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1657
1658 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1659#ifdef CONFIG_NET_ESTIMATOR
1660 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1661#endif
1662 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1663 return -1;
1664
1665 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1666}
1667
1668static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1669 struct Qdisc **old)
1670{
1671 struct cbq_class *cl = (struct cbq_class*)arg;
1672
1673 if (cl) {
1674 if (new == NULL) {
1675 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL)
1676 return -ENOBUFS;
1677 } else {
1678#ifdef CONFIG_NET_CLS_POLICE
1679 if (cl->police == TC_POLICE_RECLASSIFY)
1680 new->reshape_fail = cbq_reshape_fail;
1681#endif
1682 }
1683 sch_tree_lock(sch);
1684 *old = cl->q;
1685 cl->q = new;
1686 sch->q.qlen -= (*old)->q.qlen;
1687 qdisc_reset(*old);
1688 sch_tree_unlock(sch);
1689
1690 return 0;
1691 }
1692 return -ENOENT;
1693}
1694
1695static struct Qdisc *
1696cbq_leaf(struct Qdisc *sch, unsigned long arg)
1697{
1698 struct cbq_class *cl = (struct cbq_class*)arg;
1699
1700 return cl ? cl->q : NULL;
1701}
1702
1703static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1704{
1705 struct cbq_sched_data *q = qdisc_priv(sch);
1706 struct cbq_class *cl = cbq_class_lookup(q, classid);
1707
1708 if (cl) {
1709 cl->refcnt++;
1710 return (unsigned long)cl;
1711 }
1712 return 0;
1713}
1714
1715static void cbq_destroy_filters(struct cbq_class *cl)
1716{
1717 struct tcf_proto *tp;
1718
1719 while ((tp = cl->filter_list) != NULL) {
1720 cl->filter_list = tp->next;
1721 tcf_destroy(tp);
1722 }
1723}
1724
1725static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1726{
1727 struct cbq_sched_data *q = qdisc_priv(sch);
1728
1729 BUG_TRAP(!cl->filters);
1730
1731 cbq_destroy_filters(cl);
1732 qdisc_destroy(cl->q);
1733 qdisc_put_rtab(cl->R_tab);
1734#ifdef CONFIG_NET_ESTIMATOR
1735 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1736#endif
1737 if (cl != &q->link)
1738 kfree(cl);
1739}
1740
1741static void
1742cbq_destroy(struct Qdisc* sch)
1743{
1744 struct cbq_sched_data *q = qdisc_priv(sch);
1745 struct cbq_class *cl;
1746 unsigned h;
1747
1748#ifdef CONFIG_NET_CLS_POLICE
1749 q->rx_class = NULL;
1750#endif
1751 /*
1752 * Filters must be destroyed first because we don't destroy the
1753	 * classes from root to leaves, which means that filters can still
1754 * be bound to classes which have been destroyed already. --TGR '04
1755 */
1756 for (h = 0; h < 16; h++)
1757 for (cl = q->classes[h]; cl; cl = cl->next)
1758 cbq_destroy_filters(cl);
1759
1760 for (h = 0; h < 16; h++) {
1761 struct cbq_class *next;
1762
1763 for (cl = q->classes[h]; cl; cl = next) {
1764 next = cl->next;
1765 cbq_destroy_class(sch, cl);
1766 }
1767 }
1768}
1769
1770static void cbq_put(struct Qdisc *sch, unsigned long arg)
1771{
1772 struct cbq_class *cl = (struct cbq_class*)arg;
1773
1774 if (--cl->refcnt == 0) {
1775#ifdef CONFIG_NET_CLS_POLICE
1776 struct cbq_sched_data *q = qdisc_priv(sch);
1777
1778 spin_lock_bh(&sch->dev->queue_lock);
1779 if (q->rx_class == cl)
1780 q->rx_class = NULL;
1781 spin_unlock_bh(&sch->dev->queue_lock);
1782#endif
1783
1784 cbq_destroy_class(sch, cl);
1785 }
1786}
1787
1788static int
1789cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1790 unsigned long *arg)
1791{
1792 int err;
1793 struct cbq_sched_data *q = qdisc_priv(sch);
1794 struct cbq_class *cl = (struct cbq_class*)*arg;
1795 struct rtattr *opt = tca[TCA_OPTIONS-1];
1796 struct rtattr *tb[TCA_CBQ_MAX];
1797 struct cbq_class *parent;
1798 struct qdisc_rate_table *rtab = NULL;
1799
1800 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1801 return -EINVAL;
1802
1803 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1804 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1805 return -EINVAL;
1806
1807 if (tb[TCA_CBQ_FOPT-1] &&
1808 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1809 return -EINVAL;
1810
1811 if (tb[TCA_CBQ_RATE-1] &&
1812 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1813 return -EINVAL;
1814
1815 if (tb[TCA_CBQ_LSSOPT-1] &&
1816 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1817 return -EINVAL;
1818
1819 if (tb[TCA_CBQ_WRROPT-1] &&
1820 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1821 return -EINVAL;
1822
1823#ifdef CONFIG_NET_CLS_POLICE
1824 if (tb[TCA_CBQ_POLICE-1] &&
1825 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1826 return -EINVAL;
1827#endif
1828
1829 if (cl) {
1830 /* Check parent */
1831 if (parentid) {
1832 if (cl->tparent && cl->tparent->classid != parentid)
1833 return -EINVAL;
1834 if (!cl->tparent && parentid != TC_H_ROOT)
1835 return -EINVAL;
1836 }
1837
1838 if (tb[TCA_CBQ_RATE-1]) {
1839 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1840 if (rtab == NULL)
1841 return -EINVAL;
1842 }
1843
1844 /* Change class parameters */
1845 sch_tree_lock(sch);
1846
1847 if (cl->next_alive != NULL)
1848 cbq_deactivate_class(cl);
1849
1850 if (rtab) {
1851 rtab = xchg(&cl->R_tab, rtab);
1852 qdisc_put_rtab(rtab);
1853 }
1854
1855 if (tb[TCA_CBQ_LSSOPT-1])
1856 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1857
1858 if (tb[TCA_CBQ_WRROPT-1]) {
1859 cbq_rmprio(q, cl);
1860 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1861 }
1862
1863 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1864 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1865
1866#ifdef CONFIG_NET_CLS_POLICE
1867 if (tb[TCA_CBQ_POLICE-1])
1868 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1869#endif
1870
1871 if (tb[TCA_CBQ_FOPT-1])
1872 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1873
1874 if (cl->q->q.qlen)
1875 cbq_activate_class(cl);
1876
1877 sch_tree_unlock(sch);
1878
1879#ifdef CONFIG_NET_ESTIMATOR
1880 if (tca[TCA_RATE-1])
1881 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1882 cl->stats_lock, tca[TCA_RATE-1]);
1883#endif
1884 return 0;
1885 }
1886
1887 if (parentid == TC_H_ROOT)
1888 return -EINVAL;
1889
1890 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1891 tb[TCA_CBQ_LSSOPT-1] == NULL)
1892 return -EINVAL;
1893
1894 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1895 if (rtab == NULL)
1896 return -EINVAL;
1897
1898 if (classid) {
1899 err = -EINVAL;
1900 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1901 goto failure;
1902 } else {
1903 int i;
1904 classid = TC_H_MAKE(sch->handle,0x8000);
1905
1906 for (i=0; i<0x8000; i++) {
1907 if (++q->hgenerator >= 0x8000)
1908 q->hgenerator = 1;
1909 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1910 break;
1911 }
1912 err = -ENOSR;
1913 if (i >= 0x8000)
1914 goto failure;
1915 classid = classid|q->hgenerator;
1916 }
1917
1918 parent = &q->link;
1919 if (parentid) {
1920 parent = cbq_class_lookup(q, parentid);
1921 err = -EINVAL;
1922 if (parent == NULL)
1923 goto failure;
1924 }
1925
1926 err = -ENOBUFS;
1927 cl = kmalloc(sizeof(*cl), GFP_KERNEL);
1928 if (cl == NULL)
1929 goto failure;
1930 memset(cl, 0, sizeof(*cl));
1931 cl->R_tab = rtab;
1932 rtab = NULL;
1933 cl->refcnt = 1;
1934 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1935 cl->q = &noop_qdisc;
1936 cl->classid = classid;
1937 cl->tparent = parent;
1938 cl->qdisc = sch;
1939 cl->allot = parent->allot;
1940 cl->quantum = cl->allot;
1941 cl->weight = cl->R_tab->rate.rate;
1942 cl->stats_lock = &sch->dev->queue_lock;
1943
1944 sch_tree_lock(sch);
1945 cbq_link_class(cl);
1946 cl->borrow = cl->tparent;
1947 if (cl->tparent != &q->link)
1948 cl->share = cl->tparent;
1949 cbq_adjust_levels(parent);
1950 cl->minidle = -0x7FFFFFFF;
1951 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1952 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1953 if (cl->ewma_log==0)
1954 cl->ewma_log = q->link.ewma_log;
1955 if (cl->maxidle==0)
1956 cl->maxidle = q->link.maxidle;
1957 if (cl->avpkt==0)
1958 cl->avpkt = q->link.avpkt;
1959 cl->overlimit = cbq_ovl_classic;
1960 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1961 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1962#ifdef CONFIG_NET_CLS_POLICE
1963 if (tb[TCA_CBQ_POLICE-1])
1964 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1965#endif
1966 if (tb[TCA_CBQ_FOPT-1])
1967 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1968 sch_tree_unlock(sch);
1969
1970#ifdef CONFIG_NET_ESTIMATOR
1971 if (tca[TCA_RATE-1])
1972 gen_new_estimator(&cl->bstats, &cl->rate_est,
1973 cl->stats_lock, tca[TCA_RATE-1]);
1974#endif
1975
1976 *arg = (unsigned long)cl;
1977 return 0;
1978
1979failure:
1980 qdisc_put_rtab(rtab);
1981 return err;
1982}
1983
1984static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1985{
1986 struct cbq_sched_data *q = qdisc_priv(sch);
1987 struct cbq_class *cl = (struct cbq_class*)arg;
1988
1989 if (cl->filters || cl->children || cl == &q->link)
1990 return -EBUSY;
1991
1992 sch_tree_lock(sch);
1993
1994 if (cl->next_alive)
1995 cbq_deactivate_class(cl);
1996
1997 if (q->tx_borrowed == cl)
1998 q->tx_borrowed = q->tx_class;
1999 if (q->tx_class == cl) {
2000 q->tx_class = NULL;
2001 q->tx_borrowed = NULL;
2002 }
2003#ifdef CONFIG_NET_CLS_POLICE
2004 if (q->rx_class == cl)
2005 q->rx_class = NULL;
2006#endif
2007
2008 cbq_unlink_class(cl);
2009 cbq_adjust_levels(cl->tparent);
2010 cl->defmap = 0;
2011 cbq_sync_defmap(cl);
2012
2013 cbq_rmprio(q, cl);
2014 sch_tree_unlock(sch);
2015
2016 if (--cl->refcnt == 0)
2017 cbq_destroy_class(sch, cl);
2018
2019 return 0;
2020}
2021
2022static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2023{
2024 struct cbq_sched_data *q = qdisc_priv(sch);
2025 struct cbq_class *cl = (struct cbq_class *)arg;
2026
2027 if (cl == NULL)
2028 cl = &q->link;
2029
2030 return &cl->filter_list;
2031}
2032
2033static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2034 u32 classid)
2035{
2036 struct cbq_sched_data *q = qdisc_priv(sch);
2037 struct cbq_class *p = (struct cbq_class*)parent;
2038 struct cbq_class *cl = cbq_class_lookup(q, classid);
2039
2040 if (cl) {
2041 if (p && p->level <= cl->level)
2042 return 0;
2043 cl->filters++;
2044 return (unsigned long)cl;
2045 }
2046 return 0;
2047}
2048
2049static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2050{
2051 struct cbq_class *cl = (struct cbq_class*)arg;
2052
2053 cl->filters--;
2054}
2055
2056static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2057{
2058 struct cbq_sched_data *q = qdisc_priv(sch);
2059 unsigned h;
2060
2061 if (arg->stop)
2062 return;
2063
2064 for (h = 0; h < 16; h++) {
2065 struct cbq_class *cl;
2066
2067 for (cl = q->classes[h]; cl; cl = cl->next) {
2068 if (arg->count < arg->skip) {
2069 arg->count++;
2070 continue;
2071 }
2072 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2073 arg->stop = 1;
2074 return;
2075 }
2076 arg->count++;
2077 }
2078 }
2079}
2080
2081static struct Qdisc_class_ops cbq_class_ops = {
2082 .graft = cbq_graft,
2083 .leaf = cbq_leaf,
2084 .get = cbq_get,
2085 .put = cbq_put,
2086 .change = cbq_change_class,
2087 .delete = cbq_delete,
2088 .walk = cbq_walk,
2089 .tcf_chain = cbq_find_tcf,
2090 .bind_tcf = cbq_bind_filter,
2091 .unbind_tcf = cbq_unbind_filter,
2092 .dump = cbq_dump_class,
2093 .dump_stats = cbq_dump_class_stats,
2094};
2095
2096static struct Qdisc_ops cbq_qdisc_ops = {
2097 .next = NULL,
2098 .cl_ops = &cbq_class_ops,
2099 .id = "cbq",
2100 .priv_size = sizeof(struct cbq_sched_data),
2101 .enqueue = cbq_enqueue,
2102 .dequeue = cbq_dequeue,
2103 .requeue = cbq_requeue,
2104 .drop = cbq_drop,
2105 .init = cbq_init,
2106 .reset = cbq_reset,
2107 .destroy = cbq_destroy,
2108 .change = NULL,
2109 .dump = cbq_dump,
2110 .dump_stats = cbq_dump_stats,
2111 .owner = THIS_MODULE,
2112};
2113
2114static int __init cbq_module_init(void)
2115{
2116 return register_qdisc(&cbq_qdisc_ops);
2117}
2118static void __exit cbq_module_exit(void)
2119{
2120 unregister_qdisc(&cbq_qdisc_ops);
2121}
2122module_init(cbq_module_init)
2123module_exit(cbq_module_exit)
2124MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
new file mode 100644
index 000000000000..8a3db9d95bab
--- /dev/null
+++ b/net/sched/sch_dsmark.c
@@ -0,0 +1,479 @@
1/* net/sched/sch_dsmark.c - Differentiated Services field marker */
2
3/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
4
5
6#include <linux/config.h>
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/string.h>
11#include <linux/errno.h>
12#include <linux/skbuff.h>
13#include <linux/netdevice.h> /* for pkt_sched */
14#include <linux/rtnetlink.h>
15#include <net/pkt_sched.h>
16#include <net/dsfield.h>
17#include <net/inet_ecn.h>
18#include <asm/byteorder.h>
19
20
21#if 1 /* control */
22#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
23#else
24#define DPRINTK(format,args...)
25#endif
26
27#if 0 /* data */
28#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
29#else
30#define D2PRINTK(format,args...)
31#endif
32
33
34#define PRIV(sch) qdisc_priv(sch)
35
36
37/*
38 * classid       class          marking
39 * -------       -----          -------
40 *   n/a           0            n/a
41 *   x:0           1            use entry [0]
42 *   ...          ...           ...
43 *   x:y y>0      y+1           use entry [y]
44 *   ...          ...           ...
45 * x:indices-1   indices        use entry [indices-1]
46 *   ...          ...           ...
47 *   x:y          y+1           use entry [y & (indices-1)]
48 *   ...          ...           ...
49 * 0xffff        0x10000        use entry [indices-1]
50 */
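
A minimal user-space sketch of the mapping above (illustrative only: the array names, the INDICES value and the sample DS bytes are assumptions, not taken from this file). The minor id of classid x:y wraps into the mask/value arrays with y & (indices-1), and the selected pair rewrites the DS byte as (ds & mask) | value, which is the conventional behaviour of the ipv4/ipv6_change_dsfield() helpers used by dsmark_dequeue() below.

#include <stdio.h>
#include <string.h>

#define INDICES 8       /* must be a power of two, as dsmark_init() enforces */

int main(void)
{
	unsigned char mask[INDICES], value[INDICES];
	unsigned minor = 9;             /* class x:9 wraps to entry 9 & 7 = 1 */
	unsigned char ds = 0x28;        /* incoming DS byte */
	unsigned idx;

	memset(mask, 0xff, sizeof(mask));  /* default: pass the DS byte through */
	memset(value, 0, sizeof(value));

	mask[1]  = 0x03;                /* keep only the ECN bits ...          */
	value[1] = 0xb8;                /* ... and set DSCP EF (0x2e << 2)     */

	idx = minor & (INDICES - 1);
	printf("entry %u: 0x%02x -> 0x%02x\n", idx, ds,
	       (unsigned)((ds & mask[idx]) | value[idx]));
	return 0;
}
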
51
52
53#define NO_DEFAULT_INDEX (1 << 16)
54
55struct dsmark_qdisc_data {
56 struct Qdisc *q;
57 struct tcf_proto *filter_list;
58 __u8 *mask; /* "owns" the array */
59 __u8 *value;
60 __u16 indices;
61 __u32 default_index; /* index range is 0...0xffff */
62 int set_tc_index;
63};
64
65
66/* ------------------------- Class/flow operations ------------------------- */
67
68
69static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
70 struct Qdisc *new,struct Qdisc **old)
71{
72 struct dsmark_qdisc_data *p = PRIV(sch);
73
74 DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new,
75 old);
76 if (!new)
77 new = &noop_qdisc;
78 sch_tree_lock(sch);
79 *old = xchg(&p->q,new);
80 if (*old)
81 qdisc_reset(*old);
82 sch->q.qlen = 0;
83 sch_tree_unlock(sch); /* @@@ move up ? */
84 return 0;
85}
86
87
88static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
89{
90 struct dsmark_qdisc_data *p = PRIV(sch);
91
92 return p->q;
93}
94
95
96static unsigned long dsmark_get(struct Qdisc *sch,u32 classid)
97{
98 struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch);
99
100 DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid);
101 return TC_H_MIN(classid)+1;
102}
103
104
105static unsigned long dsmark_bind_filter(struct Qdisc *sch,
106 unsigned long parent, u32 classid)
107{
108 return dsmark_get(sch,classid);
109}
110
111
112static void dsmark_put(struct Qdisc *sch, unsigned long cl)
113{
114}
115
116
117static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
118 struct rtattr **tca, unsigned long *arg)
119{
120 struct dsmark_qdisc_data *p = PRIV(sch);
121 struct rtattr *opt = tca[TCA_OPTIONS-1];
122 struct rtattr *tb[TCA_DSMARK_MAX];
123
124 DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
125 "arg 0x%lx\n",sch,p,classid,parent,*arg);
126 if (*arg > p->indices)
127 return -ENOENT;
128 if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt))
129 return -EINVAL;
130 if (tb[TCA_DSMARK_MASK-1]) {
131 if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1]))
132 return -EINVAL;
133 p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]);
134 }
135 if (tb[TCA_DSMARK_VALUE-1]) {
136 if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1]))
137 return -EINVAL;
138 p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]);
139 }
140 return 0;
141}
142
143
144static int dsmark_delete(struct Qdisc *sch,unsigned long arg)
145{
146 struct dsmark_qdisc_data *p = PRIV(sch);
147
148 if (!arg || arg > p->indices)
149 return -EINVAL;
150 p->mask[arg-1] = 0xff;
151 p->value[arg-1] = 0;
152 return 0;
153}
154
155
156static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker)
157{
158 struct dsmark_qdisc_data *p = PRIV(sch);
159 int i;
160
161 DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker);
162 if (walker->stop)
163 return;
164 for (i = 0; i < p->indices; i++) {
165 if (p->mask[i] == 0xff && !p->value[i])
166 continue;
167 if (walker->count >= walker->skip) {
168 if (walker->fn(sch, i+1, walker) < 0) {
169 walker->stop = 1;
170 break;
171 }
172 }
173 walker->count++;
174 }
175}
176
177
178static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl)
179{
180 struct dsmark_qdisc_data *p = PRIV(sch);
181
182 return &p->filter_list;
183}
184
185
186/* --------------------------- Qdisc operations ---------------------------- */
187
188
189static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
190{
191 struct dsmark_qdisc_data *p = PRIV(sch);
192 struct tcf_result res;
193 int result;
194 int ret = NET_XMIT_POLICED;
195
196 D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
197 if (p->set_tc_index) {
198 /* FIXME: Safe with non-linear skbs? --RR */
199 switch (skb->protocol) {
200 case __constant_htons(ETH_P_IP):
201 skb->tc_index = ipv4_get_dsfield(skb->nh.iph)
202 & ~INET_ECN_MASK;
203 break;
204 case __constant_htons(ETH_P_IPV6):
205 skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h)
206 & ~INET_ECN_MASK;
207 break;
208 default:
209 skb->tc_index = 0;
210 break;
211 };
212 }
213 result = TC_POLICE_OK; /* be nice to gcc */
214 if (TC_H_MAJ(skb->priority) == sch->handle) {
215 skb->tc_index = TC_H_MIN(skb->priority);
216 } else {
217 result = tc_classify(skb,p->filter_list,&res);
218 D2PRINTK("result %d class 0x%04x\n",result,res.classid);
219 switch (result) {
220#ifdef CONFIG_NET_CLS_POLICE
221 case TC_POLICE_SHOT:
222 kfree_skb(skb);
223 break;
224#if 0
225 case TC_POLICE_RECLASSIFY:
226 /* FIXME: what to do here ??? */
227#endif
228#endif
229 case TC_POLICE_OK:
230 skb->tc_index = TC_H_MIN(res.classid);
231 break;
232 case TC_POLICE_UNSPEC:
233 /* fall through */
234 default:
235 if (p->default_index != NO_DEFAULT_INDEX)
236 skb->tc_index = p->default_index;
237 break;
238 };
239 }
240 if (
241#ifdef CONFIG_NET_CLS_POLICE
242 result == TC_POLICE_SHOT ||
243#endif
244
245 ((ret = p->q->enqueue(skb,p->q)) != 0)) {
246 sch->qstats.drops++;
247 return ret;
248 }
249 sch->bstats.bytes += skb->len;
250 sch->bstats.packets++;
251 sch->q.qlen++;
252 return ret;
253}
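
Worked example for the classification order in dsmark_enqueue() above (the handle and priority values are illustrative): on a dsmark qdisc with handle 1: (encoded as 0x10000 in the usual major:minor form), a packet carrying skb->priority 0x10002 satisfies TC_H_MAJ(skb->priority) == sch->handle and gets tc_index = 2 without the filter list being consulted; only when the majors differ is tc_classify() run, and only when that returns nothing usable does default_index (if one was configured) take effect.
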
254
255
256static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
257{
258 struct dsmark_qdisc_data *p = PRIV(sch);
259 struct sk_buff *skb;
260 int index;
261
262 D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p);
263 skb = p->q->ops->dequeue(p->q);
264 if (!skb)
265 return NULL;
266 sch->q.qlen--;
267 index = skb->tc_index & (p->indices-1);
268 D2PRINTK("index %d->%d\n",skb->tc_index,index);
269 switch (skb->protocol) {
270 case __constant_htons(ETH_P_IP):
271 ipv4_change_dsfield(skb->nh.iph,
272 p->mask[index],p->value[index]);
273 break;
274 case __constant_htons(ETH_P_IPV6):
275 ipv6_change_dsfield(skb->nh.ipv6h,
276 p->mask[index],p->value[index]);
277 break;
278 default:
279 /*
280 * Only complain if a change was actually attempted.
281 * This way, we can send non-IP traffic through dsmark
282 * and don't need yet another qdisc as a bypass.
283 */
284 if (p->mask[index] != 0xff || p->value[index])
285 printk(KERN_WARNING "dsmark_dequeue: "
286 "unsupported protocol %d\n",
287 htons(skb->protocol));
288 break;
289 };
290 return skb;
291}
292
293
294static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch)
295{
296 int ret;
297 struct dsmark_qdisc_data *p = PRIV(sch);
298
299 D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
300 if ((ret = p->q->ops->requeue(skb, p->q)) == 0) {
301 sch->q.qlen++;
302 sch->qstats.requeues++;
303 return 0;
304 }
305 sch->qstats.drops++;
306 return ret;
307}
308
309
310static unsigned int dsmark_drop(struct Qdisc *sch)
311{
312 struct dsmark_qdisc_data *p = PRIV(sch);
313 unsigned int len;
314
315 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
316 if (!p->q->ops->drop)
317 return 0;
318 if (!(len = p->q->ops->drop(p->q)))
319 return 0;
320 sch->q.qlen--;
321 return len;
322}
323
324
325static int dsmark_init(struct Qdisc *sch,struct rtattr *opt)
326{
327 struct dsmark_qdisc_data *p = PRIV(sch);
328 struct rtattr *tb[TCA_DSMARK_MAX];
329 __u16 tmp;
330
331 DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
332 if (!opt ||
333 rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 ||
334 !tb[TCA_DSMARK_INDICES-1] ||
335 RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16))
336 return -EINVAL;
337 p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]);
338 if (!p->indices)
339 return -EINVAL;
340 for (tmp = p->indices; tmp != 1; tmp >>= 1) {
341 if (tmp & 1)
342 return -EINVAL;
343 }
344 p->default_index = NO_DEFAULT_INDEX;
345 if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) {
346 if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16))
347 return -EINVAL;
348 p->default_index =
349 *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
350 }
351 p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1];
352 p->mask = kmalloc(p->indices*2,GFP_KERNEL);
353 if (!p->mask)
354 return -ENOMEM;
355 p->value = p->mask+p->indices;
356 memset(p->mask,0xff,p->indices);
357 memset(p->value,0,p->indices);
358 if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
359 p->q = &noop_qdisc;
360 DPRINTK("dsmark_init: qdisc %p\n",&p->q);
361 return 0;
362}
363
364
365static void dsmark_reset(struct Qdisc *sch)
366{
367 struct dsmark_qdisc_data *p = PRIV(sch);
368
369 DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
370 qdisc_reset(p->q);
371 sch->q.qlen = 0;
372}
373
374
375static void dsmark_destroy(struct Qdisc *sch)
376{
377 struct dsmark_qdisc_data *p = PRIV(sch);
378 struct tcf_proto *tp;
379
380 DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p);
381 while (p->filter_list) {
382 tp = p->filter_list;
383 p->filter_list = tp->next;
384 tcf_destroy(tp);
385 }
386 qdisc_destroy(p->q);
387 kfree(p->mask);
388}
389
390
391static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
392 struct sk_buff *skb, struct tcmsg *tcm)
393{
394 struct dsmark_qdisc_data *p = PRIV(sch);
395 unsigned char *b = skb->tail;
396 struct rtattr *rta;
397
398 DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl);
399 if (!cl || cl > p->indices)
400 return -EINVAL;
401 tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1);
402 rta = (struct rtattr *) b;
403 RTA_PUT(skb,TCA_OPTIONS,0,NULL);
404 RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]);
405 RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]);
406 rta->rta_len = skb->tail-b;
407 return skb->len;
408
409rtattr_failure:
410 skb_trim(skb,b-skb->data);
411 return -1;
412}
413
414static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
415{
416 struct dsmark_qdisc_data *p = PRIV(sch);
417 unsigned char *b = skb->tail;
418 struct rtattr *rta;
419
420 rta = (struct rtattr *) b;
421 RTA_PUT(skb,TCA_OPTIONS,0,NULL);
422 RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices);
423 if (p->default_index != NO_DEFAULT_INDEX) {
424 __u16 tmp = p->default_index;
425
426 RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp);
427 }
428 if (p->set_tc_index)
429 RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL);
430 rta->rta_len = skb->tail-b;
431 return skb->len;
432
433rtattr_failure:
434 skb_trim(skb,b-skb->data);
435 return -1;
436}
437
438static struct Qdisc_class_ops dsmark_class_ops = {
439 .graft = dsmark_graft,
440 .leaf = dsmark_leaf,
441 .get = dsmark_get,
442 .put = dsmark_put,
443 .change = dsmark_change,
444 .delete = dsmark_delete,
445 .walk = dsmark_walk,
446 .tcf_chain = dsmark_find_tcf,
447 .bind_tcf = dsmark_bind_filter,
448 .unbind_tcf = dsmark_put,
449 .dump = dsmark_dump_class,
450};
451
452static struct Qdisc_ops dsmark_qdisc_ops = {
453 .next = NULL,
454 .cl_ops = &dsmark_class_ops,
455 .id = "dsmark",
456 .priv_size = sizeof(struct dsmark_qdisc_data),
457 .enqueue = dsmark_enqueue,
458 .dequeue = dsmark_dequeue,
459 .requeue = dsmark_requeue,
460 .drop = dsmark_drop,
461 .init = dsmark_init,
462 .reset = dsmark_reset,
463 .destroy = dsmark_destroy,
464 .change = NULL,
465 .dump = dsmark_dump,
466 .owner = THIS_MODULE,
467};
468
469static int __init dsmark_module_init(void)
470{
471 return register_qdisc(&dsmark_qdisc_ops);
472}
473static void __exit dsmark_module_exit(void)
474{
475 unregister_qdisc(&dsmark_qdisc_ops);
476}
477module_init(dsmark_module_init)
478module_exit(dsmark_module_exit)
479MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
new file mode 100644
index 000000000000..4888305c96da
--- /dev/null
+++ b/net/sched/sch_fifo.c
@@ -0,0 +1,212 @@
1/*
2 * net/sched/sch_fifo.c The simplest FIFO queue.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/sched.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/if_ether.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/notifier.h>
32#include <net/ip.h>
33#include <net/route.h>
34#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37
38/* 1 band FIFO pseudo-"scheduler" */
39
40struct fifo_sched_data
41{
42 unsigned limit;
43};
44
45static int
46bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
47{
48 struct fifo_sched_data *q = qdisc_priv(sch);
49
50 if (sch->qstats.backlog + skb->len <= q->limit) {
51 __skb_queue_tail(&sch->q, skb);
52 sch->qstats.backlog += skb->len;
53 sch->bstats.bytes += skb->len;
54 sch->bstats.packets++;
55 return 0;
56 }
57 sch->qstats.drops++;
58#ifdef CONFIG_NET_CLS_POLICE
59 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
60#endif
61 kfree_skb(skb);
62 return NET_XMIT_DROP;
63}
64
65static int
66bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
67{
68 __skb_queue_head(&sch->q, skb);
69 sch->qstats.backlog += skb->len;
70 sch->qstats.requeues++;
71 return 0;
72}
73
74static struct sk_buff *
75bfifo_dequeue(struct Qdisc* sch)
76{
77 struct sk_buff *skb;
78
79 skb = __skb_dequeue(&sch->q);
80 if (skb)
81 sch->qstats.backlog -= skb->len;
82 return skb;
83}
84
85static unsigned int
86fifo_drop(struct Qdisc* sch)
87{
88 struct sk_buff *skb;
89
90 skb = __skb_dequeue_tail(&sch->q);
91 if (skb) {
92 unsigned int len = skb->len;
93 sch->qstats.backlog -= len;
94 kfree_skb(skb);
95 return len;
96 }
97 return 0;
98}
99
100static void
101fifo_reset(struct Qdisc* sch)
102{
103 skb_queue_purge(&sch->q);
104 sch->qstats.backlog = 0;
105}
106
107static int
108pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
109{
110 struct fifo_sched_data *q = qdisc_priv(sch);
111
112 if (sch->q.qlen < q->limit) {
113 __skb_queue_tail(&sch->q, skb);
114 sch->bstats.bytes += skb->len;
115 sch->bstats.packets++;
116 return 0;
117 }
118 sch->qstats.drops++;
119#ifdef CONFIG_NET_CLS_POLICE
120 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
121#endif
122 kfree_skb(skb);
123 return NET_XMIT_DROP;
124}
125
126static int
127pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
128{
129 __skb_queue_head(&sch->q, skb);
130 sch->qstats.requeues++;
131 return 0;
132}
133
134
135static struct sk_buff *
136pfifo_dequeue(struct Qdisc* sch)
137{
138 return __skb_dequeue(&sch->q);
139}
140
141static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
142{
143 struct fifo_sched_data *q = qdisc_priv(sch);
144
145 if (opt == NULL) {
146 unsigned int limit = sch->dev->tx_queue_len ? : 1;
147
148 if (sch->ops == &bfifo_qdisc_ops)
149 q->limit = limit*sch->dev->mtu;
150 else
151 q->limit = limit;
152 } else {
153 struct tc_fifo_qopt *ctl = RTA_DATA(opt);
154 if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
155 return -EINVAL;
156 q->limit = ctl->limit;
157 }
158 return 0;
159}
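
As a worked example of the defaults above (device numbers assumed, not from this patch): with no options, a device with tx_queue_len = 1000 and mtu = 1500 gives bfifo a limit of 1000 * 1500 = 1,500,000 bytes, while pfifo gets a limit of 1000 packets; a device with tx_queue_len = 0 falls back to a limit of 1, i.e. one packet for pfifo or one mtu worth of bytes for bfifo.
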
160
161static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
162{
163 struct fifo_sched_data *q = qdisc_priv(sch);
164 unsigned char *b = skb->tail;
165 struct tc_fifo_qopt opt;
166
167 opt.limit = q->limit;
168 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
169
170 return skb->len;
171
172rtattr_failure:
173 skb_trim(skb, b - skb->data);
174 return -1;
175}
176
177struct Qdisc_ops pfifo_qdisc_ops = {
178 .next = NULL,
179 .cl_ops = NULL,
180 .id = "pfifo",
181 .priv_size = sizeof(struct fifo_sched_data),
182 .enqueue = pfifo_enqueue,
183 .dequeue = pfifo_dequeue,
184 .requeue = pfifo_requeue,
185 .drop = fifo_drop,
186 .init = fifo_init,
187 .reset = fifo_reset,
188 .destroy = NULL,
189 .change = fifo_init,
190 .dump = fifo_dump,
191 .owner = THIS_MODULE,
192};
193
194struct Qdisc_ops bfifo_qdisc_ops = {
195 .next = NULL,
196 .cl_ops = NULL,
197 .id = "bfifo",
198 .priv_size = sizeof(struct fifo_sched_data),
199 .enqueue = bfifo_enqueue,
200 .dequeue = bfifo_dequeue,
201 .requeue = bfifo_requeue,
202 .drop = fifo_drop,
203 .init = fifo_init,
204 .reset = fifo_reset,
205 .destroy = NULL,
206 .change = fifo_init,
207 .dump = fifo_dump,
208 .owner = THIS_MODULE,
209};
210
211EXPORT_SYMBOL(bfifo_qdisc_ops);
212EXPORT_SYMBOL(pfifo_qdisc_ops);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
new file mode 100644
index 000000000000..8c01e023f02e
--- /dev/null
+++ b/net/sched/sch_generic.c
@@ -0,0 +1,609 @@
1/*
2 * net/sched/sch_generic.c Generic packet scheduler routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11 * - Ingress support
12 */
13
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/config.h>
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/netdevice.h>
30#include <linux/skbuff.h>
31#include <linux/rtnetlink.h>
32#include <linux/init.h>
33#include <linux/rcupdate.h>
34#include <linux/list.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37
38/* Main transmission queue. */
39
40/* Main qdisc structure lock.
41
42 However, modifications
43 to data participating in scheduling must additionally be
44 protected with dev->queue_lock spinlock.
45
46 The idea is the following:
47 - enqueue, dequeue are serialized via top level device
48 spinlock dev->queue_lock.
49 - tree walking is protected by read_lock_bh(qdisc_tree_lock)
50 and this lock is used only in process context.
51 - updates to tree are made under rtnl semaphore or
52 from softirq context (__qdisc_destroy rcu-callback)
53 hence this lock needs local bh disabling.
54
55 qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
56 */
57DEFINE_RWLOCK(qdisc_tree_lock);
58
59void qdisc_lock_tree(struct net_device *dev)
60{
61 write_lock_bh(&qdisc_tree_lock);
62 spin_lock_bh(&dev->queue_lock);
63}
64
65void qdisc_unlock_tree(struct net_device *dev)
66{
67 spin_unlock_bh(&dev->queue_lock);
68 write_unlock_bh(&qdisc_tree_lock);
69}
70
71/*
72 dev->queue_lock serializes queue accesses for this device
73 AND dev->qdisc pointer itself.
74
75 dev->xmit_lock serializes accesses to device driver.
76
77 dev->queue_lock and dev->xmit_lock are mutually exclusive,
78 if one is grabbed, another must be free.
79 */
80
81
82/* Kick device.
83 Note that this procedure can be called by a watchdog timer, so
84 we do not check the dev->tbusy flag here.
85
86 Returns: 0 - queue is empty.
87 >0 - queue is not empty, but throttled.
88 <0 - queue is not empty. Device is throttled, if dev->tbusy != 0.
89
90 NOTE: Called under dev->queue_lock with locally disabled BH.
91*/
92
93int qdisc_restart(struct net_device *dev)
94{
95 struct Qdisc *q = dev->qdisc;
96 struct sk_buff *skb;
97
98 /* Dequeue packet */
99 if ((skb = q->dequeue(q)) != NULL) {
100 unsigned nolock = (dev->features & NETIF_F_LLTX);
101 /*
102 * When the driver has LLTX set it does its own locking
103 * in start_xmit. No need to add additional overhead by
104 * locking again. These checks are worth it because
105 * even uncongested locks can be quite expensive.
106 * The driver can do trylock like here too, in case
107 * of lock congestion it should return -1 and the packet
108 * will be requeued.
109 */
110 if (!nolock) {
111 if (!spin_trylock(&dev->xmit_lock)) {
112 collision:
113 /* So, someone grabbed the driver. */
114
115 /* It may be transient configuration error,
116 when hard_start_xmit() recurses. We detect
117 it by checking xmit owner and drop the
118 packet when deadloop is detected.
119 */
120 if (dev->xmit_lock_owner == smp_processor_id()) {
121 kfree_skb(skb);
122 if (net_ratelimit())
123 printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
124 return -1;
125 }
126 __get_cpu_var(netdev_rx_stat).cpu_collision++;
127 goto requeue;
128 }
129 /* Remember that the driver is grabbed by us. */
130 dev->xmit_lock_owner = smp_processor_id();
131 }
132
133 {
134 /* And release queue */
135 spin_unlock(&dev->queue_lock);
136
137 if (!netif_queue_stopped(dev)) {
138 int ret;
139 if (netdev_nit)
140 dev_queue_xmit_nit(skb, dev);
141
142 ret = dev->hard_start_xmit(skb, dev);
143 if (ret == NETDEV_TX_OK) {
144 if (!nolock) {
145 dev->xmit_lock_owner = -1;
146 spin_unlock(&dev->xmit_lock);
147 }
148 spin_lock(&dev->queue_lock);
149 return -1;
150 }
151 if (ret == NETDEV_TX_LOCKED && nolock) {
152 spin_lock(&dev->queue_lock);
153 goto collision;
154 }
155 }
156
157 /* NETDEV_TX_BUSY - we need to requeue */
158 /* Release the driver */
159 if (!nolock) {
160 dev->xmit_lock_owner = -1;
161 spin_unlock(&dev->xmit_lock);
162 }
163 spin_lock(&dev->queue_lock);
164 q = dev->qdisc;
165 }
166
167 /* Device kicked us out :(
168 This is possible in the following cases:
169
170 0. driver is locked
171 1. fastroute is enabled
172 2. device cannot determine busy state
173 before start of transmission (e.g. dialout)
174 3. device is buggy (ppp)
175 */
176
177requeue:
178 q->ops->requeue(skb, q);
179 netif_schedule(dev);
180 return 1;
181 }
182 return q->q.qlen;
183}
184
185static void dev_watchdog(unsigned long arg)
186{
187 struct net_device *dev = (struct net_device *)arg;
188
189 spin_lock(&dev->xmit_lock);
190 if (dev->qdisc != &noop_qdisc) {
191 if (netif_device_present(dev) &&
192 netif_running(dev) &&
193 netif_carrier_ok(dev)) {
194 if (netif_queue_stopped(dev) &&
195 (jiffies - dev->trans_start) > dev->watchdog_timeo) {
196 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
197 dev->tx_timeout(dev);
198 }
199 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
200 dev_hold(dev);
201 }
202 }
203 spin_unlock(&dev->xmit_lock);
204
205 dev_put(dev);
206}
207
208static void dev_watchdog_init(struct net_device *dev)
209{
210 init_timer(&dev->watchdog_timer);
211 dev->watchdog_timer.data = (unsigned long)dev;
212 dev->watchdog_timer.function = dev_watchdog;
213}
214
215void __netdev_watchdog_up(struct net_device *dev)
216{
217 if (dev->tx_timeout) {
218 if (dev->watchdog_timeo <= 0)
219 dev->watchdog_timeo = 5*HZ;
220 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
221 dev_hold(dev);
222 }
223}
224
225static void dev_watchdog_up(struct net_device *dev)
226{
227 spin_lock_bh(&dev->xmit_lock);
228 __netdev_watchdog_up(dev);
229 spin_unlock_bh(&dev->xmit_lock);
230}
231
232static void dev_watchdog_down(struct net_device *dev)
233{
234 spin_lock_bh(&dev->xmit_lock);
235 if (del_timer(&dev->watchdog_timer))
236 __dev_put(dev);
237 spin_unlock_bh(&dev->xmit_lock);
238}
239
240/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
241 under all circumstances. It is difficult to invent anything faster or
242 cheaper.
243 */
244
245static int
246noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
247{
248 kfree_skb(skb);
249 return NET_XMIT_CN;
250}
251
252static struct sk_buff *
253noop_dequeue(struct Qdisc * qdisc)
254{
255 return NULL;
256}
257
258static int
259noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
260{
261 if (net_ratelimit())
262 printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
263 kfree_skb(skb);
264 return NET_XMIT_CN;
265}
266
267struct Qdisc_ops noop_qdisc_ops = {
268 .next = NULL,
269 .cl_ops = NULL,
270 .id = "noop",
271 .priv_size = 0,
272 .enqueue = noop_enqueue,
273 .dequeue = noop_dequeue,
274 .requeue = noop_requeue,
275 .owner = THIS_MODULE,
276};
277
278struct Qdisc noop_qdisc = {
279 .enqueue = noop_enqueue,
280 .dequeue = noop_dequeue,
281 .flags = TCQ_F_BUILTIN,
282 .ops = &noop_qdisc_ops,
283 .list = LIST_HEAD_INIT(noop_qdisc.list),
284};
285
286static struct Qdisc_ops noqueue_qdisc_ops = {
287 .next = NULL,
288 .cl_ops = NULL,
289 .id = "noqueue",
290 .priv_size = 0,
291 .enqueue = noop_enqueue,
292 .dequeue = noop_dequeue,
293 .requeue = noop_requeue,
294 .owner = THIS_MODULE,
295};
296
297static struct Qdisc noqueue_qdisc = {
298 .enqueue = NULL,
299 .dequeue = noop_dequeue,
300 .flags = TCQ_F_BUILTIN,
301 .ops = &noqueue_qdisc_ops,
302 .list = LIST_HEAD_INIT(noqueue_qdisc.list),
303};
304
305
306static const u8 prio2band[TC_PRIO_MAX+1] =
307 { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
308
309/* 3-band FIFO queue: old style, but should be a bit faster than
310 generic prio+fifo combination.
311 */
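
A small user-space sanity check of the prio2band mapping above (illustrative only; the remark about which TC_PRIO_* values land where assumes the usual numbering and is not taken from this file). Band 0 is served first by pfifo_fast_dequeue(), so priorities 6 and 7 jump the queue and priorities 1, 2, 3 and 5 go last:

#include <stdio.h>

/* copy of the table used by pfifo_fast above; band 0 is dequeued first */
static const unsigned char prio2band[16] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

int main(void)
{
	unsigned prio;

	for (prio = 0; prio < 16; prio++)
		printf("skb->priority %2u -> band %u\n", prio, prio2band[prio]);
	return 0;
}
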
312
313static int
314pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
315{
316 struct sk_buff_head *list = qdisc_priv(qdisc);
317
318 list += prio2band[skb->priority&TC_PRIO_MAX];
319
320 if (list->qlen < qdisc->dev->tx_queue_len) {
321 __skb_queue_tail(list, skb);
322 qdisc->q.qlen++;
323 qdisc->bstats.bytes += skb->len;
324 qdisc->bstats.packets++;
325 return 0;
326 }
327 qdisc->qstats.drops++;
328 kfree_skb(skb);
329 return NET_XMIT_DROP;
330}
331
332static struct sk_buff *
333pfifo_fast_dequeue(struct Qdisc* qdisc)
334{
335 int prio;
336 struct sk_buff_head *list = qdisc_priv(qdisc);
337 struct sk_buff *skb;
338
339 for (prio = 0; prio < 3; prio++, list++) {
340 skb = __skb_dequeue(list);
341 if (skb) {
342 qdisc->q.qlen--;
343 return skb;
344 }
345 }
346 return NULL;
347}
348
349static int
350pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
351{
352 struct sk_buff_head *list = qdisc_priv(qdisc);
353
354 list += prio2band[skb->priority&TC_PRIO_MAX];
355
356 __skb_queue_head(list, skb);
357 qdisc->q.qlen++;
358 qdisc->qstats.requeues++;
359 return 0;
360}
361
362static void
363pfifo_fast_reset(struct Qdisc* qdisc)
364{
365 int prio;
366 struct sk_buff_head *list = qdisc_priv(qdisc);
367
368 for (prio=0; prio < 3; prio++)
369 skb_queue_purge(list+prio);
370 qdisc->q.qlen = 0;
371}
372
373static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
374{
375 unsigned char *b = skb->tail;
376 struct tc_prio_qopt opt;
377
378 opt.bands = 3;
379 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
380 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
381 return skb->len;
382
383rtattr_failure:
384 skb_trim(skb, b - skb->data);
385 return -1;
386}
387
388static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
389{
390 int i;
391 struct sk_buff_head *list = qdisc_priv(qdisc);
392
393 for (i=0; i<3; i++)
394 skb_queue_head_init(list+i);
395
396 return 0;
397}
398
399static struct Qdisc_ops pfifo_fast_ops = {
400 .next = NULL,
401 .cl_ops = NULL,
402 .id = "pfifo_fast",
403 .priv_size = 3 * sizeof(struct sk_buff_head),
404 .enqueue = pfifo_fast_enqueue,
405 .dequeue = pfifo_fast_dequeue,
406 .requeue = pfifo_fast_requeue,
407 .init = pfifo_fast_init,
408 .reset = pfifo_fast_reset,
409 .dump = pfifo_fast_dump,
410 .owner = THIS_MODULE,
411};
412
413struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
414{
415 void *p;
416 struct Qdisc *sch;
417 int size;
418
419 /* ensure that the Qdisc and the private data are 32-byte aligned */
420 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
421 size += ops->priv_size + QDISC_ALIGN_CONST;
422
423 p = kmalloc(size, GFP_KERNEL);
424 if (!p)
425 return NULL;
426 memset(p, 0, size);
427
428 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
429 & ~QDISC_ALIGN_CONST);
430 sch->padded = (char *)sch - (char *)p;
431
432 INIT_LIST_HEAD(&sch->list);
433 skb_queue_head_init(&sch->q);
434 sch->ops = ops;
435 sch->enqueue = ops->enqueue;
436 sch->dequeue = ops->dequeue;
437 sch->dev = dev;
438 dev_hold(dev);
439 sch->stats_lock = &dev->queue_lock;
440 atomic_set(&sch->refcnt, 1);
441 if (!ops->init || ops->init(sch, NULL) == 0)
442 return sch;
443
444 dev_put(dev);
445 kfree(p);
446 return NULL;
447}
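
The size and pointer juggling above is the usual align-up mask trick; as a worked example (all numbers assumed for illustration, with QDISC_ALIGN_CONST taken to be 31, i.e. the 32-byte alignment the comment promises): for sizeof(*sch) = 140 and ops->priv_size = 24, size becomes (140 + 31) & ~31 = 160, then 160 + 24 + 31 = 215 bytes are allocated; wherever kmalloc() places the buffer, ((unsigned long)p + 31) & ~31 is the next 32-byte boundary inside it, and sch->padded remembers the offset so that __qdisc_destroy() can later undo it with kfree((char *)qdisc - qdisc->padded).
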
448
449/* Under dev->queue_lock and BH! */
450
451void qdisc_reset(struct Qdisc *qdisc)
452{
453 struct Qdisc_ops *ops = qdisc->ops;
454
455 if (ops->reset)
456 ops->reset(qdisc);
457}
458
459/* this is the rcu callback function to clean up a qdisc when there
460 * are no further references to it */
461
462static void __qdisc_destroy(struct rcu_head *head)
463{
464 struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
465 struct Qdisc_ops *ops = qdisc->ops;
466
467#ifdef CONFIG_NET_ESTIMATOR
468 gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
469#endif
470 write_lock(&qdisc_tree_lock);
471 if (ops->reset)
472 ops->reset(qdisc);
473 if (ops->destroy)
474 ops->destroy(qdisc);
475 write_unlock(&qdisc_tree_lock);
476 module_put(ops->owner);
477
478 dev_put(qdisc->dev);
479 kfree((char *) qdisc - qdisc->padded);
480}
481
482/* Under dev->queue_lock and BH! */
483
484void qdisc_destroy(struct Qdisc *qdisc)
485{
486 struct list_head cql = LIST_HEAD_INIT(cql);
487 struct Qdisc *cq, *q, *n;
488
489 if (qdisc->flags & TCQ_F_BUILTIN ||
490 !atomic_dec_and_test(&qdisc->refcnt))
491 return;
492
493 if (!list_empty(&qdisc->list)) {
494 if (qdisc->ops->cl_ops == NULL)
495 list_del(&qdisc->list);
496 else
497 list_move(&qdisc->list, &cql);
498 }
499
500 /* unlink inner qdiscs from dev->qdisc_list immediately */
501 list_for_each_entry(cq, &cql, list)
502 list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
503 if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
504 if (q->ops->cl_ops == NULL)
505 list_del_init(&q->list);
506 else
507 list_move_tail(&q->list, &cql);
508 }
509 list_for_each_entry_safe(cq, n, &cql, list)
510 list_del_init(&cq->list);
511
512 call_rcu(&qdisc->q_rcu, __qdisc_destroy);
513}
514
515void dev_activate(struct net_device *dev)
516{
517 /* No queueing discipline is attached to device;
518 create default one i.e. pfifo_fast for devices,
519 which need queueing and noqueue_qdisc for
520 virtual interfaces
521 */
522
523 if (dev->qdisc_sleeping == &noop_qdisc) {
524 struct Qdisc *qdisc;
525 if (dev->tx_queue_len) {
526 qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
527 if (qdisc == NULL) {
528 printk(KERN_INFO "%s: activation failed\n", dev->name);
529 return;
530 }
531 write_lock_bh(&qdisc_tree_lock);
532 list_add_tail(&qdisc->list, &dev->qdisc_list);
533 write_unlock_bh(&qdisc_tree_lock);
534 } else {
535 qdisc = &noqueue_qdisc;
536 }
537 write_lock_bh(&qdisc_tree_lock);
538 dev->qdisc_sleeping = qdisc;
539 write_unlock_bh(&qdisc_tree_lock);
540 }
541
542 spin_lock_bh(&dev->queue_lock);
543 rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
544 if (dev->qdisc != &noqueue_qdisc) {
545 dev->trans_start = jiffies;
546 dev_watchdog_up(dev);
547 }
548 spin_unlock_bh(&dev->queue_lock);
549}
550
551void dev_deactivate(struct net_device *dev)
552{
553 struct Qdisc *qdisc;
554
555 spin_lock_bh(&dev->queue_lock);
556 qdisc = dev->qdisc;
557 dev->qdisc = &noop_qdisc;
558
559 qdisc_reset(qdisc);
560
561 spin_unlock_bh(&dev->queue_lock);
562
563 dev_watchdog_down(dev);
564
565 while (test_bit(__LINK_STATE_SCHED, &dev->state))
566 yield();
567
568 spin_unlock_wait(&dev->xmit_lock);
569}
570
571void dev_init_scheduler(struct net_device *dev)
572{
573 qdisc_lock_tree(dev);
574 dev->qdisc = &noop_qdisc;
575 dev->qdisc_sleeping = &noop_qdisc;
576 INIT_LIST_HEAD(&dev->qdisc_list);
577 qdisc_unlock_tree(dev);
578
579 dev_watchdog_init(dev);
580}
581
582void dev_shutdown(struct net_device *dev)
583{
584 struct Qdisc *qdisc;
585
586 qdisc_lock_tree(dev);
587 qdisc = dev->qdisc_sleeping;
588 dev->qdisc = &noop_qdisc;
589 dev->qdisc_sleeping = &noop_qdisc;
590 qdisc_destroy(qdisc);
591#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
592 if ((qdisc = dev->qdisc_ingress) != NULL) {
593 dev->qdisc_ingress = NULL;
594 qdisc_destroy(qdisc);
595 }
596#endif
597 BUG_TRAP(!timer_pending(&dev->watchdog_timer));
598 qdisc_unlock_tree(dev);
599}
600
601EXPORT_SYMBOL(__netdev_watchdog_up);
602EXPORT_SYMBOL(noop_qdisc);
603EXPORT_SYMBOL(noop_qdisc_ops);
604EXPORT_SYMBOL(qdisc_create_dflt);
605EXPORT_SYMBOL(qdisc_destroy);
606EXPORT_SYMBOL(qdisc_reset);
607EXPORT_SYMBOL(qdisc_restart);
608EXPORT_SYMBOL(qdisc_lock_tree);
609EXPORT_SYMBOL(qdisc_unlock_tree);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
new file mode 100644
index 000000000000..25c171c32715
--- /dev/null
+++ b/net/sched/sch_gred.c
@@ -0,0 +1,630 @@
1/*
2 * net/sched/sch_gred.c Generic Random Early Detection queue.
3 *
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
11 *
12 * 991129: - Bug fix with grio mode
13 * - a better single AvgQ mode with Grio (WRED)
14 * - A finer grained VQ dequeue based on suggestion
15 * from Ren Liu
16 * - More error checks
17 *
18 *
19 *
20 * For all the glorious comments look at Alexey's sch_red.c
21 */
22
23#include <linux/config.h>
24#include <linux/module.h>
25#include <asm/uaccess.h>
26#include <asm/system.h>
27#include <linux/bitops.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/sched.h>
31#include <linux/string.h>
32#include <linux/mm.h>
33#include <linux/socket.h>
34#include <linux/sockios.h>
35#include <linux/in.h>
36#include <linux/errno.h>
37#include <linux/interrupt.h>
38#include <linux/if_ether.h>
39#include <linux/inet.h>
40#include <linux/netdevice.h>
41#include <linux/etherdevice.h>
42#include <linux/notifier.h>
43#include <net/ip.h>
44#include <net/route.h>
45#include <linux/skbuff.h>
46#include <net/sock.h>
47#include <net/pkt_sched.h>
48
49#if 1 /* control */
50#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
51#else
52#define DPRINTK(format,args...)
53#endif
54
55#if 0 /* data */
56#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
57#else
58#define D2PRINTK(format,args...)
59#endif
60
61struct gred_sched_data;
62struct gred_sched;
63
64struct gred_sched_data
65{
66/* Parameters */
67 u32 limit; /* HARD maximal queue length */
68 u32 qth_min; /* Min average length threshold: A scaled */
69 u32 qth_max; /* Max average length threshold: A scaled */
70 u32 DP; /* the drop parameters */
71 char Wlog; /* log(W) */
72 char Plog; /* random number bits */
73 u32 Scell_max;
74 u32 Rmask;
75 u32 bytesin; /* bytes seen on virtualQ so far*/
76 u32 packetsin; /* packets seen on virtualQ so far*/
77 u32 backlog; /* bytes on the virtualQ */
78 u32 forced; /* packets dropped for exceeding limits */
79 u32 early; /* packets dropped as a warning */
80 u32 other; /* packets dropped by invoking drop() */
81 u32 pdrop; /* packets dropped because we exceeded physical queue limits */
82 char Scell_log;
83 u8 Stab[256];
84 u8 prio; /* the prio of this vq */
85
86/* Variables */
87 unsigned long qave; /* Average queue length: A scaled */
88 int qcount; /* Packets since last random number generation */
89 u32 qR; /* Cached random number */
90
91 psched_time_t qidlestart; /* Start of idle period */
92};
93
94struct gred_sched
95{
96 struct gred_sched_data *tab[MAX_DPs];
97 u32 DPs;
98 u32 def;
99 u8 initd;
100 u8 grio;
101 u8 eqp;
102};
103
104static int
105gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
106{
107 psched_time_t now;
108 struct gred_sched_data *q=NULL;
109 struct gred_sched *t= qdisc_priv(sch);
110 unsigned long qave=0;
111 int i=0;
112
113 if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) {
114 D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n");
115 goto do_enqueue;
116 }
117
118
119 if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) {
120 printk("GRED: setting to default (%d)\n ",t->def);
121 if (!(q=t->tab[t->def])) {
122 DPRINTK("GRED: setting to default FAILED! dropping!! "
123 "(%d)\n ", t->def);
124 goto drop;
125 }
126 /* fix tc_index? --could be controversial but needed for
127 requeueing */
128 skb->tc_index=(skb->tc_index&0xfffffff0) | t->def;
129 }
130
131 D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d "
132 "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog,
133 sch->qstats.backlog);
134 /* sum up all the qaves of prios <= to ours to get the new qave*/
135 if (!t->eqp && t->grio) {
136 for (i=0;i<t->DPs;i++) {
137 if ((!t->tab[i]) || (i==q->DP))
138 continue;
139
140 if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
141 qave +=t->tab[i]->qave;
142 }
143
144 }
145
146 q->packetsin++;
147 q->bytesin+=skb->len;
148
149 if (t->eqp && t->grio) {
150 qave=0;
151 q->qave=t->tab[t->def]->qave;
152 q->qidlestart=t->tab[t->def]->qidlestart;
153 }
154
155 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
156 long us_idle;
157 PSCHED_GET_TIME(now);
158 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
159 PSCHED_SET_PASTPERFECT(q->qidlestart);
160
161 q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF];
162 } else {
163 if (t->eqp) {
164 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
165 } else {
166 q->qave += q->backlog - (q->qave >> q->Wlog);
167 }
168
169 }
170
171
172 if (t->eqp && t->grio)
173 t->tab[t->def]->qave=q->qave;
174
175 if ((q->qave+qave) < q->qth_min) {
176 q->qcount = -1;
177enqueue:
178 if (q->backlog + skb->len <= q->limit) {
179 q->backlog += skb->len;
180do_enqueue:
181 __skb_queue_tail(&sch->q, skb);
182 sch->qstats.backlog += skb->len;
183 sch->bstats.bytes += skb->len;
184 sch->bstats.packets++;
185 return 0;
186 } else {
187 q->pdrop++;
188 }
189
190drop:
191 kfree_skb(skb);
192 sch->qstats.drops++;
193 return NET_XMIT_DROP;
194 }
195 if ((q->qave+qave) >= q->qth_max) {
196 q->qcount = -1;
197 sch->qstats.overlimits++;
198 q->forced++;
199 goto drop;
200 }
201 if (++q->qcount) {
202 if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
203 goto enqueue;
204 q->qcount = 0;
205 q->qR = net_random()&q->Rmask;
206 sch->qstats.overlimits++;
207 q->early++;
208 goto drop;
209 }
210 q->qR = net_random()&q->Rmask;
211 goto enqueue;
212}
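
The qave update in gred_enqueue() is the standard RED exponentially weighted moving average kept in fixed point: q->qave holds the average scaled up by 2^Wlog, so

	q->qave += backlog - (q->qave >> q->Wlog)

is equivalent to avg_new = avg_old + (backlog - avg_old) / 2^Wlog, i.e. a smoothing weight of 2^-Wlog. This is also why gred_change() pre-shifts qth_min and qth_max by Wlog before they are compared against q->qave.
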
213
214static int
215gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
216{
217 struct gred_sched_data *q;
218 struct gred_sched *t= qdisc_priv(sch);
219 q= t->tab[(skb->tc_index&0xf)];
220/* error checking here -- probably unnecessary */
221 PSCHED_SET_PASTPERFECT(q->qidlestart);
222
223 __skb_queue_head(&sch->q, skb);
224 sch->qstats.backlog += skb->len;
225 sch->qstats.requeues++;
226 q->backlog += skb->len;
227 return 0;
228}
229
230static struct sk_buff *
231gred_dequeue(struct Qdisc* sch)
232{
233 struct sk_buff *skb;
234 struct gred_sched_data *q;
235 struct gred_sched *t= qdisc_priv(sch);
236
237 skb = __skb_dequeue(&sch->q);
238 if (skb) {
239 sch->qstats.backlog -= skb->len;
240 q= t->tab[(skb->tc_index&0xf)];
241 if (q) {
242 q->backlog -= skb->len;
243 if (!q->backlog && !t->eqp)
244 PSCHED_GET_TIME(q->qidlestart);
245 } else {
246 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
247 }
248 return skb;
249 }
250
251 if (t->eqp) {
252 q= t->tab[t->def];
253 if (!q)
254 D2PRINTK("no default VQ set: Results will be "
255 "screwed up\n");
256 else
257 PSCHED_GET_TIME(q->qidlestart);
258 }
259
260 return NULL;
261}
262
263static unsigned int gred_drop(struct Qdisc* sch)
264{
265 struct sk_buff *skb;
266
267 struct gred_sched_data *q;
268 struct gred_sched *t= qdisc_priv(sch);
269
270 skb = __skb_dequeue_tail(&sch->q);
271 if (skb) {
272 unsigned int len = skb->len;
273 sch->qstats.backlog -= len;
274 sch->qstats.drops++;
275 q= t->tab[(skb->tc_index&0xf)];
276 if (q) {
277 q->backlog -= len;
278 q->other++;
279 if (!q->backlog && !t->eqp)
280 PSCHED_GET_TIME(q->qidlestart);
281 } else {
282 D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
283 }
284
285 kfree_skb(skb);
286 return len;
287 }
288
289 q=t->tab[t->def];
290 if (!q) {
291 D2PRINTK("no default VQ set: Results might be screwed up\n");
292 return 0;
293 }
294
295 PSCHED_GET_TIME(q->qidlestart);
296 return 0;
297
298}
299
300static void gred_reset(struct Qdisc* sch)
301{
302 int i;
303 struct gred_sched_data *q;
304 struct gred_sched *t= qdisc_priv(sch);
305
306 __skb_queue_purge(&sch->q);
307
308 sch->qstats.backlog = 0;
309
310 for (i=0;i<t->DPs;i++) {
311 q= t->tab[i];
312 if (!q)
313 continue;
314 PSCHED_SET_PASTPERFECT(q->qidlestart);
315 q->qave = 0;
316 q->qcount = -1;
317 q->backlog = 0;
318 q->other=0;
319 q->forced=0;
320 q->pdrop=0;
321 q->early=0;
322 }
323}
324
325static int gred_change(struct Qdisc *sch, struct rtattr *opt)
326{
327 struct gred_sched *table = qdisc_priv(sch);
328 struct gred_sched_data *q;
329 struct tc_gred_qopt *ctl;
330 struct tc_gred_sopt *sopt;
331 struct rtattr *tb[TCA_GRED_STAB];
332 struct rtattr *tb2[TCA_GRED_DPS];
333 int i;
334
335 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt))
336 return -EINVAL;
337
338 if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) {
339 rtattr_parse_nested(tb2, TCA_GRED_DPS, opt);
340
341 if (tb2[TCA_GRED_DPS-1] == 0)
342 return -EINVAL;
343
344 sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
345 table->DPs=sopt->DPs;
346 table->def=sopt->def_DP;
347 table->grio=sopt->grio;
348 table->initd=0;
349 /* probably need to clear all the table DP entries as well */
350 return 0;
351 }
352
353
354 if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 ||
355 RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
356 RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
357 return -EINVAL;
358
359 ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
360 if (ctl->DP > MAX_DPs-1 ) {
361 /* misbehaving is punished! Put in the default drop probability */
362 DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP "
363 "set to default at %d\n",ctl->DP,table->def);
364 ctl->DP=table->def;
365 }
366
367 if (table->tab[ctl->DP] == NULL) {
368 table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data),
369 GFP_KERNEL);
370 if (NULL == table->tab[ctl->DP])
371 return -ENOMEM;
372 memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data)));
373 }
374 q= table->tab[ctl->DP];
375
376 if (table->grio) {
377 if (ctl->prio <=0) {
378 if (table->def && table->tab[table->def]) {
379 DPRINTK("\nGRED: DP %u does not have a prio"
380 "setting default to %d\n",ctl->DP,
381 table->tab[table->def]->prio);
382 q->prio=table->tab[table->def]->prio;
383 } else {
384 DPRINTK("\nGRED: DP %u does not have a prio"
385 " setting default to 8\n",ctl->DP);
386 q->prio=8;
387 }
388 } else {
389 q->prio=ctl->prio;
390 }
391 } else {
392 q->prio=8;
393 }
394
395
396 q->DP=ctl->DP;
397 q->Wlog = ctl->Wlog;
398 q->Plog = ctl->Plog;
399 q->limit = ctl->limit;
400 q->Scell_log = ctl->Scell_log;
401 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
402 q->Scell_max = (255<<q->Scell_log);
403 q->qth_min = ctl->qth_min<<ctl->Wlog;
404 q->qth_max = ctl->qth_max<<ctl->Wlog;
405 q->qave=0;
406 q->backlog=0;
407 q->qcount = -1;
408 q->other=0;
409 q->forced=0;
410 q->pdrop=0;
411 q->early=0;
412
413 PSCHED_SET_PASTPERFECT(q->qidlestart);
414 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
415
416 if ( table->initd && table->grio) {
417 /* this looks ugly but it's not in the fast path */
418 for (i=0;i<table->DPs;i++) {
419 if ((!table->tab[i]) || (i==q->DP) )
420 continue;
421 if (table->tab[i]->prio == q->prio ){
422 /* WRED mode detected */
423 table->eqp=1;
424 break;
425 }
426 }
427 }
428
429 if (!table->initd) {
430 table->initd=1;
431 /*
432 the first entry also goes into the default until
433 over-written
434 */
435
436 if (table->tab[table->def] == NULL) {
437 table->tab[table->def]=
438 kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL);
439 if (NULL == table->tab[table->def])
440 return -ENOMEM;
441
442 memset(table->tab[table->def], 0,
443 (sizeof(struct gred_sched_data)));
444 }
445 q= table->tab[table->def];
446 q->DP=table->def;
447 q->Wlog = ctl->Wlog;
448 q->Plog = ctl->Plog;
449 q->limit = ctl->limit;
450 q->Scell_log = ctl->Scell_log;
451 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
452 q->Scell_max = (255<<q->Scell_log);
453 q->qth_min = ctl->qth_min<<ctl->Wlog;
454 q->qth_max = ctl->qth_max<<ctl->Wlog;
455
456 if (table->grio)
457 q->prio=table->tab[ctl->DP]->prio;
458 else
459 q->prio=8;
460
461 q->qcount = -1;
462 PSCHED_SET_PASTPERFECT(q->qidlestart);
463 memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
464 }
465 return 0;
466
467}
468
469static int gred_init(struct Qdisc *sch, struct rtattr *opt)
470{
471 struct gred_sched *table = qdisc_priv(sch);
472 struct tc_gred_sopt *sopt;
473 struct rtattr *tb[TCA_GRED_STAB];
474 struct rtattr *tb2[TCA_GRED_DPS];
475
476 if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt))
477 return -EINVAL;
478
479 if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) {
480 rtattr_parse_nested(tb2, TCA_GRED_DPS, opt);
481
482 if (tb2[TCA_GRED_DPS-1] == 0)
483 return -EINVAL;
484
485 sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
486 table->DPs=sopt->DPs;
487 table->def=sopt->def_DP;
488 table->grio=sopt->grio;
489 table->initd=0;
490 return 0;
491 }
492
493 DPRINTK("\n GRED_INIT error!\n");
494 return -EINVAL;
495}
496
497static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
498{
499 unsigned long qave;
500 struct rtattr *rta;
501 struct tc_gred_qopt *opt = NULL ;
502 struct tc_gred_qopt *dst;
503 struct gred_sched *table = qdisc_priv(sch);
504 struct gred_sched_data *q;
505 int i;
506 unsigned char *b = skb->tail;
507
508 rta = (struct rtattr*)b;
509 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
510
511 opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL);
512
513 if (opt == NULL) {
514 DPRINTK("gred_dump:failed to malloc for %Zd\n",
515 sizeof(struct tc_gred_qopt)*MAX_DPs);
516 goto rtattr_failure;
517 }
518
519 memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs);
520
521 if (!table->initd) {
522 DPRINTK("NO GRED Queues setup!\n");
523 }
524
525 for (i=0;i<MAX_DPs;i++) {
526 dst= &opt[i];
527 q= table->tab[i];
528
529 if (!q) {
530 /* hack -- fix at some point with proper message
531 This is how we indicate to tc that there is no VQ
532 at this DP */
533
534 dst->DP=MAX_DPs+i;
535 continue;
536 }
537
538 dst->limit=q->limit;
539 dst->qth_min=q->qth_min>>q->Wlog;
540 dst->qth_max=q->qth_max>>q->Wlog;
541 dst->DP=q->DP;
542 dst->backlog=q->backlog;
543 if (q->qave) {
544 if (table->eqp && table->grio) {
545 q->qidlestart=table->tab[table->def]->qidlestart;
546 q->qave=table->tab[table->def]->qave;
547 }
548 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
549 long idle;
550 psched_time_t now;
551 PSCHED_GET_TIME(now);
552 idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
553 qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF];
554 dst->qave = qave >> q->Wlog;
555
556 } else {
557 dst->qave = q->qave >> q->Wlog;
558 }
559 } else {
560 dst->qave = 0;
561 }
562
563
564 dst->Wlog = q->Wlog;
565 dst->Plog = q->Plog;
566 dst->Scell_log = q->Scell_log;
567 dst->other = q->other;
568 dst->forced = q->forced;
569 dst->early = q->early;
570 dst->pdrop = q->pdrop;
571 dst->prio = q->prio;
572 dst->packets=q->packetsin;
573 dst->bytesin=q->bytesin;
574 }
575
576 RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt);
577 rta->rta_len = skb->tail - b;
578
579 kfree(opt);
580 return skb->len;
581
582rtattr_failure:
583 if (opt)
584 kfree(opt);
585 DPRINTK("gred_dump: FAILURE!!!!\n");
586
587/* also free the opt struct here */
588 skb_trim(skb, b - skb->data);
589 return -1;
590}
591
592static void gred_destroy(struct Qdisc *sch)
593{
594 struct gred_sched *table = qdisc_priv(sch);
595 int i;
596
597 for (i = 0;i < table->DPs; i++) {
598 if (table->tab[i])
599 kfree(table->tab[i]);
600 }
601}
602
603static struct Qdisc_ops gred_qdisc_ops = {
604 .next = NULL,
605 .cl_ops = NULL,
606 .id = "gred",
607 .priv_size = sizeof(struct gred_sched),
608 .enqueue = gred_enqueue,
609 .dequeue = gred_dequeue,
610 .requeue = gred_requeue,
611 .drop = gred_drop,
612 .init = gred_init,
613 .reset = gred_reset,
614 .destroy = gred_destroy,
615 .change = gred_change,
616 .dump = gred_dump,
617 .owner = THIS_MODULE,
618};
619
620static int __init gred_module_init(void)
621{
622 return register_qdisc(&gred_qdisc_ops);
623}
624static void __exit gred_module_exit(void)
625{
626 unregister_qdisc(&gred_qdisc_ops);
627}
628module_init(gred_module_init)
629module_exit(gred_module_exit)
630MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
new file mode 100644
index 000000000000..c26764bc4103
--- /dev/null
+++ b/net/sched/sch_hfsc.c
@@ -0,0 +1,1822 @@
1/*
2 * Copyright (c) 2003 Patrick McHardy, <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * 2003-10-17 - Ported from altq
10 */
11/*
12 * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
13 *
14 * Permission to use, copy, modify, and distribute this software and
15 * its documentation is hereby granted (including for commercial or
16 * for-profit use), provided that both the copyright notice and this
17 * permission notice appear in all copies of the software, derivative
18 * works, or modified versions, and any portions thereof.
19 *
20 * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
21 * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS
22 * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
28 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
32 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
33 * DAMAGE.
34 *
35 * Carnegie Mellon encourages (but does not require) users of this
36 * software to return any improvements or extensions that they make,
37 * and to grant Carnegie Mellon the rights to redistribute these
38 * changes without encumbrance.
39 */
40/*
41 * H-FSC is described in Proceedings of SIGCOMM'97,
42 * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing,
43 * Real-Time and Priority Service"
44 * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng.
45 *
46 * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing.
47 * When a class has an upperlimit, the fit-time is computed from the
48 * upperlimit service curve. The link-sharing scheduler does not schedule
49 * a class whose fit-time exceeds the current time.
50 */
51
52#include <linux/kernel.h>
53#include <linux/config.h>
54#include <linux/module.h>
55#include <linux/types.h>
56#include <linux/errno.h>
57#include <linux/jiffies.h>
58#include <linux/compiler.h>
59#include <linux/spinlock.h>
60#include <linux/skbuff.h>
61#include <linux/string.h>
62#include <linux/slab.h>
63#include <linux/timer.h>
64#include <linux/list.h>
65#include <linux/rbtree.h>
66#include <linux/init.h>
67#include <linux/netdevice.h>
68#include <linux/rtnetlink.h>
69#include <linux/pkt_sched.h>
70#include <net/pkt_sched.h>
71#include <net/pkt_cls.h>
72#include <asm/system.h>
73#include <asm/div64.h>
74
75#define HFSC_DEBUG 1
76
77/*
78 * kernel internal service curve representation:
79 * coordinates are given by 64 bit unsigned integers.
80 * x-axis: unit is clock count.
81 * y-axis: unit is byte.
82 *
83 * The service curve parameters are converted to the internal
84 * representation. The slope values are scaled to avoid overflow.
85 * The inverse slope values as well as the y-projection of the 1st
86 * segment are kept in order to avoid 64-bit divide operations
87 * that are expensive on 32-bit architectures.
88 */
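
A minimal sketch of the fixed-point scheme described above (illustrative only: the shift widths, helper names and rate are assumptions, not the definitions this file introduces later). A slope is stored pre-multiplied by a power of two, so mapping time to bytes is a multiply plus a shift, and the stored inverse slope answers the byte-to-time direction the same way, avoiding 64-bit divides on 32-bit hosts:

#include <stdio.h>
#include <stdint.h>

#define SM_SHIFT  24    /* assumed scaling for slopes (bytes per tick)     */
#define ISM_SHIFT 10    /* assumed scaling for inverse slopes (ticks/byte) */

/* bytes produced by a segment with scaled slope sm over dx ticks */
static uint64_t seg_x2y(uint64_t dx, uint64_t sm)
{
	return (dx * sm) >> SM_SHIFT;
}

/* ticks needed to produce dy bytes, using the scaled inverse slope */
static uint64_t seg_y2x(uint64_t dy, uint64_t ism)
{
	return (dy * ism) >> ISM_SHIFT;
}

int main(void)
{
	uint64_t m   = 125;                     /* 125 bytes per clock tick */
	uint64_t sm  = m << SM_SHIFT;           /* scaled slope             */
	uint64_t ism = (1ULL << ISM_SHIFT) / m; /* scaled inverse slope     */

	/* 80 ticks at 125 bytes/tick -> 10000 bytes */
	printf("bytes after 80 ticks: %llu\n",
	       (unsigned long long)seg_x2y(80, sm));
	/* the crude integer inverse slope answers roughly 78 ticks here;
	   the real code chooses its scaling to keep this error small */
	printf("ticks for 10000 bytes: %llu\n",
	       (unsigned long long)seg_y2x(10000, ism));
	return 0;
}
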
89
90struct internal_sc
91{
92 u64 sm1; /* scaled slope of the 1st segment */
93 u64 ism1; /* scaled inverse-slope of the 1st segment */
94 u64 dx; /* the x-projection of the 1st segment */
95 u64 dy; /* the y-projection of the 1st segment */
96 u64 sm2; /* scaled slope of the 2nd segment */
97 u64 ism2; /* scaled inverse-slope of the 2nd segment */
98};
99
100/* runtime service curve */
101struct runtime_sc
102{
103 u64 x; /* current starting position on x-axis */
104 u64 y; /* current starting position on y-axis */
105 u64 sm1; /* scaled slope of the 1st segment */
106 u64 ism1; /* scaled inverse-slope of the 1st segment */
107 u64 dx; /* the x-projection of the 1st segment */
108 u64 dy; /* the y-projection of the 1st segment */
109 u64 sm2; /* scaled slope of the 2nd segment */
110 u64 ism2; /* scaled inverse-slope of the 2nd segment */
111};
112
113enum hfsc_class_flags
114{
115 HFSC_RSC = 0x1,
116 HFSC_FSC = 0x2,
117 HFSC_USC = 0x4
118};
119
120struct hfsc_class
121{
122 u32 classid; /* class id */
123 unsigned int refcnt; /* usage count */
124
125 struct gnet_stats_basic bstats;
126 struct gnet_stats_queue qstats;
127 struct gnet_stats_rate_est rate_est;
128 spinlock_t *stats_lock;
129 unsigned int level; /* class level in hierarchy */
130 struct tcf_proto *filter_list; /* filter list */
131 unsigned int filter_cnt; /* filter count */
132
133 struct hfsc_sched *sched; /* scheduler data */
134 struct hfsc_class *cl_parent; /* parent class */
135 struct list_head siblings; /* sibling classes */
136 struct list_head children; /* child classes */
137 struct Qdisc *qdisc; /* leaf qdisc */
138
139 struct rb_node el_node; /* qdisc's eligible tree member */
140 struct rb_root vt_tree; /* active children sorted by cl_vt */
141 struct rb_node vt_node; /* parent's vt_tree member */
142 struct rb_root cf_tree; /* active children sorted by cl_f */
143 struct rb_node cf_node; /* parent's cf_tree member */
144 struct list_head hlist; /* hash list member */
145 struct list_head dlist; /* drop list member */
146
147 u64 cl_total; /* total work in bytes */
148 u64 cl_cumul; /* cumulative work in bytes done by
149 real-time criteria */
150
151 u64 cl_d; /* deadline*/
152 u64 cl_e; /* eligible time */
153 u64 cl_vt; /* virtual time */
154 u64 cl_f; /* time when this class will fit for
155 link-sharing, max(myf, cfmin) */
156 u64 cl_myf; /* my fit-time (calculated from this
157 class's own upperlimit curve) */
158 u64 cl_myfadj; /* my fit-time adjustment (to cancel
159 history dependence) */
160 u64 cl_cfmin; /* earliest children's fit-time (used
161 with cl_myf to obtain cl_f) */
162 u64 cl_cvtmin; /* minimal virtual time among the
163 children fit for link-sharing
164 (monotonic within a period) */
165 u64 cl_vtadj; /* intra-period cumulative vt
166 adjustment */
167 u64 cl_vtoff; /* inter-period cumulative vt offset */
168 u64 cl_cvtmax; /* max child's vt in the last period */
169 u64 cl_cvtoff; /* cumulative cvtmax of all periods */
170 u64 cl_pcvtoff; /* parent's cvtoff at initialization
171 time */
172
173 struct internal_sc cl_rsc; /* internal real-time service curve */
174 struct internal_sc cl_fsc; /* internal fair service curve */
175 struct internal_sc cl_usc; /* internal upperlimit service curve */
176 struct runtime_sc cl_deadline; /* deadline curve */
177 struct runtime_sc cl_eligible; /* eligible curve */
178 struct runtime_sc cl_virtual; /* virtual curve */
179 struct runtime_sc cl_ulimit; /* upperlimit curve */
180
181 unsigned long cl_flags; /* which curves are valid */
182 unsigned long cl_vtperiod; /* vt period sequence number */
183 unsigned long cl_parentperiod;/* parent's vt period sequence number*/
184 unsigned long cl_nactive; /* number of active children */
185};
186
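/*
 * Summary of how the timestamps above relate to the three curves
 * (derived from the code below, for orientation only): cl_e and cl_d
 * are computed from the real-time service curve, cl_vt from the fair
 * service curve, and cl_myf from the upperlimit curve; cl_f, the
 * maximum of cl_myf and cl_cfmin, is the time from which the
 * link-sharing scheduler may pick this class again.
 */
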
187#define HFSC_HSIZE 16
188
189struct hfsc_sched
190{
191 u16 defcls; /* default class id */
192 struct hfsc_class root; /* root class */
193 struct list_head clhash[HFSC_HSIZE]; /* class hash */
194 struct rb_root eligible; /* eligible tree */
195 struct list_head droplist; /* active leaf class list (for
196 dropping) */
197 struct sk_buff_head requeue; /* requeued packet */
198 struct timer_list wd_timer; /* watchdog timer */
199};
200
201/*
202 * macros
203 */
204#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY
205#include <linux/time.h>
206#undef PSCHED_GET_TIME
207#define PSCHED_GET_TIME(stamp) \
208do { \
209 struct timeval tv; \
210 do_gettimeofday(&tv); \
211 (stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \
212} while (0)
213#endif
214
215#if HFSC_DEBUG
216#define ASSERT(cond) \
217do { \
218 if (unlikely(!(cond))) \
219 printk("assertion %s failed at %s:%i (%s)\n", \
220 #cond, __FILE__, __LINE__, __FUNCTION__); \
221} while (0)
222#else
223#define ASSERT(cond)
224#endif /* HFSC_DEBUG */
225
226#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
227
228
229/*
230 * the eligible tree holds backlogged classes sorted by their eligible times.
231 * there is one eligible tree per hfsc instance.
232 */
233
234static void
235eltree_insert(struct hfsc_class *cl)
236{
237 struct rb_node **p = &cl->sched->eligible.rb_node;
238 struct rb_node *parent = NULL;
239 struct hfsc_class *cl1;
240
241 while (*p != NULL) {
242 parent = *p;
243 cl1 = rb_entry(parent, struct hfsc_class, el_node);
244 if (cl->cl_e >= cl1->cl_e)
245 p = &parent->rb_right;
246 else
247 p = &parent->rb_left;
248 }
249 rb_link_node(&cl->el_node, parent, p);
250 rb_insert_color(&cl->el_node, &cl->sched->eligible);
251}
252
253static inline void
254eltree_remove(struct hfsc_class *cl)
255{
256 rb_erase(&cl->el_node, &cl->sched->eligible);
257}
258
259static inline void
260eltree_update(struct hfsc_class *cl)
261{
262 eltree_remove(cl);
263 eltree_insert(cl);
264}
265
266/* find the class with the minimum deadline among the eligible classes */
267static inline struct hfsc_class *
268eltree_get_mindl(struct hfsc_sched *q, u64 cur_time)
269{
270 struct hfsc_class *p, *cl = NULL;
271 struct rb_node *n;
272
273 for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) {
274 p = rb_entry(n, struct hfsc_class, el_node);
275 if (p->cl_e > cur_time)
276 break;
277 if (cl == NULL || p->cl_d < cl->cl_d)
278 cl = p;
279 }
280 return cl;
281}
282
283/* find the class with minimum eligible time among the eligible classes */
284static inline struct hfsc_class *
285eltree_get_minel(struct hfsc_sched *q)
286{
287 struct rb_node *n;
288
289 n = rb_first(&q->eligible);
290 if (n == NULL)
291 return NULL;
292 return rb_entry(n, struct hfsc_class, el_node);
293}
294
295/*
296 * vttree holds backlogged child classes sorted by their virtual
297 * time. each intermediate class has one vttree.
298 */
299static void
300vttree_insert(struct hfsc_class *cl)
301{
302 struct rb_node **p = &cl->cl_parent->vt_tree.rb_node;
303 struct rb_node *parent = NULL;
304 struct hfsc_class *cl1;
305
306 while (*p != NULL) {
307 parent = *p;
308 cl1 = rb_entry(parent, struct hfsc_class, vt_node);
309 if (cl->cl_vt >= cl1->cl_vt)
310 p = &parent->rb_right;
311 else
312 p = &parent->rb_left;
313 }
314 rb_link_node(&cl->vt_node, parent, p);
315 rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree);
316}
317
318static inline void
319vttree_remove(struct hfsc_class *cl)
320{
321 rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree);
322}
323
324static inline void
325vttree_update(struct hfsc_class *cl)
326{
327 vttree_remove(cl);
328 vttree_insert(cl);
329}
330
331static inline struct hfsc_class *
332vttree_firstfit(struct hfsc_class *cl, u64 cur_time)
333{
334 struct hfsc_class *p;
335 struct rb_node *n;
336
337 for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) {
338 p = rb_entry(n, struct hfsc_class, vt_node);
339 if (p->cl_f <= cur_time)
340 return p;
341 }
342 return NULL;
343}
344
345/*
346 * get the leaf class with the minimum vt in the hierarchy
347 */
348static struct hfsc_class *
349vttree_get_minvt(struct hfsc_class *cl, u64 cur_time)
350{
351 /* if root-class's cfmin is bigger than cur_time nothing to do */
352 if (cl->cl_cfmin > cur_time)
353 return NULL;
354
355 while (cl->level > 0) {
356 cl = vttree_firstfit(cl, cur_time);
357 if (cl == NULL)
358 return NULL;
359 /*
360 * update parent's cl_cvtmin.
361 */
362 if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
363 cl->cl_parent->cl_cvtmin = cl->cl_vt;
364 }
365 return cl;
366}
367
368static void
369cftree_insert(struct hfsc_class *cl)
370{
371 struct rb_node **p = &cl->cl_parent->cf_tree.rb_node;
372 struct rb_node *parent = NULL;
373 struct hfsc_class *cl1;
374
375 while (*p != NULL) {
376 parent = *p;
377 cl1 = rb_entry(parent, struct hfsc_class, cf_node);
378 if (cl->cl_f >= cl1->cl_f)
379 p = &parent->rb_right;
380 else
381 p = &parent->rb_left;
382 }
383 rb_link_node(&cl->cf_node, parent, p);
384 rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree);
385}
386
387static inline void
388cftree_remove(struct hfsc_class *cl)
389{
390 rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree);
391}
392
393static inline void
394cftree_update(struct hfsc_class *cl)
395{
396 cftree_remove(cl);
397 cftree_insert(cl);
398}
399
400/*
401 * service curve support functions
402 *
403 * external service curve parameters
404 * m: bps
405 * d: us
406 * internal service curve parameters
407 * sm: (bytes/psched_us) << SM_SHIFT
408 * ism: (psched_us/byte) << ISM_SHIFT
409 * dx: psched_us
410 *
411 * Clock source resolution (CONFIG_NET_SCH_CLK_*)
412 * JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us.
413 * CPU: resolution is between 0.5us and 1us.
414 * GETTIMEOFDAY: resolution is exactly 1us.
415 *
416 * sm and ism are scaled in order to keep effective digits.
417 * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective
418 * digits in decimal, as shown in the table below.
419 *
420 * Note: We can afford the additional accuracy (altq hfsc keeps at most
421 * 3 effective digits) thanks to the fact that the Linux clock is bounded
422 * much more tightly.
423 *
424 * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps
425 * ------------+-------------------------------------------------------
426 * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-3 62500e-3
427 * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3
428 * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3
429 *
430 * 0.5us/byte 160 16 1.6 0.16 0.016
431 * us/byte 80 8 0.8 0.08 0.008
432 * 1.27us/byte 63 6.3 0.63 0.063 0.0063
433 */
434#define SM_SHIFT 20
435#define ISM_SHIFT 18
436
437#define SM_MASK ((1ULL << SM_SHIFT) - 1)
438#define ISM_MASK ((1ULL << ISM_SHIFT) - 1)
439
440static inline u64
441seg_x2y(u64 x, u64 sm)
442{
443 u64 y;
444
445 /*
446 * compute
447 * y = x * sm >> SM_SHIFT
448 * but divide it for the upper and lower bits to avoid overflow
449 */
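 /*
  * splitting x into (x >> SM_SHIFT) and (x & SM_MASK) gives a result
  * identical to the untruncated (x * sm) >> SM_SHIFT: the low partial
  * product fits in 64 bits whenever sm has at most 64 - SM_SHIFT
  * significant bits, and the high partial product never exceeds the
  * final value of y.
  */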
450 y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
451 return y;
452}
453
454static inline u64
455seg_y2x(u64 y, u64 ism)
456{
457 u64 x;
458
459 if (y == 0)
460 x = 0;
461 else if (ism == HT_INFINITY)
462 x = HT_INFINITY;
463 else {
464 x = (y >> ISM_SHIFT) * ism
465 + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
466 }
467 return x;
468}
469
470/* Convert m (bps) into sm (bytes/psched us) */
471static u64
472m2sm(u32 m)
473{
474 u64 sm;
475
476 sm = ((u64)m << SM_SHIFT);
477 sm += PSCHED_JIFFIE2US(HZ) - 1;
478 do_div(sm, PSCHED_JIFFIE2US(HZ));
479 return sm;
480}
481
482/* convert m (bps) into ism (psched us/byte) */
483static u64
484m2ism(u32 m)
485{
486 u64 ism;
487
488 if (m == 0)
489 ism = HT_INFINITY;
490 else {
491 ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT);
492 ism += m - 1;
493 do_div(ism, m);
494 }
495 return ism;
496}
497
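/*
 * Worked example (illustration only, assuming a 1us clock source so
 * that PSCHED_JIFFIE2US(HZ) == 1000000): for m = 125000, i.e. the byte
 * rate of a 1 Mbit/s class,
 *
 *   m2sm(125000)  = (125000 << 20) / 1000000 = 131072
 *                 = 0.125 bytes/us in Q20 fixed point,
 *   m2ism(125000) = (1000000 << 18) / 125000 = 2097152
 *                 = 8 us/byte in Q18 fixed point,
 *
 * which matches the 1Mbps column of the table above.
 */
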
498/* convert d (us) into dx (psched us) */
499static u64
500d2dx(u32 d)
501{
502 u64 dx;
503
504 dx = ((u64)d * PSCHED_JIFFIE2US(HZ));
505 dx += 1000000 - 1;
506 do_div(dx, 1000000);
507 return dx;
508}
509
510/* convert sm (bytes/psched us) into m (bps) */
511static u32
512sm2m(u64 sm)
513{
514 u64 m;
515
516 m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT;
517 return (u32)m;
518}
519
520/* convert dx (psched us) into d (us) */
521static u32
522dx2d(u64 dx)
523{
524 u64 d;
525
526 d = dx * 1000000;
527 do_div(d, PSCHED_JIFFIE2US(HZ));
528 return (u32)d;
529}
530
531static void
532sc2isc(struct tc_service_curve *sc, struct internal_sc *isc)
533{
534 isc->sm1 = m2sm(sc->m1);
535 isc->ism1 = m2ism(sc->m1);
536 isc->dx = d2dx(sc->d);
537 isc->dy = seg_x2y(isc->dx, isc->sm1);
538 isc->sm2 = m2sm(sc->m2);
539 isc->ism2 = m2ism(sc->m2);
540}
541
542/*
543 * initialize the runtime service curve with the given internal
544 * service curve starting at (x, y).
545 */
546static void
547rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
548{
549 rtsc->x = x;
550 rtsc->y = y;
551 rtsc->sm1 = isc->sm1;
552 rtsc->ism1 = isc->ism1;
553 rtsc->dx = isc->dx;
554 rtsc->dy = isc->dy;
555 rtsc->sm2 = isc->sm2;
556 rtsc->ism2 = isc->ism2;
557}
558
559/*
560 * calculate the x-projection of the runtime service curve for the
561 * given y-projection value
562 */
563static u64
564rtsc_y2x(struct runtime_sc *rtsc, u64 y)
565{
566 u64 x;
567
568 if (y < rtsc->y)
569 x = rtsc->x;
570 else if (y <= rtsc->y + rtsc->dy) {
571 /* x belongs to the 1st segment */
572 if (rtsc->dy == 0)
573 x = rtsc->x + rtsc->dx;
574 else
575 x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
576 } else {
577 /* x belongs to the 2nd segment */
578 x = rtsc->x + rtsc->dx
579 + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
580 }
581 return x;
582}
583
584static u64
585rtsc_x2y(struct runtime_sc *rtsc, u64 x)
586{
587 u64 y;
588
589 if (x <= rtsc->x)
590 y = rtsc->y;
591 else if (x <= rtsc->x + rtsc->dx)
592 /* y belongs to the 1st segment */
593 y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
594 else
595 /* y belongs to the 2nd segment */
596 y = rtsc->y + rtsc->dy
597 + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
598 return y;
599}
600
601/*
602 * update the runtime service curve by taking the minimum of the current
603 * runtime service curve and the service curve starting at (x, y).
604 */
605static void
606rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
607{
608 u64 y1, y2, dx, dy;
609 u32 dsm;
610
611 if (isc->sm1 <= isc->sm2) {
612 /* service curve is convex */
613 y1 = rtsc_x2y(rtsc, x);
614 if (y1 < y)
615 /* the current rtsc is smaller */
616 return;
617 rtsc->x = x;
618 rtsc->y = y;
619 return;
620 }
621
622 /*
623 * service curve is concave
624 * compute the two y values of the current rtsc
625 * y1: at x
626 * y2: at (x + dx)
627 */
628 y1 = rtsc_x2y(rtsc, x);
629 if (y1 <= y) {
630 /* rtsc is below isc, no change to rtsc */
631 return;
632 }
633
634 y2 = rtsc_x2y(rtsc, x + isc->dx);
635 if (y2 >= y + isc->dy) {
636 /* rtsc is above isc, replace rtsc by isc */
637 rtsc->x = x;
638 rtsc->y = y;
639 rtsc->dx = isc->dx;
640 rtsc->dy = isc->dy;
641 return;
642 }
643
644 /*
645 * the two curves intersect
646 * compute the offsets (dx, dy) using the reverse
647 * function of seg_x2y()
648 * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
649 */
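 /*
  * since seg_x2y(t, sm) is essentially t * sm >> SM_SHIFT, the relation
  * above reduces (ignoring fixed-point rounding) to
  * dx * (sm1 - sm2) == (y1 - y) << SM_SHIFT, which is solved for dx
  * below.
  */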
650 dx = (y1 - y) << SM_SHIFT;
651 dsm = isc->sm1 - isc->sm2;
652 do_div(dx, dsm);
653 /*
654 * check if (x, y1) belongs to the 1st segment of rtsc.
655 * if so, add the offset.
656 */
657 if (rtsc->x + rtsc->dx > x)
658 dx += rtsc->x + rtsc->dx - x;
659 dy = seg_x2y(dx, isc->sm1);
660
661 rtsc->x = x;
662 rtsc->y = y;
663 rtsc->dx = dx;
664 rtsc->dy = dy;
665 return;
666}
667
668static void
669init_ed(struct hfsc_class *cl, unsigned int next_len)
670{
671 u64 cur_time;
672
673 PSCHED_GET_TIME(cur_time);
674
675 /* update the deadline curve */
676 rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
677
678 /*
679 * update the eligible curve.
680 * for concave, it is equal to the deadline curve.
681 * for convex, it is a linear curve with slope m2.
682 */
683 cl->cl_eligible = cl->cl_deadline;
684 if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
685 cl->cl_eligible.dx = 0;
686 cl->cl_eligible.dy = 0;
687 }
688
689 /* compute e and d */
690 cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
691 cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
692
693 eltree_insert(cl);
694}
695
696static void
697update_ed(struct hfsc_class *cl, unsigned int next_len)
698{
699 cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
700 cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
701
702 eltree_update(cl);
703}
704
705static inline void
706update_d(struct hfsc_class *cl, unsigned int next_len)
707{
708 cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
709}
710
711static inline void
712update_cfmin(struct hfsc_class *cl)
713{
714 struct rb_node *n = rb_first(&cl->cf_tree);
715 struct hfsc_class *p;
716
717 if (n == NULL) {
718 cl->cl_cfmin = 0;
719 return;
720 }
721 p = rb_entry(n, struct hfsc_class, cf_node);
722 cl->cl_cfmin = p->cl_f;
723}
724
725static void
726init_vf(struct hfsc_class *cl, unsigned int len)
727{
728 struct hfsc_class *max_cl;
729 struct rb_node *n;
730 u64 vt, f, cur_time;
731 int go_active;
732
733 cur_time = 0;
734 go_active = 1;
735 for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
736 if (go_active && cl->cl_nactive++ == 0)
737 go_active = 1;
738 else
739 go_active = 0;
740
741 if (go_active) {
742 n = rb_last(&cl->cl_parent->vt_tree);
743 if (n != NULL) {
744 max_cl = rb_entry(n, struct hfsc_class,vt_node);
745 /*
746 * set vt to the average of the min and max
747 * classes. if the parent's period didn't
748 * change, don't decrease vt of the class.
749 */
750 vt = max_cl->cl_vt;
751 if (cl->cl_parent->cl_cvtmin != 0)
752 vt = (cl->cl_parent->cl_cvtmin + vt)/2;
753
754 if (cl->cl_parent->cl_vtperiod !=
755 cl->cl_parentperiod || vt > cl->cl_vt)
756 cl->cl_vt = vt;
757 } else {
758 /*
759 * first child for a new parent backlog period.
760 * add parent's cvtmax to cvtoff to make a new
761 * vt (vtoff + vt) larger than the vt in the
762 * last period for all children.
763 */
764 vt = cl->cl_parent->cl_cvtmax;
765 cl->cl_parent->cl_cvtoff += vt;
766 cl->cl_parent->cl_cvtmax = 0;
767 cl->cl_parent->cl_cvtmin = 0;
768 cl->cl_vt = 0;
769 }
770
771 cl->cl_vtoff = cl->cl_parent->cl_cvtoff -
772 cl->cl_pcvtoff;
773
774 /* update the virtual curve */
775 vt = cl->cl_vt + cl->cl_vtoff;
776 rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
777 cl->cl_total);
778 if (cl->cl_virtual.x == vt) {
779 cl->cl_virtual.x -= cl->cl_vtoff;
780 cl->cl_vtoff = 0;
781 }
782 cl->cl_vtadj = 0;
783
784 cl->cl_vtperiod++; /* increment vt period */
785 cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
786 if (cl->cl_parent->cl_nactive == 0)
787 cl->cl_parentperiod++;
788 cl->cl_f = 0;
789
790 vttree_insert(cl);
791 cftree_insert(cl);
792
793 if (cl->cl_flags & HFSC_USC) {
794 /* class has upper limit curve */
795 if (cur_time == 0)
796 PSCHED_GET_TIME(cur_time);
797
798 /* update the ulimit curve */
799 rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time,
800 cl->cl_total);
801 /* compute myf */
802 cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
803 cl->cl_total);
804 cl->cl_myfadj = 0;
805 }
806 }
807
808 f = max(cl->cl_myf, cl->cl_cfmin);
809 if (f != cl->cl_f) {
810 cl->cl_f = f;
811 cftree_update(cl);
812 update_cfmin(cl->cl_parent);
813 }
814 }
815}
816
817static void
818update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
819{
820 u64 f; /* , myf_bound, delta; */
821 int go_passive = 0;
822
823 if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC)
824 go_passive = 1;
825
826 for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
827 cl->cl_total += len;
828
829 if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
830 continue;
831
832 if (go_passive && --cl->cl_nactive == 0)
833 go_passive = 1;
834 else
835 go_passive = 0;
836
837 if (go_passive) {
838 /* no more active child, going passive */
839
840 /* update cvtmax of the parent class */
841 if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
842 cl->cl_parent->cl_cvtmax = cl->cl_vt;
843
844 /* remove this class from the vt tree */
845 vttree_remove(cl);
846
847 cftree_remove(cl);
848 update_cfmin(cl->cl_parent);
849
850 continue;
851 }
852
853 /*
854 * update vt and f
855 */
856 cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
857 - cl->cl_vtoff + cl->cl_vtadj;
858
859 /*
860 * if vt of the class is smaller than cvtmin,
861 * the class was skipped in the past due to non-fit.
862 * if so, we need to adjust vtadj.
863 */
864 if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
865 cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
866 cl->cl_vt = cl->cl_parent->cl_cvtmin;
867 }
868
869 /* update the vt tree */
870 vttree_update(cl);
871
872 if (cl->cl_flags & HFSC_USC) {
873 cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
874 cl->cl_total);
875#if 0
876 /*
877 * This code causes classes to stay way under their
878 * limit when multiple classes are used at gigabit
879 * speed. needs investigation. -kaber
880 */
881 /*
882 * if myf lags behind by more than one clock tick
883 * from the current time, adjust myfadj to prevent
884 * a rate-limited class from going greedy.
885 * in a steady state under rate-limiting, myf
886 * fluctuates within one clock tick.
887 */
888 myf_bound = cur_time - PSCHED_JIFFIE2US(1);
889 if (cl->cl_myf < myf_bound) {
890 delta = cur_time - cl->cl_myf;
891 cl->cl_myfadj += delta;
892 cl->cl_myf += delta;
893 }
894#endif
895 }
896
897 f = max(cl->cl_myf, cl->cl_cfmin);
898 if (f != cl->cl_f) {
899 cl->cl_f = f;
900 cftree_update(cl);
901 update_cfmin(cl->cl_parent);
902 }
903 }
904}
905
906static void
907set_active(struct hfsc_class *cl, unsigned int len)
908{
909 if (cl->cl_flags & HFSC_RSC)
910 init_ed(cl, len);
911 if (cl->cl_flags & HFSC_FSC)
912 init_vf(cl, len);
913
914 list_add_tail(&cl->dlist, &cl->sched->droplist);
915}
916
917static void
918set_passive(struct hfsc_class *cl)
919{
920 if (cl->cl_flags & HFSC_RSC)
921 eltree_remove(cl);
922
923 list_del(&cl->dlist);
924
925 /*
926 * vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
927 * needs to be called explicitly to remove a class from vttree.
928 */
929}
930
931/*
932 * hack to get length of first packet in queue.
933 */
934static unsigned int
935qdisc_peek_len(struct Qdisc *sch)
936{
937 struct sk_buff *skb;
938 unsigned int len;
939
940 skb = sch->dequeue(sch);
941 if (skb == NULL) {
942 if (net_ratelimit())
943 printk("qdisc_peek_len: non work-conserving qdisc ?\n");
944 return 0;
945 }
946 len = skb->len;
947 if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) {
948 if (net_ratelimit())
949 printk("qdisc_peek_len: failed to requeue\n");
950 return 0;
951 }
952 return len;
953}
954
955static void
956hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
957{
958 unsigned int len = cl->qdisc->q.qlen;
959
960 qdisc_reset(cl->qdisc);
961 if (len > 0) {
962 update_vf(cl, 0, 0);
963 set_passive(cl);
964 sch->q.qlen -= len;
965 }
966}
967
968static void
969hfsc_adjust_levels(struct hfsc_class *cl)
970{
971 struct hfsc_class *p;
972 unsigned int level;
973
974 do {
975 level = 0;
976 list_for_each_entry(p, &cl->children, siblings) {
977 if (p->level > level)
978 level = p->level;
979 }
980 cl->level = level + 1;
981 } while ((cl = cl->cl_parent) != NULL);
982}
983
984static inline unsigned int
985hfsc_hash(u32 h)
986{
987 h ^= h >> 8;
988 h ^= h >> 4;
989
990 return h & (HFSC_HSIZE - 1);
991}
992
993static inline struct hfsc_class *
994hfsc_find_class(u32 classid, struct Qdisc *sch)
995{
996 struct hfsc_sched *q = qdisc_priv(sch);
997 struct hfsc_class *cl;
998
999 list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) {
1000 if (cl->classid == classid)
1001 return cl;
1002 }
1003 return NULL;
1004}
1005
1006static void
1007hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc,
1008 u64 cur_time)
1009{
1010 sc2isc(rsc, &cl->cl_rsc);
1011 rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
1012 cl->cl_eligible = cl->cl_deadline;
1013 if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
1014 cl->cl_eligible.dx = 0;
1015 cl->cl_eligible.dy = 0;
1016 }
1017 cl->cl_flags |= HFSC_RSC;
1018}
1019
1020static void
1021hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
1022{
1023 sc2isc(fsc, &cl->cl_fsc);
1024 rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
1025 cl->cl_flags |= HFSC_FSC;
1026}
1027
1028static void
1029hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc,
1030 u64 cur_time)
1031{
1032 sc2isc(usc, &cl->cl_usc);
1033 rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total);
1034 cl->cl_flags |= HFSC_USC;
1035}
1036
1037static int
1038hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
1039 struct rtattr **tca, unsigned long *arg)
1040{
1041 struct hfsc_sched *q = qdisc_priv(sch);
1042 struct hfsc_class *cl = (struct hfsc_class *)*arg;
1043 struct hfsc_class *parent = NULL;
1044 struct rtattr *opt = tca[TCA_OPTIONS-1];
1045 struct rtattr *tb[TCA_HFSC_MAX];
1046 struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL;
1047 u64 cur_time;
1048
1049 if (opt == NULL || rtattr_parse_nested(tb, TCA_HFSC_MAX, opt))
1050 return -EINVAL;
1051
1052 if (tb[TCA_HFSC_RSC-1]) {
1053 if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc))
1054 return -EINVAL;
1055 rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]);
1056 if (rsc->m1 == 0 && rsc->m2 == 0)
1057 rsc = NULL;
1058 }
1059
1060 if (tb[TCA_HFSC_FSC-1]) {
1061 if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc))
1062 return -EINVAL;
1063 fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]);
1064 if (fsc->m1 == 0 && fsc->m2 == 0)
1065 fsc = NULL;
1066 }
1067
1068 if (tb[TCA_HFSC_USC-1]) {
1069 if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc))
1070 return -EINVAL;
1071 usc = RTA_DATA(tb[TCA_HFSC_USC-1]);
1072 if (usc->m1 == 0 && usc->m2 == 0)
1073 usc = NULL;
1074 }
1075
1076 if (cl != NULL) {
1077 if (parentid) {
1078 if (cl->cl_parent && cl->cl_parent->classid != parentid)
1079 return -EINVAL;
1080 if (cl->cl_parent == NULL && parentid != TC_H_ROOT)
1081 return -EINVAL;
1082 }
1083 PSCHED_GET_TIME(cur_time);
1084
1085 sch_tree_lock(sch);
1086 if (rsc != NULL)
1087 hfsc_change_rsc(cl, rsc, cur_time);
1088 if (fsc != NULL)
1089 hfsc_change_fsc(cl, fsc);
1090 if (usc != NULL)
1091 hfsc_change_usc(cl, usc, cur_time);
1092
1093 if (cl->qdisc->q.qlen != 0) {
1094 if (cl->cl_flags & HFSC_RSC)
1095 update_ed(cl, qdisc_peek_len(cl->qdisc));
1096 if (cl->cl_flags & HFSC_FSC)
1097 update_vf(cl, 0, cur_time);
1098 }
1099 sch_tree_unlock(sch);
1100
1101#ifdef CONFIG_NET_ESTIMATOR
1102 if (tca[TCA_RATE-1])
1103 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1104 cl->stats_lock, tca[TCA_RATE-1]);
1105#endif
1106 return 0;
1107 }
1108
1109 if (parentid == TC_H_ROOT)
1110 return -EEXIST;
1111
1112 parent = &q->root;
1113 if (parentid) {
1114 parent = hfsc_find_class(parentid, sch);
1115 if (parent == NULL)
1116 return -ENOENT;
1117 }
1118
1119 if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0)
1120 return -EINVAL;
1121 if (hfsc_find_class(classid, sch))
1122 return -EEXIST;
1123
1124 if (rsc == NULL && fsc == NULL)
1125 return -EINVAL;
1126
1127 cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL);
1128 if (cl == NULL)
1129 return -ENOBUFS;
1130 memset(cl, 0, sizeof(struct hfsc_class));
1131
1132 if (rsc != NULL)
1133 hfsc_change_rsc(cl, rsc, 0);
1134 if (fsc != NULL)
1135 hfsc_change_fsc(cl, fsc);
1136 if (usc != NULL)
1137 hfsc_change_usc(cl, usc, 0);
1138
1139 cl->refcnt = 1;
1140 cl->classid = classid;
1141 cl->sched = q;
1142 cl->cl_parent = parent;
1143 cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
1144 if (cl->qdisc == NULL)
1145 cl->qdisc = &noop_qdisc;
1146 cl->stats_lock = &sch->dev->queue_lock;
1147 INIT_LIST_HEAD(&cl->children);
1148 cl->vt_tree = RB_ROOT;
1149 cl->cf_tree = RB_ROOT;
1150
1151 sch_tree_lock(sch);
1152 list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]);
1153 list_add_tail(&cl->siblings, &parent->children);
1154 if (parent->level == 0)
1155 hfsc_purge_queue(sch, parent);
1156 hfsc_adjust_levels(parent);
1157 cl->cl_pcvtoff = parent->cl_cvtoff;
1158 sch_tree_unlock(sch);
1159
1160#ifdef CONFIG_NET_ESTIMATOR
1161 if (tca[TCA_RATE-1])
1162 gen_new_estimator(&cl->bstats, &cl->rate_est,
1163 cl->stats_lock, tca[TCA_RATE-1]);
1164#endif
1165 *arg = (unsigned long)cl;
1166 return 0;
1167}
1168
1169static void
1170hfsc_destroy_filters(struct tcf_proto **fl)
1171{
1172 struct tcf_proto *tp;
1173
1174 while ((tp = *fl) != NULL) {
1175 *fl = tp->next;
1176 tcf_destroy(tp);
1177 }
1178}
1179
1180static void
1181hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
1182{
1183 struct hfsc_sched *q = qdisc_priv(sch);
1184
1185 hfsc_destroy_filters(&cl->filter_list);
1186 qdisc_destroy(cl->qdisc);
1187#ifdef CONFIG_NET_ESTIMATOR
1188 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1189#endif
1190 if (cl != &q->root)
1191 kfree(cl);
1192}
1193
1194static int
1195hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
1196{
1197 struct hfsc_sched *q = qdisc_priv(sch);
1198 struct hfsc_class *cl = (struct hfsc_class *)arg;
1199
1200 if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root)
1201 return -EBUSY;
1202
1203 sch_tree_lock(sch);
1204
1205 list_del(&cl->hlist);
1206 list_del(&cl->siblings);
1207 hfsc_adjust_levels(cl->cl_parent);
1208 hfsc_purge_queue(sch, cl);
1209 if (--cl->refcnt == 0)
1210 hfsc_destroy_class(sch, cl);
1211
1212 sch_tree_unlock(sch);
1213 return 0;
1214}
1215
1216static struct hfsc_class *
1217hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
1218{
1219 struct hfsc_sched *q = qdisc_priv(sch);
1220 struct hfsc_class *cl;
1221 struct tcf_result res;
1222 struct tcf_proto *tcf;
1223 int result;
1224
1225 if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 &&
1226 (cl = hfsc_find_class(skb->priority, sch)) != NULL)
1227 if (cl->level == 0)
1228 return cl;
1229
1230 *qerr = NET_XMIT_DROP;
1231 tcf = q->root.filter_list;
1232 while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
1233#ifdef CONFIG_NET_CLS_ACT
1234 switch (result) {
1235 case TC_ACT_QUEUED:
1236 case TC_ACT_STOLEN:
1237 *qerr = NET_XMIT_SUCCESS;
1238 case TC_ACT_SHOT:
1239 return NULL;
1240 }
1241#elif defined(CONFIG_NET_CLS_POLICE)
1242 if (result == TC_POLICE_SHOT)
1243 return NULL;
1244#endif
1245 if ((cl = (struct hfsc_class *)res.class) == NULL) {
1246 if ((cl = hfsc_find_class(res.classid, sch)) == NULL)
1247 break; /* filter selected invalid classid */
1248 }
1249
1250 if (cl->level == 0)
1251 return cl; /* hit leaf class */
1252
1253 /* apply inner filter chain */
1254 tcf = cl->filter_list;
1255 }
1256
1257 /* classification failed, try default class */
1258 cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
1259 if (cl == NULL || cl->level > 0)
1260 return NULL;
1261
1262 return cl;
1263}
1264
1265static int
1266hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1267 struct Qdisc **old)
1268{
1269 struct hfsc_class *cl = (struct hfsc_class *)arg;
1270
1271 if (cl == NULL)
1272 return -ENOENT;
1273 if (cl->level > 0)
1274 return -EINVAL;
1275 if (new == NULL) {
1276 new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
1277 if (new == NULL)
1278 new = &noop_qdisc;
1279 }
1280
1281 sch_tree_lock(sch);
1282 hfsc_purge_queue(sch, cl);
1283 *old = xchg(&cl->qdisc, new);
1284 sch_tree_unlock(sch);
1285 return 0;
1286}
1287
1288static struct Qdisc *
1289hfsc_class_leaf(struct Qdisc *sch, unsigned long arg)
1290{
1291 struct hfsc_class *cl = (struct hfsc_class *)arg;
1292
1293 if (cl != NULL && cl->level == 0)
1294 return cl->qdisc;
1295
1296 return NULL;
1297}
1298
1299static unsigned long
1300hfsc_get_class(struct Qdisc *sch, u32 classid)
1301{
1302 struct hfsc_class *cl = hfsc_find_class(classid, sch);
1303
1304 if (cl != NULL)
1305 cl->refcnt++;
1306
1307 return (unsigned long)cl;
1308}
1309
1310static void
1311hfsc_put_class(struct Qdisc *sch, unsigned long arg)
1312{
1313 struct hfsc_class *cl = (struct hfsc_class *)arg;
1314
1315 if (--cl->refcnt == 0)
1316 hfsc_destroy_class(sch, cl);
1317}
1318
1319static unsigned long
1320hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid)
1321{
1322 struct hfsc_class *p = (struct hfsc_class *)parent;
1323 struct hfsc_class *cl = hfsc_find_class(classid, sch);
1324
1325 if (cl != NULL) {
1326 if (p != NULL && p->level <= cl->level)
1327 return 0;
1328 cl->filter_cnt++;
1329 }
1330
1331 return (unsigned long)cl;
1332}
1333
1334static void
1335hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
1336{
1337 struct hfsc_class *cl = (struct hfsc_class *)arg;
1338
1339 cl->filter_cnt--;
1340}
1341
1342static struct tcf_proto **
1343hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg)
1344{
1345 struct hfsc_sched *q = qdisc_priv(sch);
1346 struct hfsc_class *cl = (struct hfsc_class *)arg;
1347
1348 if (cl == NULL)
1349 cl = &q->root;
1350
1351 return &cl->filter_list;
1352}
1353
1354static int
1355hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
1356{
1357 struct tc_service_curve tsc;
1358
1359 tsc.m1 = sm2m(sc->sm1);
1360 tsc.d = dx2d(sc->dx);
1361 tsc.m2 = sm2m(sc->sm2);
1362 RTA_PUT(skb, attr, sizeof(tsc), &tsc);
1363
1364 return skb->len;
1365
1366 rtattr_failure:
1367 return -1;
1368}
1369
1370static inline int
1371hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
1372{
1373 if ((cl->cl_flags & HFSC_RSC) &&
1374 (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0))
1375 goto rtattr_failure;
1376
1377 if ((cl->cl_flags & HFSC_FSC) &&
1378 (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0))
1379 goto rtattr_failure;
1380
1381 if ((cl->cl_flags & HFSC_USC) &&
1382 (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0))
1383 goto rtattr_failure;
1384
1385 return skb->len;
1386
1387 rtattr_failure:
1388 return -1;
1389}
1390
1391static int
1392hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
1393 struct tcmsg *tcm)
1394{
1395 struct hfsc_class *cl = (struct hfsc_class *)arg;
1396 unsigned char *b = skb->tail;
1397 struct rtattr *rta = (struct rtattr *)b;
1398
1399 tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT;
1400 tcm->tcm_handle = cl->classid;
1401 if (cl->level == 0)
1402 tcm->tcm_info = cl->qdisc->handle;
1403
1404 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1405 if (hfsc_dump_curves(skb, cl) < 0)
1406 goto rtattr_failure;
1407 rta->rta_len = skb->tail - b;
1408 return skb->len;
1409
1410 rtattr_failure:
1411 skb_trim(skb, b - skb->data);
1412 return -1;
1413}
1414
1415static int
1416hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1417 struct gnet_dump *d)
1418{
1419 struct hfsc_class *cl = (struct hfsc_class *)arg;
1420 struct tc_hfsc_stats xstats;
1421
1422 cl->qstats.qlen = cl->qdisc->q.qlen;
1423 xstats.level = cl->level;
1424 xstats.period = cl->cl_vtperiod;
1425 xstats.work = cl->cl_total;
1426 xstats.rtwork = cl->cl_cumul;
1427
1428 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1429#ifdef CONFIG_NET_ESTIMATOR
1430 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1431#endif
1432 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1433 return -1;
1434
1435 return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
1436}
1437
1438
1439
1440static void
1441hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1442{
1443 struct hfsc_sched *q = qdisc_priv(sch);
1444 struct hfsc_class *cl;
1445 unsigned int i;
1446
1447 if (arg->stop)
1448 return;
1449
1450 for (i = 0; i < HFSC_HSIZE; i++) {
1451 list_for_each_entry(cl, &q->clhash[i], hlist) {
1452 if (arg->count < arg->skip) {
1453 arg->count++;
1454 continue;
1455 }
1456 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1457 arg->stop = 1;
1458 return;
1459 }
1460 arg->count++;
1461 }
1462 }
1463}
1464
1465static void
1466hfsc_watchdog(unsigned long arg)
1467{
1468 struct Qdisc *sch = (struct Qdisc *)arg;
1469
1470 sch->flags &= ~TCQ_F_THROTTLED;
1471 netif_schedule(sch->dev);
1472}
1473
1474static void
1475hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time)
1476{
1477 struct hfsc_sched *q = qdisc_priv(sch);
1478 struct hfsc_class *cl;
1479 u64 next_time = 0;
1480 long delay;
1481
1482 if ((cl = eltree_get_minel(q)) != NULL)
1483 next_time = cl->cl_e;
1484 if (q->root.cl_cfmin != 0) {
1485 if (next_time == 0 || next_time > q->root.cl_cfmin)
1486 next_time = q->root.cl_cfmin;
1487 }
1488 ASSERT(next_time != 0);
1489 delay = next_time - cur_time;
1490 delay = PSCHED_US2JIFFIE(delay);
1491
1492 sch->flags |= TCQ_F_THROTTLED;
1493 mod_timer(&q->wd_timer, jiffies + delay);
1494}
1495
1496static int
1497hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt)
1498{
1499 struct hfsc_sched *q = qdisc_priv(sch);
1500 struct tc_hfsc_qopt *qopt;
1501 unsigned int i;
1502
1503 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
1504 return -EINVAL;
1505 qopt = RTA_DATA(opt);
1506
1507 sch->stats_lock = &sch->dev->queue_lock;
1508
1509 q->defcls = qopt->defcls;
1510 for (i = 0; i < HFSC_HSIZE; i++)
1511 INIT_LIST_HEAD(&q->clhash[i]);
1512 q->eligible = RB_ROOT;
1513 INIT_LIST_HEAD(&q->droplist);
1514 skb_queue_head_init(&q->requeue);
1515
1516 q->root.refcnt = 1;
1517 q->root.classid = sch->handle;
1518 q->root.sched = q;
1519 q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
1520 if (q->root.qdisc == NULL)
1521 q->root.qdisc = &noop_qdisc;
1522 q->root.stats_lock = &sch->dev->queue_lock;
1523 INIT_LIST_HEAD(&q->root.children);
1524 q->root.vt_tree = RB_ROOT;
1525 q->root.cf_tree = RB_ROOT;
1526
1527 list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]);
1528
1529 init_timer(&q->wd_timer);
1530 q->wd_timer.function = hfsc_watchdog;
1531 q->wd_timer.data = (unsigned long)sch;
1532
1533 return 0;
1534}
1535
1536static int
1537hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt)
1538{
1539 struct hfsc_sched *q = qdisc_priv(sch);
1540 struct tc_hfsc_qopt *qopt;
1541
1542 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
1543 return -EINVAL;
1544 qopt = RTA_DATA(opt);
1545
1546 sch_tree_lock(sch);
1547 q->defcls = qopt->defcls;
1548 sch_tree_unlock(sch);
1549
1550 return 0;
1551}
1552
1553static void
1554hfsc_reset_class(struct hfsc_class *cl)
1555{
1556 cl->cl_total = 0;
1557 cl->cl_cumul = 0;
1558 cl->cl_d = 0;
1559 cl->cl_e = 0;
1560 cl->cl_vt = 0;
1561 cl->cl_vtadj = 0;
1562 cl->cl_vtoff = 0;
1563 cl->cl_cvtmin = 0;
1564 cl->cl_cvtmax = 0;
1565 cl->cl_cvtoff = 0;
1566 cl->cl_pcvtoff = 0;
1567 cl->cl_vtperiod = 0;
1568 cl->cl_parentperiod = 0;
1569 cl->cl_f = 0;
1570 cl->cl_myf = 0;
1571 cl->cl_myfadj = 0;
1572 cl->cl_cfmin = 0;
1573 cl->cl_nactive = 0;
1574
1575 cl->vt_tree = RB_ROOT;
1576 cl->cf_tree = RB_ROOT;
1577 qdisc_reset(cl->qdisc);
1578
1579 if (cl->cl_flags & HFSC_RSC)
1580 rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0);
1581 if (cl->cl_flags & HFSC_FSC)
1582 rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0);
1583 if (cl->cl_flags & HFSC_USC)
1584 rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0);
1585}
1586
1587static void
1588hfsc_reset_qdisc(struct Qdisc *sch)
1589{
1590 struct hfsc_sched *q = qdisc_priv(sch);
1591 struct hfsc_class *cl;
1592 unsigned int i;
1593
1594 for (i = 0; i < HFSC_HSIZE; i++) {
1595 list_for_each_entry(cl, &q->clhash[i], hlist)
1596 hfsc_reset_class(cl);
1597 }
1598 __skb_queue_purge(&q->requeue);
1599 q->eligible = RB_ROOT;
1600 INIT_LIST_HEAD(&q->droplist);
1601 del_timer(&q->wd_timer);
1602 sch->flags &= ~TCQ_F_THROTTLED;
1603 sch->q.qlen = 0;
1604}
1605
1606static void
1607hfsc_destroy_qdisc(struct Qdisc *sch)
1608{
1609 struct hfsc_sched *q = qdisc_priv(sch);
1610 struct hfsc_class *cl, *next;
1611 unsigned int i;
1612
1613 for (i = 0; i < HFSC_HSIZE; i++) {
1614 list_for_each_entry_safe(cl, next, &q->clhash[i], hlist)
1615 hfsc_destroy_class(sch, cl);
1616 }
1617 __skb_queue_purge(&q->requeue);
1618 del_timer(&q->wd_timer);
1619}
1620
1621static int
1622hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
1623{
1624 struct hfsc_sched *q = qdisc_priv(sch);
1625 unsigned char *b = skb->tail;
1626 struct tc_hfsc_qopt qopt;
1627
1628 qopt.defcls = q->defcls;
1629 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
1630 return skb->len;
1631
1632 rtattr_failure:
1633 skb_trim(skb, b - skb->data);
1634 return -1;
1635}
1636
1637static int
1638hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
1639{
1640 struct hfsc_class *cl;
1641 unsigned int len;
1642 int err;
1643
1644 cl = hfsc_classify(skb, sch, &err);
1645 if (cl == NULL) {
1646 if (err == NET_XMIT_DROP)
1647 sch->qstats.drops++;
1648 kfree_skb(skb);
1649 return err;
1650 }
1651
1652 len = skb->len;
1653 err = cl->qdisc->enqueue(skb, cl->qdisc);
1654 if (unlikely(err != NET_XMIT_SUCCESS)) {
1655 cl->qstats.drops++;
1656 sch->qstats.drops++;
1657 return err;
1658 }
1659
1660 if (cl->qdisc->q.qlen == 1)
1661 set_active(cl, len);
1662
1663 cl->bstats.packets++;
1664 cl->bstats.bytes += len;
1665 sch->bstats.packets++;
1666 sch->bstats.bytes += len;
1667 sch->q.qlen++;
1668
1669 return NET_XMIT_SUCCESS;
1670}
1671
1672static struct sk_buff *
1673hfsc_dequeue(struct Qdisc *sch)
1674{
1675 struct hfsc_sched *q = qdisc_priv(sch);
1676 struct hfsc_class *cl;
1677 struct sk_buff *skb;
1678 u64 cur_time;
1679 unsigned int next_len;
1680 int realtime = 0;
1681
1682 if (sch->q.qlen == 0)
1683 return NULL;
1684 if ((skb = __skb_dequeue(&q->requeue)))
1685 goto out;
1686
1687 PSCHED_GET_TIME(cur_time);
1688
1689 /*
1690 * if there are eligible classes, use real-time criteria.
1691 * find the class with the minimum deadline among
1692 * the eligible classes.
1693 */
1694 if ((cl = eltree_get_mindl(q, cur_time)) != NULL) {
1695 realtime = 1;
1696 } else {
1697 /*
1698 * use link-sharing criteria
1699 * get the class with the minimum vt in the hierarchy
1700 */
1701 cl = vttree_get_minvt(&q->root, cur_time);
1702 if (cl == NULL) {
1703 sch->qstats.overlimits++;
1704 hfsc_schedule_watchdog(sch, cur_time);
1705 return NULL;
1706 }
1707 }
1708
1709 skb = cl->qdisc->dequeue(cl->qdisc);
1710 if (skb == NULL) {
1711 if (net_ratelimit())
1712 printk("HFSC: Non-work-conserving qdisc ?\n");
1713 return NULL;
1714 }
1715
1716 update_vf(cl, skb->len, cur_time);
1717 if (realtime)
1718 cl->cl_cumul += skb->len;
1719
1720 if (cl->qdisc->q.qlen != 0) {
1721 if (cl->cl_flags & HFSC_RSC) {
1722 /* update ed */
1723 next_len = qdisc_peek_len(cl->qdisc);
1724 if (realtime)
1725 update_ed(cl, next_len);
1726 else
1727 update_d(cl, next_len);
1728 }
1729 } else {
1730 /* the class becomes passive */
1731 set_passive(cl);
1732 }
1733
1734 out:
1735 sch->flags &= ~TCQ_F_THROTTLED;
1736 sch->q.qlen--;
1737
1738 return skb;
1739}
1740
1741static int
1742hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch)
1743{
1744 struct hfsc_sched *q = qdisc_priv(sch);
1745
1746 __skb_queue_head(&q->requeue, skb);
1747 sch->q.qlen++;
1748 sch->qstats.requeues++;
1749 return NET_XMIT_SUCCESS;
1750}
1751
1752static unsigned int
1753hfsc_drop(struct Qdisc *sch)
1754{
1755 struct hfsc_sched *q = qdisc_priv(sch);
1756 struct hfsc_class *cl;
1757 unsigned int len;
1758
1759 list_for_each_entry(cl, &q->droplist, dlist) {
1760 if (cl->qdisc->ops->drop != NULL &&
1761 (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {
1762 if (cl->qdisc->q.qlen == 0) {
1763 update_vf(cl, 0, 0);
1764 set_passive(cl);
1765 } else {
1766 list_move_tail(&cl->dlist, &q->droplist);
1767 }
1768 cl->qstats.drops++;
1769 sch->qstats.drops++;
1770 sch->q.qlen--;
1771 return len;
1772 }
1773 }
1774 return 0;
1775}
1776
1777static struct Qdisc_class_ops hfsc_class_ops = {
1778 .change = hfsc_change_class,
1779 .delete = hfsc_delete_class,
1780 .graft = hfsc_graft_class,
1781 .leaf = hfsc_class_leaf,
1782 .get = hfsc_get_class,
1783 .put = hfsc_put_class,
1784 .bind_tcf = hfsc_bind_tcf,
1785 .unbind_tcf = hfsc_unbind_tcf,
1786 .tcf_chain = hfsc_tcf_chain,
1787 .dump = hfsc_dump_class,
1788 .dump_stats = hfsc_dump_class_stats,
1789 .walk = hfsc_walk
1790};
1791
1792static struct Qdisc_ops hfsc_qdisc_ops = {
1793 .id = "hfsc",
1794 .init = hfsc_init_qdisc,
1795 .change = hfsc_change_qdisc,
1796 .reset = hfsc_reset_qdisc,
1797 .destroy = hfsc_destroy_qdisc,
1798 .dump = hfsc_dump_qdisc,
1799 .enqueue = hfsc_enqueue,
1800 .dequeue = hfsc_dequeue,
1801 .requeue = hfsc_requeue,
1802 .drop = hfsc_drop,
1803 .cl_ops = &hfsc_class_ops,
1804 .priv_size = sizeof(struct hfsc_sched),
1805 .owner = THIS_MODULE
1806};
1807
1808static int __init
1809hfsc_init(void)
1810{
1811 return register_qdisc(&hfsc_qdisc_ops);
1812}
1813
1814static void __exit
1815hfsc_cleanup(void)
1816{
1817 unregister_qdisc(&hfsc_qdisc_ops);
1818}
1819
1820MODULE_LICENSE("GPL");
1821module_init(hfsc_init);
1822module_exit(hfsc_cleanup);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
new file mode 100644
index 000000000000..a85935e7d53d
--- /dev/null
+++ b/net/sched/sch_htb.c
@@ -0,0 +1,1759 @@
1/* vim: ts=8 sw=8
2 * net/sched/sch_htb.c Hierarchical token bucket, feed tree version
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Martin Devera, <devik@cdi.cz>
10 *
11 * Credits (in time order) for older HTB versions:
12 * Stef Coene <stef.coene@docum.org>
13 * HTB support at LARTC mailing list
14 * Ondrej Kraus, <krauso@barr.cz>
15 * found missing INIT_QDISC(htb)
16 * Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17 * helped a lot to locate nasty class stall bug
18 * Andi Kleen, Jamal Hadi, Bert Hubert
19 * code review and helpful comments on shaping
20 * Tomasz Wrona, <tw@eter.tym.pl>
21 * created test case so that I was able to fix nasty bug
22 * Wilfried Weissmann
23 * spotted bug in dequeue code and helped with fix
24 * Jiri Fojtasek
25 * fixed requeue routine
26 * and many others. thanks.
27 *
28 * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $
29 */
30#include <linux/config.h>
31#include <linux/module.h>
32#include <asm/uaccess.h>
33#include <asm/system.h>
34#include <linux/bitops.h>
35#include <linux/types.h>
36#include <linux/kernel.h>
37#include <linux/sched.h>
38#include <linux/string.h>
39#include <linux/mm.h>
40#include <linux/socket.h>
41#include <linux/sockios.h>
42#include <linux/in.h>
43#include <linux/errno.h>
44#include <linux/interrupt.h>
45#include <linux/if_ether.h>
46#include <linux/inet.h>
47#include <linux/netdevice.h>
48#include <linux/etherdevice.h>
49#include <linux/notifier.h>
50#include <net/ip.h>
51#include <net/route.h>
52#include <linux/skbuff.h>
53#include <linux/list.h>
54#include <linux/compiler.h>
55#include <net/sock.h>
56#include <net/pkt_sched.h>
57#include <linux/rbtree.h>
58
59/* HTB algorithm.
60 Author: devik@cdi.cz
61 ========================================================================
62 HTB is like TBF with multiple classes. It is also similar to CBQ because
63 it allows a priority to be assigned to each class in the hierarchy.
64 In fact it is another implementation of Floyd's formal sharing.
65
66 Levels:
67 Each class is assigned a level. A leaf ALWAYS has level 0 and root
68 classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
69 one less than their parent.
70*/
71
72#define HTB_HSIZE 16 /* classid hash size */
73#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */
74#undef HTB_DEBUG /* compile debugging support (activated by tc tool) */
75#define HTB_RATECM 1 /* whether to use rate computer */
76#define HTB_HYSTERESIS 1 /* whether to use mode hysteresis for speedup */
77#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock)
78#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock)
79#define HTB_VER 0x30011 /* major must be matched with number supplied by TC as version */
80
81#if HTB_VER >> 16 != TC_HTB_PROTOVER
82#error "Mismatched sch_htb.c and pkt_sch.h"
83#endif
84
85/* debugging support; S is subsystem, these are defined:
86 0 - netlink messages
87 1 - enqueue
88 2 - drop & requeue
89 3 - dequeue main
90 4 - dequeue one prio DRR part
91 5 - dequeue class accounting
92 6 - class overlimit status computation
93 7 - hint tree
94 8 - event queue
95 10 - rate estimator
96 11 - classifier
97 12 - fast dequeue cache
98
99 L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full
100 q->debug uint32 contains 16 2-bit fields, one per subsystem, starting
101 from the LSB
102 */
103#ifdef HTB_DEBUG
104#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L)
105#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \
106 printk(KERN_DEBUG FMT,##ARG)
107#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC)
108#define HTB_PASSQ q,
109#define HTB_ARGQ struct htb_sched *q,
110#define static
111#undef __inline__
112#define __inline__
113#undef inline
114#define inline
115#define HTB_CMAGIC 0xFEFAFEF1
116#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \
117 if ((N)->rb_color == -1) break; \
118 rb_erase(N,R); \
119 (N)->rb_color = -1; } while (0)
120#else
121#define HTB_DBG_COND(S,L) (0)
122#define HTB_DBG(S,L,FMT,ARG...)
123#define HTB_PASSQ
124#define HTB_ARGQ
125#define HTB_CHCL(cl)
126#define htb_safe_rb_erase(N,R) rb_erase(N,R)
127#endif
128
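/*
 * Example of the debug encoding described above (illustration only):
 * q->debug = 3 << (2 * 3) enables full (level 3) logging for subsystem
 * 3 (dequeue main) and leaves the other subsystems silent, since
 * HTB_DBG_COND() extracts two bits per subsystem.
 */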
129
130/* used internally to keep the status of a single class */
131enum htb_cmode {
132 HTB_CANT_SEND, /* class can't send and can't borrow */
133 HTB_MAY_BORROW, /* class can't send but may borrow */
134 HTB_CAN_SEND /* class can send */
135};
136
137/* interior & leaf nodes; props specific to leaves are marked L: */
138struct htb_class
139{
140#ifdef HTB_DEBUG
141 unsigned magic;
142#endif
143 /* general class parameters */
144 u32 classid;
145 struct gnet_stats_basic bstats;
146 struct gnet_stats_queue qstats;
147 struct gnet_stats_rate_est rate_est;
148 struct tc_htb_xstats xstats;/* our special stats */
149 int refcnt; /* usage count of this class */
150
151#ifdef HTB_RATECM
152 /* rate measurement counters */
153 unsigned long rate_bytes,sum_bytes;
154 unsigned long rate_packets,sum_packets;
155#endif
156
157 /* topology */
158 int level; /* our level (see above) */
159 struct htb_class *parent; /* parent class */
160 struct list_head hlist; /* classid hash list item */
161 struct list_head sibling; /* sibling list item */
162 struct list_head children; /* children list */
163
164 union {
165 struct htb_class_leaf {
166 struct Qdisc *q;
167 int prio;
168 int aprio;
169 int quantum;
170 int deficit[TC_HTB_MAXDEPTH];
171 struct list_head drop_list;
172 } leaf;
173 struct htb_class_inner {
174 struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
175 struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
176 /* When a class changes from state 1->2 and disconnects from
177 its parent's feed, we lose the ptr value and start from the
178 first child again. Here we store the classid of the
179 last valid ptr (used when ptr is NULL). */
180 u32 last_ptr_id[TC_HTB_NUMPRIO];
181 } inner;
182 } un;
183 struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
184 struct rb_node pq_node; /* node for event queue */
185 unsigned long pq_key; /* the same type as jiffies global */
186
187 int prio_activity; /* for which prios are we active */
188 enum htb_cmode cmode; /* current mode of the class */
189
190 /* class attached filters */
191 struct tcf_proto *filter_list;
192 int filter_cnt;
193
194 int warned; /* only one warning about non work conserving .. */
195
196 /* token bucket parameters */
197 struct qdisc_rate_table *rate; /* rate table of the class itself */
198 struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */
199 long buffer,cbuffer; /* token bucket depth/rate */
200 long mbuffer; /* max wait time */
201 long tokens,ctokens; /* current number of tokens */
202 psched_time_t t_c; /* checkpoint time */
203};
204
205/* TODO: maybe compute rate when size is too large .. or drop ? */
206static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate,
207 int size)
208{
209 int slot = size >> rate->rate.cell_log;
210 if (slot > 255) {
211 cl->xstats.giants++;
212 slot = 255;
213 }
214 return rate->data[slot];
215}
216
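/*
 * Example (illustration only; the table contents come from tc): with
 * rate->rate.cell_log == 3, a 1500 byte packet selects slot
 * 1500 >> 3 == 187, and L2T() returns the transmission time that tc
 * precomputed for that size at the class's rate; anything above 255
 * cells is clamped and accounted in xstats.giants.
 */
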
217struct htb_sched
218{
219 struct list_head root; /* root classes list */
220 struct list_head hash[HTB_HSIZE]; /* hashed by classid */
221 struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */
222
223 /* self list - roots of self generating tree */
224 struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
225 int row_mask[TC_HTB_MAXDEPTH];
226 struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
227 u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
228
229 /* self wait list - roots of wait PQs per row */
230 struct rb_root wait_pq[TC_HTB_MAXDEPTH];
231
232 /* time of nearest event per level (row) */
233 unsigned long near_ev_cache[TC_HTB_MAXDEPTH];
234
235 /* cached value of jiffies in dequeue */
236 unsigned long jiffies;
237
238 /* whether we hit non-work conserving class during this dequeue; we use */
239 int nwc_hit; /* this to disable mindelay complaint in dequeue */
240
241 int defcls; /* class where unclassified flows go to */
242 u32 debug; /* subsystem debug levels */
243
244 /* filters for qdisc itself */
245 struct tcf_proto *filter_list;
246 int filter_cnt;
247
248 int rate2quantum; /* quant = rate / rate2quantum */
249 psched_time_t now; /* cached dequeue time */
250 struct timer_list timer; /* send delay timer */
251#ifdef HTB_RATECM
252 struct timer_list rttim; /* rate computer timer */
253 int recmp_bucket; /* which hash bucket to recompute next */
254#endif
255
256 /* non shaped skbs; let them go directly thru */
257 struct sk_buff_head direct_queue;
258 int direct_qlen; /* max qlen of above */
259
260 long direct_pkts;
261};
262
263/* compute hash of size HTB_HSIZE for given handle */
264static __inline__ int htb_hash(u32 h)
265{
266#if HTB_HSIZE != 16
267 #error "Declare new hash for your HTB_HSIZE"
268#endif
269 h ^= h>>8; /* stolen from cbq_hash */
270 h ^= h>>4;
271 return h & 0xf;
272}
273
274/* find class in global hash table using given handle */
275static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
276{
277 struct htb_sched *q = qdisc_priv(sch);
278 struct list_head *p;
279 if (TC_H_MAJ(handle) != sch->handle)
280 return NULL;
281
282 list_for_each (p,q->hash+htb_hash(handle)) {
283 struct htb_class *cl = list_entry(p,struct htb_class,hlist);
284 if (cl->classid == handle)
285 return cl;
286 }
287 return NULL;
288}
289
290/**
291 * htb_classify - classify a packet into class
292 *
293 * It returns NULL if the packet should be dropped or -1 if the packet
294 * should be passed directly thru. In all other cases the leaf class is returned.
295 * We allow direct class selection by classid in priority. Then we examine
296 * filters in the qdisc and in inner nodes (if a higher filter points to the inner
297 * node). If we end up with classid MAJOR:0 we enqueue the skb into the special
298 * internal fifo (direct). These packets then go directly thru. If we still
299 * have no valid leaf we try to use the MAJOR:default leaf. If that is still
300 * unsuccessful, we finish and return the direct queue.
301 */
302#define HTB_DIRECT (struct htb_class*)-1
303static inline u32 htb_classid(struct htb_class *cl)
304{
305 return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC;
306}
307
308static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
309{
310 struct htb_sched *q = qdisc_priv(sch);
311 struct htb_class *cl;
312 struct tcf_result res;
313 struct tcf_proto *tcf;
314 int result;
315
316 /* allow to select class by setting skb->priority to valid classid;
317 note that nfmark can be used too by attaching filter fw with no
318 rules in it */
319 if (skb->priority == sch->handle)
320 return HTB_DIRECT; /* X:0 (direct flow) selected */
321 if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0)
322 return cl;
323
324 *qerr = NET_XMIT_DROP;
325 tcf = q->filter_list;
326 while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
327#ifdef CONFIG_NET_CLS_ACT
328 switch (result) {
329 case TC_ACT_QUEUED:
330 case TC_ACT_STOLEN:
331 *qerr = NET_XMIT_SUCCESS;
332 case TC_ACT_SHOT:
333 return NULL;
334 }
335#elif defined(CONFIG_NET_CLS_POLICE)
336 if (result == TC_POLICE_SHOT)
337 return HTB_DIRECT;
338#endif
339 if ((cl = (void*)res.class) == NULL) {
340 if (res.classid == sch->handle)
341 return HTB_DIRECT; /* X:0 (direct flow) */
342 if ((cl = htb_find(res.classid,sch)) == NULL)
343 break; /* filter selected invalid classid */
344 }
345 if (!cl->level)
346 return cl; /* we hit leaf; return it */
347
348 /* we have got inner class; apply inner filter chain */
349 tcf = cl->filter_list;
350 }
351 /* classification failed; try to use default class */
352 cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch);
353 if (!cl || cl->level)
354 return HTB_DIRECT; /* bad default .. this is safe bet */
355 return cl;
356}
357
358#ifdef HTB_DEBUG
359static void htb_next_rb_node(struct rb_node **n);
360#define HTB_DUMTREE(root,memb) if(root) { \
361 struct rb_node *n = (root)->rb_node; \
362 while (n->rb_left) n = n->rb_left; \
363 while (n) { \
364 struct htb_class *cl = rb_entry(n, struct htb_class, memb); \
365 printk(" %x",cl->classid); htb_next_rb_node (&n); \
366 } }
367
368static void htb_debug_dump (struct htb_sched *q)
369{
370 int i,p;
371 printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies);
372 /* rows */
373 for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) {
374 printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]);
375 for (p=0;p<TC_HTB_NUMPRIO;p++) {
376 if (!q->row[i][p].rb_node) continue;
377 printk(" p%d:",p);
378 HTB_DUMTREE(q->row[i]+p,node[p]);
379 }
380 printk("\n");
381 }
382 /* classes */
383 for (i = 0; i < HTB_HSIZE; i++) {
384 struct list_head *l;
385 list_for_each (l,q->hash+i) {
386 struct htb_class *cl = list_entry(l,struct htb_class,hlist);
387 long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer);
388 printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d "
389 "pa=%x f:",
390 cl->classid,cl->cmode,cl->tokens,cl->ctokens,
391 cl->pq_node.rb_color==-1?0:cl->pq_key,diff,
392 cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity);
393 if (cl->level)
394 for (p=0;p<TC_HTB_NUMPRIO;p++) {
395 if (!cl->un.inner.feed[p].rb_node) continue;
396 printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0);
397 HTB_DUMTREE(cl->un.inner.feed+p,node[p]);
398 }
399 printk("\n");
400 }
401 }
402}
403#endif
404/**
405 * htb_add_to_id_tree - adds class to the round robin list
406 *
407 * Routine adds class to the list (actually tree) sorted by classid.
408 * Make sure that class is not already on such list for given prio.
409 */
410static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root,
411 struct htb_class *cl,int prio)
412{
413 struct rb_node **p = &root->rb_node, *parent = NULL;
414 HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio);
415#ifdef HTB_DEBUG
416 if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; }
417 HTB_CHCL(cl);
418 if (*p) {
419 struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]);
420 HTB_CHCL(x);
421 }
422#endif
423 while (*p) {
424 struct htb_class *c; parent = *p;
425 c = rb_entry(parent, struct htb_class, node[prio]);
426 HTB_CHCL(c);
427 if (cl->classid > c->classid)
428 p = &parent->rb_right;
429 else
430 p = &parent->rb_left;
431 }
432 rb_link_node(&cl->node[prio], parent, p);
433 rb_insert_color(&cl->node[prio], root);
434}
435
436/**
437 * htb_add_to_wait_tree - adds class to the event queue with delay
438 *
439 * The class is added to priority event queue to indicate that class will
440 * change its mode in cl->pq_key microseconds. Make sure that class is not
441 * already in the queue.
442 */
443static void htb_add_to_wait_tree (struct htb_sched *q,
444 struct htb_class *cl,long delay,int debug_hint)
445{
446 struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
447 HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key);
448#ifdef HTB_DEBUG
449 if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; }
450 HTB_CHCL(cl);
451 if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit())
452 printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint);
453#endif
454 cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay);
455 if (cl->pq_key == q->jiffies)
456 cl->pq_key++;
457
458 /* update the nearest event cache */
459 if (time_after(q->near_ev_cache[cl->level], cl->pq_key))
460 q->near_ev_cache[cl->level] = cl->pq_key;
461
462 while (*p) {
463 struct htb_class *c; parent = *p;
464 c = rb_entry(parent, struct htb_class, pq_node);
465 if (time_after_eq(cl->pq_key, c->pq_key))
466 p = &parent->rb_right;
467 else
468 p = &parent->rb_left;
469 }
470 rb_link_node(&cl->pq_node, parent, p);
471 rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
472}
473
474/**
475 * htb_next_rb_node - finds next node in binary tree
476 *
477 * When we are past last key we return NULL.
478 * Average complexity is 2 steps per call.
479 */
480static void htb_next_rb_node(struct rb_node **n)
481{
482 *n = rb_next(*n);
483}
484
485/**
486 * htb_add_class_to_row - add class to its row
487 *
488 * The class is added to row at priorities marked in mask.
489 * It does nothing if mask == 0.
490 */
491static inline void htb_add_class_to_row(struct htb_sched *q,
492 struct htb_class *cl,int mask)
493{
494 HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n",
495 cl->classid,mask,q->row_mask[cl->level]);
496 HTB_CHCL(cl);
497 q->row_mask[cl->level] |= mask;
498 while (mask) {
499 int prio = ffz(~mask);
500 mask &= ~(1 << prio);
501 htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio);
502 }
503}
504
505/**
506 * htb_remove_class_from_row - removes class from its row
507 *
508 * The class is removed from row at priorities marked in mask.
509 * It does nothing if mask == 0.
510 */
511static __inline__ void htb_remove_class_from_row(struct htb_sched *q,
512 struct htb_class *cl,int mask)
513{
514 int m = 0;
515 HTB_CHCL(cl);
516 while (mask) {
517 int prio = ffz(~mask);
518 mask &= ~(1 << prio);
519 if (q->ptr[cl->level][prio] == cl->node+prio)
520 htb_next_rb_node(q->ptr[cl->level]+prio);
521 htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio);
522 if (!q->row[cl->level][prio].rb_node)
523 m |= 1 << prio;
524 }
525 HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n",
526 cl->classid,mask,q->row_mask[cl->level],m);
527 q->row_mask[cl->level] &= ~m;
528}
529
530/**
531 * htb_activate_prios - creates active class's feed chain
532 *
533 * The class is connected to ancestors and/or appropriate rows
534 * for the priorities it participates in. cl->cmode must be the new
535 * (activated) mode. It does nothing if cl->prio_activity == 0.
536 */
537static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl)
538{
539 struct htb_class *p = cl->parent;
540 long m,mask = cl->prio_activity;
541 HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode);
542 HTB_CHCL(cl);
543
544 while (cl->cmode == HTB_MAY_BORROW && p && mask) {
545 HTB_CHCL(p);
546 m = mask; while (m) {
547 int prio = ffz(~m);
548 m &= ~(1 << prio);
549
550 if (p->un.inner.feed[prio].rb_node)
551				/* parent already has its feed in use, so
552				   reset the bit in mask as the parent is already ok */
553 mask &= ~(1 << prio);
554
555 htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio);
556 }
557 HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n",
558 p->classid,p->prio_activity,mask,p->cmode);
559 p->prio_activity |= mask;
560 cl = p; p = cl->parent;
561 HTB_CHCL(cl);
562 }
563 if (cl->cmode == HTB_CAN_SEND && mask)
564 htb_add_class_to_row(q,cl,mask);
565}
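/*
 * Editor's sketch, not part of the patch: the loop above pushes a priority
 * bitmask up the ancestor chain, dropping each bit at the first ancestor
 * whose feed for that priority is already populated; surviving bits end up
 * in the row. This user-space model omits the HTB_MAY_BORROW mode check and
 * replaces the per-class rb-tree feeds with flags; all names are assumed.
 */
#include <stdio.h>

#define NUMPRIO 8

struct ancestor {
	int feed_nonempty[NUMPRIO];	/* 1 if some child already feeds this prio */
	int prio_activity;		/* bitmask of priorities active here */
};

static int push_up(struct ancestor *chain, int depth, int mask)
{
	int d, prio;

	for (d = 0; d < depth && mask; d++) {
		for (prio = 0; prio < NUMPRIO; prio++) {
			if (!(mask & (1 << prio)))
				continue;
			if (chain[d].feed_nonempty[prio])
				mask &= ~(1 << prio);	/* ancestor already fed: stop propagating */
			chain[d].feed_nonempty[prio] = 1;	/* the class joins this feed anyway */
		}
		chain[d].prio_activity |= mask;
	}
	return mask;	/* bits that reach the top are added to the row */
}

int main(void)
{
	struct ancestor chain[2] = { { { 0 }, 0 }, { { 0 }, 0 } };

	chain[0].feed_nonempty[3] = 1;	/* parent already feeds prio 3 */
	/* activating prios 3 and 5: only prio 5 propagates up to the row */
	printf("%#x\n", push_up(chain, 2, (1 << 3) | (1 << 5)));	/* prints 0x20 */
	return 0;
}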
566
567/**
568 * htb_deactivate_prios - remove class from feed chain
569 *
570 * cl->cmode must represent old mode (before deactivation). It does
571 * nothing if cl->prio_activity == 0. Class is removed from all feed
572 * chains and rows.
573 */
574static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
575{
576 struct htb_class *p = cl->parent;
577 long m,mask = cl->prio_activity;
578 HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode);
579 HTB_CHCL(cl);
580
581 while (cl->cmode == HTB_MAY_BORROW && p && mask) {
582 m = mask; mask = 0;
583 while (m) {
584 int prio = ffz(~m);
585 m &= ~(1 << prio);
586
587 if (p->un.inner.ptr[prio] == cl->node+prio) {
588 /* we are removing child which is pointed to from
589 parent feed - forget the pointer but remember
590 classid */
591 p->un.inner.last_ptr_id[prio] = cl->classid;
592 p->un.inner.ptr[prio] = NULL;
593 }
594
595 htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio);
596
597 if (!p->un.inner.feed[prio].rb_node)
598 mask |= 1 << prio;
599 }
600 HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n",
601 p->classid,p->prio_activity,mask,p->cmode);
602 p->prio_activity &= ~mask;
603 cl = p; p = cl->parent;
604 HTB_CHCL(cl);
605 }
606 if (cl->cmode == HTB_CAN_SEND && mask)
607 htb_remove_class_from_row(q,cl,mask);
608}
609
610/**
611 * htb_class_mode - computes and returns current class mode
612 *
613 * It computes cl's mode at time cl->t_c+diff and returns it. If mode
614 * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
615 * from now to time when cl will change its state.
616 * It is also worth noting that the class mode doesn't change simply
617 * at cl->{c,}tokens == 0; rather there can be a hysteresis in the
618 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
619 * mode transitions per time unit. The speed gain is about 1/6.
620 */
621static __inline__ enum htb_cmode
622htb_class_mode(struct htb_class *cl,long *diff)
623{
624 long toks;
625
626 if ((toks = (cl->ctokens + *diff)) < (
627#if HTB_HYSTERESIS
628 cl->cmode != HTB_CANT_SEND ? -cl->cbuffer :
629#endif
630 0)) {
631 *diff = -toks;
632 return HTB_CANT_SEND;
633 }
634 if ((toks = (cl->tokens + *diff)) >= (
635#if HTB_HYSTERESIS
636 cl->cmode == HTB_CAN_SEND ? -cl->buffer :
637#endif
638 0))
639 return HTB_CAN_SEND;
640
641 *diff = -toks;
642 return HTB_MAY_BORROW;
643}
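/*
 * Editor's sketch, not part of the patch: a user-space model of the
 * three-mode decision above with the hysteresis window folded in.
 * mode_of() and its arguments are illustrative assumptions, not kernel
 * symbols; the diff/time handling of the real code is omitted.
 */
#include <stdio.h>

enum mode { CANT_SEND, MAY_BORROW, CAN_SEND };

/* ctokens (ceil bucket) gates CANT_SEND, tokens (rate bucket) gates CAN_SEND */
static enum mode mode_of(long tokens, long ctokens, long buffer, long cbuffer,
			 enum mode old, int hysteresis)
{
	long low  = (hysteresis && old != CANT_SEND) ? -cbuffer : 0;
	long high = (hysteresis && old == CAN_SEND) ? -buffer : 0;

	if (ctokens < low)
		return CANT_SEND;	/* ceil exhausted: must wait */
	if (tokens >= high)
		return CAN_SEND;	/* own rate ok: send freely */
	return MAY_BORROW;		/* rate exhausted, ceil not: borrow */
}

int main(void)
{
	/* with hysteresis a class that was CAN_SEND stays so until tokens < -buffer */
	printf("%d\n", mode_of(-50, 500, 100, 1000, CAN_SEND, 1));   /* prints 2 (CAN_SEND) */
	printf("%d\n", mode_of(-50, 500, 100, 1000, MAY_BORROW, 1)); /* prints 1 (MAY_BORROW) */
	return 0;
}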
644
645/**
646 * htb_change_class_mode - changes class's mode
647 *
648 * This should be the only way to change a class's mode under normal
649 * circumstances. The routine will update the feed list linkage, change the mode
650 * and add the class to the wait event queue if appropriate. The new mode should
651 * be different from the old one and cl->pq_key has to be valid if changing
652 * to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
653 */
654static void
655htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
656{
657 enum htb_cmode new_mode = htb_class_mode(cl,diff);
658
659 HTB_CHCL(cl);
660 HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid);
661
662 if (new_mode == cl->cmode)
663 return;
664
665 if (cl->prio_activity) { /* not necessary: speed optimization */
666 if (cl->cmode != HTB_CANT_SEND)
667 htb_deactivate_prios(q,cl);
668 cl->cmode = new_mode;
669 if (new_mode != HTB_CANT_SEND)
670 htb_activate_prios(q,cl);
671 } else
672 cl->cmode = new_mode;
673}
674
675/**
676 * htb_activate - inserts leaf cl into appropriate active feeds
677 *
678 * Routine learns (new) priority of leaf and activates feed chain
679 * for the prio. It can be called on already active leaf safely.
680 * It also adds leaf into droplist.
681 */
682static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl)
683{
684 BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);
685 HTB_CHCL(cl);
686 if (!cl->prio_activity) {
687 cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
688 htb_activate_prios(q,cl);
689 list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio);
690 }
691}
692
693/**
694 * htb_deactivate - remove leaf cl from active feeds
695 *
696 * Make sure that the leaf is active. In other words it can't be called
697 * with a non-active leaf. It also removes the class from the drop list.
698 */
699static __inline__ void
700htb_deactivate(struct htb_sched *q,struct htb_class *cl)
701{
702 BUG_TRAP(cl->prio_activity);
703 HTB_CHCL(cl);
704 htb_deactivate_prios(q,cl);
705 cl->prio_activity = 0;
706 list_del_init(&cl->un.leaf.drop_list);
707}
708
709static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
710{
711 int ret;
712 struct htb_sched *q = qdisc_priv(sch);
713 struct htb_class *cl = htb_classify(skb,sch,&ret);
714
715 if (cl == HTB_DIRECT) {
716 /* enqueue to helper queue */
717 if (q->direct_queue.qlen < q->direct_qlen) {
718 __skb_queue_tail(&q->direct_queue, skb);
719 q->direct_pkts++;
720 }
721#ifdef CONFIG_NET_CLS_ACT
722 } else if (!cl) {
723 if (ret == NET_XMIT_DROP)
724 sch->qstats.drops++;
725 kfree_skb (skb);
726 return ret;
727#endif
728 } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) {
729 sch->qstats.drops++;
730 cl->qstats.drops++;
731 return NET_XMIT_DROP;
732 } else {
733 cl->bstats.packets++; cl->bstats.bytes += skb->len;
734 htb_activate (q,cl);
735 }
736
737 sch->q.qlen++;
738 sch->bstats.packets++; sch->bstats.bytes += skb->len;
739 HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb);
740 return NET_XMIT_SUCCESS;
741}
742
743/* TODO: requeuing packet charges it to policers again !! */
744static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
745{
746 struct htb_sched *q = qdisc_priv(sch);
747 int ret = NET_XMIT_SUCCESS;
748 struct htb_class *cl = htb_classify(skb,sch, &ret);
749 struct sk_buff *tskb;
750
751 if (cl == HTB_DIRECT || !cl) {
752 /* enqueue to helper queue */
753 if (q->direct_queue.qlen < q->direct_qlen && cl) {
754 __skb_queue_head(&q->direct_queue, skb);
755 } else {
756 __skb_queue_head(&q->direct_queue, skb);
757 tskb = __skb_dequeue_tail(&q->direct_queue);
758 kfree_skb (tskb);
759 sch->qstats.drops++;
760 return NET_XMIT_CN;
761 }
762 } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) {
763 sch->qstats.drops++;
764 cl->qstats.drops++;
765 return NET_XMIT_DROP;
766 } else
767 htb_activate (q,cl);
768
769 sch->q.qlen++;
770 sch->qstats.requeues++;
771 HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb);
772 return NET_XMIT_SUCCESS;
773}
774
775static void htb_timer(unsigned long arg)
776{
777 struct Qdisc *sch = (struct Qdisc*)arg;
778 sch->flags &= ~TCQ_F_THROTTLED;
779 wmb();
780 netif_schedule(sch->dev);
781}
782
783#ifdef HTB_RATECM
784#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
785static void htb_rate_timer(unsigned long arg)
786{
787 struct Qdisc *sch = (struct Qdisc*)arg;
788 struct htb_sched *q = qdisc_priv(sch);
789 struct list_head *p;
790
791 /* lock queue so that we can muck with it */
792 HTB_QLOCK(sch);
793 HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies);
794
795 q->rttim.expires = jiffies + HZ;
796 add_timer(&q->rttim);
797
798 /* scan and recompute one bucket at time */
799 if (++q->recmp_bucket >= HTB_HSIZE)
800 q->recmp_bucket = 0;
801 list_for_each (p,q->hash+q->recmp_bucket) {
802 struct htb_class *cl = list_entry(p,struct htb_class,hlist);
803 HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n",
804 cl->classid,cl->sum_bytes,cl->sum_packets);
805 RT_GEN (cl->sum_bytes,cl->rate_bytes);
806 RT_GEN (cl->sum_packets,cl->rate_packets);
807 }
808 HTB_QUNLOCK(sch);
809}
810#endif
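/*
 * Editor's sketch, not part of the patch: RT_GEN() above is an exponential
 * moving average, R = R*(1 - 1/HTB_EWMAC) + D, applied once per pass to the
 * byte/packet count D accumulated since the previous pass. EWMAC below is
 * an assumed example value, not the kernel constant.
 */
#include <stdio.h>

#define EWMAC 2

static void rt_gen(unsigned long *d, unsigned long *r)
{
	*r += *d - (*r / EWMAC);	/* blend the new count into the average */
	*d = 0;				/* restart accumulation */
}

int main(void)
{
	unsigned long sum = 0, rate = 0;
	int i;

	/* a steady 1000 bytes per interval converges to rate == EWMAC * 1000 */
	for (i = 0; i < 10; i++) {
		sum += 1000;
		rt_gen(&sum, &rate);
	}
	printf("rate=%lu\n", rate);	/* approaches 2000 */
	return 0;
}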
811
812/**
813 * htb_charge_class - charges amount "bytes" to leaf and ancestors
814 *
815 * Routine assumes that packet "bytes" long was dequeued from leaf cl
816 * borrowing from "level". It accounts bytes to ceil leaky bucket for
817 * leaf and all ancestors and to rate bucket for ancestors at levels
818 * "level" and higher. It also handles possible change of mode resulting
819 * from the update. Note that mode can also increase here (MAY_BORROW to
820 * CAN_SEND) because we can use a more precise clock than the event queue here.
821 * In such a case we remove the class from the event queue first.
822 */
823static void htb_charge_class(struct htb_sched *q,struct htb_class *cl,
824 int level,int bytes)
825{
826 long toks,diff;
827 enum htb_cmode old_mode;
828 HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes);
829
830#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
831 if (toks > cl->B) toks = cl->B; \
832 toks -= L2T(cl, cl->R, bytes); \
833 if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
834 cl->T = toks
835
836 while (cl) {
837 HTB_CHCL(cl);
838 diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer);
839#ifdef HTB_DEBUG
840 if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) {
841 if (net_ratelimit())
842 printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n",
843 cl->classid, diff,
844#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY
845 q->now.tv_sec * 1000000ULL + q->now.tv_usec,
846 cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec,
847#else
848 (unsigned long long) q->now,
849 (unsigned long long) cl->t_c,
850#endif
851 q->jiffies);
852 diff = 1000;
853 }
854#endif
855 if (cl->level >= level) {
856 if (cl->level == level) cl->xstats.lends++;
857 HTB_ACCNT (tokens,buffer,rate);
858 } else {
859 cl->xstats.borrows++;
860 cl->tokens += diff; /* we moved t_c; update tokens */
861 }
862 HTB_ACCNT (ctokens,cbuffer,ceil);
863 cl->t_c = q->now;
864 HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens);
865
866 old_mode = cl->cmode; diff = 0;
867 htb_change_class_mode(q,cl,&diff);
868 if (old_mode != cl->cmode) {
869 if (old_mode != HTB_CAN_SEND)
870 htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level);
871 if (cl->cmode != HTB_CAN_SEND)
872 htb_add_to_wait_tree (q,cl,diff,1);
873 }
874
875#ifdef HTB_RATECM
876 /* update rate counters */
877 cl->sum_bytes += bytes; cl->sum_packets++;
878#endif
879
880 /* update byte stats except for leaves which are already updated */
881 if (cl->level) {
882 cl->bstats.bytes += bytes;
883 cl->bstats.packets++;
884 }
885 cl = cl->parent;
886 }
887}
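/*
 * Editor's sketch, not part of the patch: one HTB_ACCNT() expansion written
 * out as a plain function. len_to_time stands in for the L2T() rate-table
 * lookup (packet length converted to transmit time) and is an illustrative
 * assumption; all values are in the scheduler's time units.
 */
#include <stdio.h>

static long charge(long tokens, long diff, long buffer, long mbuffer,
		   long len_to_time)
{
	long toks = diff + tokens;	/* refill by the elapsed time */

	if (toks > buffer)
		toks = buffer;		/* clamp to the bucket size */
	toks -= len_to_time;		/* pay for the dequeued packet */
	if (toks <= -mbuffer)
		toks = 1 - mbuffer;	/* never underflow past -mbuffer */
	return toks;
}

int main(void)
{
	/* 6000us bucket, 1000us elapsed, packet worth 1200us of rate */
	printf("%ld\n", charge(4000, 1000, 6000, 60000000, 1200));	/* prints 3800 */
	return 0;
}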
888
889/**
890 * htb_do_events - make mode changes to classes at the level
891 *
892 * Scans event queue for pending events and applies them. Returns jiffies to
893 * next pending event (0 for no event in pq).
894 * Note: Applied are events whose cl->pq_key <= jiffies.
895 */
896static long htb_do_events(struct htb_sched *q,int level)
897{
898 int i;
899 HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n",
900 level,q->wait_pq[level].rb_node,q->row_mask[level]);
901 for (i = 0; i < 500; i++) {
902 struct htb_class *cl;
903 long diff;
904 struct rb_node *p = q->wait_pq[level].rb_node;
905 if (!p) return 0;
906 while (p->rb_left) p = p->rb_left;
907
908 cl = rb_entry(p, struct htb_class, pq_node);
909 if (time_after(cl->pq_key, q->jiffies)) {
910 HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies);
911 return cl->pq_key - q->jiffies;
912 }
913 htb_safe_rb_erase(p,q->wait_pq+level);
914 diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer);
915#ifdef HTB_DEBUG
916 if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) {
917 if (net_ratelimit())
918 printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n",
919 cl->classid, diff,
920#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY
921 q->now.tv_sec * 1000000ULL + q->now.tv_usec,
922 cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec,
923#else
924 (unsigned long long) q->now,
925 (unsigned long long) cl->t_c,
926#endif
927 q->jiffies);
928 diff = 1000;
929 }
930#endif
931 htb_change_class_mode(q,cl,&diff);
932 if (cl->cmode != HTB_CAN_SEND)
933 htb_add_to_wait_tree (q,cl,diff,2);
934 }
935 if (net_ratelimit())
936 printk(KERN_WARNING "htb: too many events !\n");
937 return HZ/10;
938}
939
940/* Returns class->node+prio from the id-tree where the class's id is >= id, or NULL
941   if no such one exists. */
942static struct rb_node *
943htb_id_find_next_upper(int prio,struct rb_node *n,u32 id)
944{
945 struct rb_node *r = NULL;
946 while (n) {
947 struct htb_class *cl = rb_entry(n,struct htb_class,node[prio]);
948 if (id == cl->classid) return n;
949
950 if (id > cl->classid) {
951 n = n->rb_right;
952 } else {
953 r = n;
954 n = n->rb_left;
955 }
956 }
957 return r;
958}
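/*
 * Editor's sketch, not part of the patch: htb_id_find_next_upper() is the
 * classic "smallest key >= id" descent of a binary search tree, shown here
 * on a plain struct node instead of the kernel rbtree; names are assumed.
 */
#include <stdio.h>
#include <stddef.h>

struct node {
	unsigned int key;
	struct node *left, *right;
};

static struct node *find_next_upper(struct node *n, unsigned int id)
{
	struct node *best = NULL;

	while (n) {
		if (n->key == id)
			return n;		/* exact match wins immediately */
		if (id > n->key) {
			n = n->right;		/* this node and its left side are too small */
		} else {
			best = n;		/* smallest key >= id seen so far */
			n = n->left;
		}
	}
	return best;				/* NULL when every key < id */
}

int main(void)
{
	struct node a = { 10, NULL, NULL }, c = { 30, NULL, NULL };
	struct node b = { 20, &a, &c };

	printf("%u\n", find_next_upper(&b, 15)->key);		/* prints 20 */
	printf("%p\n", (void *)find_next_upper(&b, 31));	/* nothing >= 31: null */
	return 0;
}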
959
960/**
961 * htb_lookup_leaf - returns next leaf class in DRR order
962 *
963 * Find the leaf that the current feed pointer points to.
964 */
965static struct htb_class *
966htb_lookup_leaf(HTB_ARGQ struct rb_root *tree,int prio,struct rb_node **pptr,u32 *pid)
967{
968 int i;
969 struct {
970 struct rb_node *root;
971 struct rb_node **pptr;
972 u32 *pid;
973 } stk[TC_HTB_MAXDEPTH],*sp = stk;
974
975 BUG_TRAP(tree->rb_node);
976 sp->root = tree->rb_node;
977 sp->pptr = pptr;
978 sp->pid = pid;
979
980 for (i = 0; i < 65535; i++) {
981 HTB_DBG(4,2,"htb_lleaf ptr=%p pid=%X\n",*sp->pptr,*sp->pid);
982
983 if (!*sp->pptr && *sp->pid) {
984 /* ptr was invalidated but id is valid - try to recover
985 the original or next ptr */
986 *sp->pptr = htb_id_find_next_upper(prio,sp->root,*sp->pid);
987 }
988		*sp->pid = 0; /* ptr is valid now, so remove this hint as it
989 can become out of date quickly */
990 if (!*sp->pptr) { /* we are at right end; rewind & go up */
991 *sp->pptr = sp->root;
992 while ((*sp->pptr)->rb_left)
993 *sp->pptr = (*sp->pptr)->rb_left;
994 if (sp > stk) {
995 sp--;
996 BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL;
997 htb_next_rb_node (sp->pptr);
998 }
999 } else {
1000 struct htb_class *cl;
1001 cl = rb_entry(*sp->pptr,struct htb_class,node[prio]);
1002 HTB_CHCL(cl);
1003 if (!cl->level)
1004 return cl;
1005 (++sp)->root = cl->un.inner.feed[prio].rb_node;
1006 sp->pptr = cl->un.inner.ptr+prio;
1007 sp->pid = cl->un.inner.last_ptr_id+prio;
1008 }
1009 }
1010 BUG_TRAP(0);
1011 return NULL;
1012}
1013
1014/* dequeues packet at given priority and level; call only if
1015 you are sure that there is active class at prio/level */
1016static struct sk_buff *
1017htb_dequeue_tree(struct htb_sched *q,int prio,int level)
1018{
1019 struct sk_buff *skb = NULL;
1020 struct htb_class *cl,*start;
1021 /* look initial class up in the row */
1022 start = cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,
1023 q->ptr[level]+prio,q->last_ptr_id[level]+prio);
1024
1025 do {
1026next:
1027 BUG_TRAP(cl);
1028 if (!cl) return NULL;
1029 HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n",
1030 prio,level,cl->classid,cl->un.leaf.deficit[level]);
1031
1032 /* class can be empty - it is unlikely but can be true if leaf
1033 qdisc drops packets in enqueue routine or if someone used
1034 graft operation on the leaf since last dequeue;
1035 simply deactivate and skip such class */
1036 if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
1037 struct htb_class *next;
1038 htb_deactivate(q,cl);
1039
1040 /* row/level might become empty */
1041 if ((q->row_mask[level] & (1 << prio)) == 0)
1042 return NULL;
1043
1044 next = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,
1045 prio,q->ptr[level]+prio,q->last_ptr_id[level]+prio);
1046
1047 if (cl == start) /* fix start if we just deleted it */
1048 start = next;
1049 cl = next;
1050 goto next;
1051 }
1052
1053 if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL))
1054 break;
1055 if (!cl->warned) {
1056 printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid);
1057 cl->warned = 1;
1058 }
1059 q->nwc_hit++;
1060 htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio);
1061 cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,q->ptr[level]+prio,
1062 q->last_ptr_id[level]+prio);
1063
1064 } while (cl != start);
1065
1066 if (likely(skb != NULL)) {
1067 if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
1068 HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n",
1069 level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum);
1070 cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
1071 htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio);
1072 }
1073		/* this used to be after charge_class but this constellation
1074 gives us slightly better performance */
1075 if (!cl->un.leaf.q->q.qlen)
1076 htb_deactivate (q,cl);
1077 htb_charge_class (q,cl,level,skb->len);
1078 }
1079 return skb;
1080}
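/*
 * Editor's sketch, not part of the patch: the deficit round-robin step
 * applied above after a successful dequeue. The struct and names are
 * simplified assumptions for illustration, not kernel types.
 */
#include <stdio.h>

struct drr_slot {
	long deficit;	/* bytes this class may still send in its turn */
	long quantum;	/* bytes granted each time its turn ends */
};

/* returns 1 if the round-robin pointer should advance to the next class */
static int drr_account(struct drr_slot *s, long pkt_len)
{
	s->deficit -= pkt_len;
	if (s->deficit < 0) {
		s->deficit += s->quantum;	/* refill for the next round */
		return 1;			/* turn is over, move on */
	}
	return 0;				/* class keeps the turn */
}

int main(void)
{
	struct drr_slot s = { .deficit = 1000, .quantum = 1500 };

	printf("%d deficit=%ld\n", drr_account(&s, 600), s.deficit);	/* 0 deficit=400 */
	printf("%d deficit=%ld\n", drr_account(&s, 600), s.deficit);	/* 1 deficit=1300 */
	return 0;
}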
1081
1082static void htb_delay_by(struct Qdisc *sch,long delay)
1083{
1084 struct htb_sched *q = qdisc_priv(sch);
1085 if (delay <= 0) delay = 1;
1086 if (unlikely(delay > 5*HZ)) {
1087 if (net_ratelimit())
1088 printk(KERN_INFO "HTB delay %ld > 5sec\n", delay);
1089 delay = 5*HZ;
1090 }
1091	/* why not use jiffies here? because expires can be in the past */
1092 mod_timer(&q->timer, q->jiffies + delay);
1093 sch->flags |= TCQ_F_THROTTLED;
1094 sch->qstats.overlimits++;
1095 HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay);
1096}
1097
1098static struct sk_buff *htb_dequeue(struct Qdisc *sch)
1099{
1100 struct sk_buff *skb = NULL;
1101 struct htb_sched *q = qdisc_priv(sch);
1102 int level;
1103 long min_delay;
1104#ifdef HTB_DEBUG
1105 int evs_used = 0;
1106#endif
1107
1108 q->jiffies = jiffies;
1109 HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue),
1110 sch->q.qlen);
1111
1112 /* try to dequeue direct packets as high prio (!) to minimize cpu work */
1113 if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) {
1114 sch->flags &= ~TCQ_F_THROTTLED;
1115 sch->q.qlen--;
1116 return skb;
1117 }
1118
1119 if (!sch->q.qlen) goto fin;
1120 PSCHED_GET_TIME(q->now);
1121
1122 min_delay = LONG_MAX;
1123 q->nwc_hit = 0;
1124 for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
1125 /* common case optimization - skip event handler quickly */
1126 int m;
1127 long delay;
1128 if (time_after_eq(q->jiffies, q->near_ev_cache[level])) {
1129 delay = htb_do_events(q,level);
1130 q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ);
1131#ifdef HTB_DEBUG
1132 evs_used++;
1133#endif
1134 } else
1135 delay = q->near_ev_cache[level] - q->jiffies;
1136
1137 if (delay && min_delay > delay)
1138 min_delay = delay;
1139 m = ~q->row_mask[level];
1140 while (m != (int)(-1)) {
1141 int prio = ffz (m);
1142 m |= 1 << prio;
1143 skb = htb_dequeue_tree(q,prio,level);
1144 if (likely(skb != NULL)) {
1145 sch->q.qlen--;
1146 sch->flags &= ~TCQ_F_THROTTLED;
1147 goto fin;
1148 }
1149 }
1150 }
1151#ifdef HTB_DEBUG
1152 if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) {
1153 if (min_delay == LONG_MAX) {
1154 printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n",
1155 evs_used,q->jiffies,jiffies);
1156 htb_debug_dump(q);
1157 } else
1158 printk(KERN_WARNING "HTB: mindelay=%ld, some class has "
1159 "too small rate\n",min_delay);
1160 }
1161#endif
1162 htb_delay_by (sch,min_delay > 5*HZ ? 5*HZ : min_delay);
1163fin:
1164 HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb);
1165 return skb;
1166}
1167
1168/* try to drop from each class (by prio) until one succeed */
1169static unsigned int htb_drop(struct Qdisc* sch)
1170{
1171 struct htb_sched *q = qdisc_priv(sch);
1172 int prio;
1173
1174 for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
1175 struct list_head *p;
1176 list_for_each (p,q->drops+prio) {
1177 struct htb_class *cl = list_entry(p, struct htb_class,
1178 un.leaf.drop_list);
1179 unsigned int len;
1180 if (cl->un.leaf.q->ops->drop &&
1181 (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
1182 sch->q.qlen--;
1183 if (!cl->un.leaf.q->q.qlen)
1184 htb_deactivate (q,cl);
1185 return len;
1186 }
1187 }
1188 }
1189 return 0;
1190}
1191
1192/* reset all classes */
1193/* always called under BH & queue lock */
1194static void htb_reset(struct Qdisc* sch)
1195{
1196 struct htb_sched *q = qdisc_priv(sch);
1197 int i;
1198 HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle);
1199
1200 for (i = 0; i < HTB_HSIZE; i++) {
1201 struct list_head *p;
1202 list_for_each (p,q->hash+i) {
1203 struct htb_class *cl = list_entry(p,struct htb_class,hlist);
1204 if (cl->level)
1205 memset(&cl->un.inner,0,sizeof(cl->un.inner));
1206 else {
1207 if (cl->un.leaf.q)
1208 qdisc_reset(cl->un.leaf.q);
1209 INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1210 }
1211 cl->prio_activity = 0;
1212 cl->cmode = HTB_CAN_SEND;
1213#ifdef HTB_DEBUG
1214 cl->pq_node.rb_color = -1;
1215 memset(cl->node,255,sizeof(cl->node));
1216#endif
1217
1218 }
1219 }
1220 sch->flags &= ~TCQ_F_THROTTLED;
1221 del_timer(&q->timer);
1222 __skb_queue_purge(&q->direct_queue);
1223 sch->q.qlen = 0;
1224 memset(q->row,0,sizeof(q->row));
1225 memset(q->row_mask,0,sizeof(q->row_mask));
1226 memset(q->wait_pq,0,sizeof(q->wait_pq));
1227 memset(q->ptr,0,sizeof(q->ptr));
1228 for (i = 0; i < TC_HTB_NUMPRIO; i++)
1229 INIT_LIST_HEAD(q->drops+i);
1230}
1231
1232static int htb_init(struct Qdisc *sch, struct rtattr *opt)
1233{
1234 struct htb_sched *q = qdisc_priv(sch);
1235 struct rtattr *tb[TCA_HTB_INIT];
1236 struct tc_htb_glob *gopt;
1237 int i;
1238#ifdef HTB_DEBUG
1239 printk(KERN_INFO "HTB init, kernel part version %d.%d\n",
1240 HTB_VER >> 16,HTB_VER & 0xffff);
1241#endif
1242 if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) ||
1243 tb[TCA_HTB_INIT-1] == NULL ||
1244 RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < sizeof(*gopt)) {
1245 printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
1246 return -EINVAL;
1247 }
1248 gopt = RTA_DATA(tb[TCA_HTB_INIT-1]);
1249 if (gopt->version != HTB_VER >> 16) {
1250 printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1251 HTB_VER >> 16,HTB_VER & 0xffff,gopt->version);
1252 return -EINVAL;
1253 }
1254 q->debug = gopt->debug;
1255 HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum);
1256
1257 INIT_LIST_HEAD(&q->root);
1258 for (i = 0; i < HTB_HSIZE; i++)
1259 INIT_LIST_HEAD(q->hash+i);
1260 for (i = 0; i < TC_HTB_NUMPRIO; i++)
1261 INIT_LIST_HEAD(q->drops+i);
1262
1263 init_timer(&q->timer);
1264 skb_queue_head_init(&q->direct_queue);
1265
1266 q->direct_qlen = sch->dev->tx_queue_len;
1267 if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
1268 q->direct_qlen = 2;
1269 q->timer.function = htb_timer;
1270 q->timer.data = (unsigned long)sch;
1271
1272#ifdef HTB_RATECM
1273 init_timer(&q->rttim);
1274 q->rttim.function = htb_rate_timer;
1275 q->rttim.data = (unsigned long)sch;
1276 q->rttim.expires = jiffies + HZ;
1277 add_timer(&q->rttim);
1278#endif
1279 if ((q->rate2quantum = gopt->rate2quantum) < 1)
1280 q->rate2quantum = 1;
1281 q->defcls = gopt->defcls;
1282
1283 return 0;
1284}
1285
1286static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1287{
1288 struct htb_sched *q = qdisc_priv(sch);
1289 unsigned char *b = skb->tail;
1290 struct rtattr *rta;
1291 struct tc_htb_glob gopt;
1292 HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle);
1293 HTB_QLOCK(sch);
1294 gopt.direct_pkts = q->direct_pkts;
1295
1296#ifdef HTB_DEBUG
1297 if (HTB_DBG_COND(0,2))
1298 htb_debug_dump(q);
1299#endif
1300 gopt.version = HTB_VER;
1301 gopt.rate2quantum = q->rate2quantum;
1302 gopt.defcls = q->defcls;
1303 gopt.debug = q->debug;
1304 rta = (struct rtattr*)b;
1305 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1306 RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1307 rta->rta_len = skb->tail - b;
1308 HTB_QUNLOCK(sch);
1309 return skb->len;
1310rtattr_failure:
1311 HTB_QUNLOCK(sch);
1312 skb_trim(skb, skb->tail - skb->data);
1313 return -1;
1314}
1315
1316static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1317 struct sk_buff *skb, struct tcmsg *tcm)
1318{
1319#ifdef HTB_DEBUG
1320 struct htb_sched *q = qdisc_priv(sch);
1321#endif
1322 struct htb_class *cl = (struct htb_class*)arg;
1323 unsigned char *b = skb->tail;
1324 struct rtattr *rta;
1325 struct tc_htb_opt opt;
1326
1327 HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid);
1328
1329 HTB_QLOCK(sch);
1330 tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
1331 tcm->tcm_handle = cl->classid;
1332 if (!cl->level && cl->un.leaf.q)
1333 tcm->tcm_info = cl->un.leaf.q->handle;
1334
1335 rta = (struct rtattr*)b;
1336 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1337
1338 memset (&opt,0,sizeof(opt));
1339
1340 opt.rate = cl->rate->rate; opt.buffer = cl->buffer;
1341 opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer;
1342 opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio;
1343 opt.level = cl->level;
1344 RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1345 rta->rta_len = skb->tail - b;
1346 HTB_QUNLOCK(sch);
1347 return skb->len;
1348rtattr_failure:
1349 HTB_QUNLOCK(sch);
1350 skb_trim(skb, b - skb->data);
1351 return -1;
1352}
1353
1354static int
1355htb_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1356 struct gnet_dump *d)
1357{
1358 struct htb_class *cl = (struct htb_class*)arg;
1359
1360#ifdef HTB_RATECM
1361 cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE);
1362 cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE);
1363#endif
1364
1365 if (!cl->level && cl->un.leaf.q)
1366 cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1367 cl->xstats.tokens = cl->tokens;
1368 cl->xstats.ctokens = cl->ctokens;
1369
1370 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1371 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1372 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1373 return -1;
1374
1375 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1376}
1377
1378static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1379 struct Qdisc **old)
1380{
1381 struct htb_class *cl = (struct htb_class*)arg;
1382
1383 if (cl && !cl->level) {
1384 if (new == NULL && (new = qdisc_create_dflt(sch->dev,
1385 &pfifo_qdisc_ops)) == NULL)
1386 return -ENOBUFS;
1387 sch_tree_lock(sch);
1388 if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1389 if (cl->prio_activity)
1390 htb_deactivate (qdisc_priv(sch),cl);
1391
1392 /* TODO: is it correct ? Why CBQ doesn't do it ? */
1393 sch->q.qlen -= (*old)->q.qlen;
1394 qdisc_reset(*old);
1395 }
1396 sch_tree_unlock(sch);
1397 return 0;
1398 }
1399 return -ENOENT;
1400}
1401
1402static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg)
1403{
1404 struct htb_class *cl = (struct htb_class*)arg;
1405 return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1406}
1407
1408static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1409{
1410#ifdef HTB_DEBUG
1411 struct htb_sched *q = qdisc_priv(sch);
1412#endif
1413 struct htb_class *cl = htb_find(classid,sch);
1414 HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0);
1415 if (cl)
1416 cl->refcnt++;
1417 return (unsigned long)cl;
1418}
1419
1420static void htb_destroy_filters(struct tcf_proto **fl)
1421{
1422 struct tcf_proto *tp;
1423
1424 while ((tp = *fl) != NULL) {
1425 *fl = tp->next;
1426 tcf_destroy(tp);
1427 }
1428}
1429
1430static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl)
1431{
1432 struct htb_sched *q = qdisc_priv(sch);
1433 HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0);
1434 if (!cl->level) {
1435 BUG_TRAP(cl->un.leaf.q);
1436 sch->q.qlen -= cl->un.leaf.q->q.qlen;
1437 qdisc_destroy(cl->un.leaf.q);
1438 }
1439 qdisc_put_rtab(cl->rate);
1440 qdisc_put_rtab(cl->ceil);
1441
1442 htb_destroy_filters (&cl->filter_list);
1443
1444 while (!list_empty(&cl->children))
1445 htb_destroy_class (sch,list_entry(cl->children.next,
1446 struct htb_class,sibling));
1447
1448 /* note: this delete may happen twice (see htb_delete) */
1449 list_del(&cl->hlist);
1450 list_del(&cl->sibling);
1451
1452 if (cl->prio_activity)
1453 htb_deactivate (q,cl);
1454
1455 if (cl->cmode != HTB_CAN_SEND)
1456 htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level);
1457
1458 kfree(cl);
1459}
1460
1461/* always called under BH & queue lock */
1462static void htb_destroy(struct Qdisc* sch)
1463{
1464 struct htb_sched *q = qdisc_priv(sch);
1465 HTB_DBG(0,1,"htb_destroy q=%p\n",q);
1466
1467 del_timer_sync (&q->timer);
1468#ifdef HTB_RATECM
1469 del_timer_sync (&q->rttim);
1470#endif
1471 /* This line used to be after htb_destroy_class call below
1472 and surprisingly it worked in 2.4. But it must precede it
1473	   because filters need their target class alive to be able to call
1474 unbind_filter on it (without Oops). */
1475 htb_destroy_filters(&q->filter_list);
1476
1477 while (!list_empty(&q->root))
1478 htb_destroy_class (sch,list_entry(q->root.next,
1479 struct htb_class,sibling));
1480
1481 __skb_queue_purge(&q->direct_queue);
1482}
1483
1484static int htb_delete(struct Qdisc *sch, unsigned long arg)
1485{
1486 struct htb_sched *q = qdisc_priv(sch);
1487 struct htb_class *cl = (struct htb_class*)arg;
1488 HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0);
1489
1490	// TODO: why don't we allow deleting a subtree? references? does
1491	// the tc subsystem guarantee us that in htb_destroy it holds no class
1492	// refs so that we can remove children safely there?
1493 if (!list_empty(&cl->children) || cl->filter_cnt)
1494 return -EBUSY;
1495
1496 sch_tree_lock(sch);
1497
1498 /* delete from hash and active; remainder in destroy_class */
1499 list_del_init(&cl->hlist);
1500 if (cl->prio_activity)
1501 htb_deactivate (q,cl);
1502
1503 if (--cl->refcnt == 0)
1504 htb_destroy_class(sch,cl);
1505
1506 sch_tree_unlock(sch);
1507 return 0;
1508}
1509
1510static void htb_put(struct Qdisc *sch, unsigned long arg)
1511{
1512#ifdef HTB_DEBUG
1513 struct htb_sched *q = qdisc_priv(sch);
1514#endif
1515 struct htb_class *cl = (struct htb_class*)arg;
1516 HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0);
1517
1518 if (--cl->refcnt == 0)
1519 htb_destroy_class(sch,cl);
1520}
1521
1522static int htb_change_class(struct Qdisc *sch, u32 classid,
1523 u32 parentid, struct rtattr **tca, unsigned long *arg)
1524{
1525 int err = -EINVAL;
1526 struct htb_sched *q = qdisc_priv(sch);
1527 struct htb_class *cl = (struct htb_class*)*arg,*parent;
1528 struct rtattr *opt = tca[TCA_OPTIONS-1];
1529 struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1530 struct rtattr *tb[TCA_HTB_RTAB];
1531 struct tc_htb_opt *hopt;
1532
1533 /* extract all subattrs from opt attr */
1534 if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) ||
1535 tb[TCA_HTB_PARMS-1] == NULL ||
1536 RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt))
1537 goto failure;
1538
1539 parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch);
1540
1541 hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]);
1542 HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum);
1543 rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]);
1544 ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]);
1545 if (!rtab || !ctab) goto failure;
1546
1547 if (!cl) { /* new class */
1548 struct Qdisc *new_q;
1549 /* check for valid classid */
1550 if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch))
1551 goto failure;
1552
1553 /* check maximal depth */
1554 if (parent && parent->parent && parent->parent->level < 2) {
1555 printk(KERN_ERR "htb: tree is too deep\n");
1556 goto failure;
1557 }
1558 err = -ENOBUFS;
1559 if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1560 goto failure;
1561
1562 memset(cl, 0, sizeof(*cl));
1563 cl->refcnt = 1;
1564 INIT_LIST_HEAD(&cl->sibling);
1565 INIT_LIST_HEAD(&cl->hlist);
1566 INIT_LIST_HEAD(&cl->children);
1567 INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1568#ifdef HTB_DEBUG
1569 cl->magic = HTB_CMAGIC;
1570#endif
1571
1572 /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1573	   so it can't be used inside of sch_tree_lock
1574 -- thanks to Karlis Peisenieks */
1575 new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
1576 sch_tree_lock(sch);
1577 if (parent && !parent->level) {
1578 /* turn parent into inner node */
1579 sch->q.qlen -= parent->un.leaf.q->q.qlen;
1580 qdisc_destroy (parent->un.leaf.q);
1581 if (parent->prio_activity)
1582 htb_deactivate (q,parent);
1583
1584 /* remove from evt list because of level change */
1585 if (parent->cmode != HTB_CAN_SEND) {
1586 htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/);
1587 parent->cmode = HTB_CAN_SEND;
1588 }
1589 parent->level = (parent->parent ? parent->parent->level
1590 : TC_HTB_MAXDEPTH) - 1;
1591 memset (&parent->un.inner,0,sizeof(parent->un.inner));
1592 }
1593 /* leaf (we) needs elementary qdisc */
1594 cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1595
1596 cl->classid = classid; cl->parent = parent;
1597
1598 /* set class to be in HTB_CAN_SEND state */
1599 cl->tokens = hopt->buffer;
1600 cl->ctokens = hopt->cbuffer;
1601 cl->mbuffer = 60000000; /* 1min */
1602 PSCHED_GET_TIME(cl->t_c);
1603 cl->cmode = HTB_CAN_SEND;
1604
1605 /* attach to the hash list and parent's family */
1606 list_add_tail(&cl->hlist, q->hash+htb_hash(classid));
1607 list_add_tail(&cl->sibling, parent ? &parent->children : &q->root);
1608#ifdef HTB_DEBUG
1609 {
1610 int i;
1611 for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1;
1612 cl->pq_node.rb_color = -1;
1613 }
1614#endif
1615 } else sch_tree_lock(sch);
1616
1617	/* there used to be a nasty bug here: we have to check that the node
1618	   is really a leaf before changing cl->un.leaf ! */
1619 if (!cl->level) {
1620 cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1621 if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1622 printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid);
1623 cl->un.leaf.quantum = 1000;
1624 }
1625 if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1626 printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid);
1627 cl->un.leaf.quantum = 200000;
1628 }
1629 if (hopt->quantum)
1630 cl->un.leaf.quantum = hopt->quantum;
1631 if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1632 cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1633 }
1634
1635 cl->buffer = hopt->buffer;
1636 cl->cbuffer = hopt->cbuffer;
1637 if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab;
1638 if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab;
1639 sch_tree_unlock(sch);
1640
1641 *arg = (unsigned long)cl;
1642 return 0;
1643
1644failure:
1645 if (rtab) qdisc_put_rtab(rtab);
1646 if (ctab) qdisc_put_rtab(ctab);
1647 return err;
1648}
1649
1650static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1651{
1652 struct htb_sched *q = qdisc_priv(sch);
1653 struct htb_class *cl = (struct htb_class *)arg;
1654 struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1655 HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl);
1656 return fl;
1657}
1658
1659static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1660 u32 classid)
1661{
1662 struct htb_sched *q = qdisc_priv(sch);
1663 struct htb_class *cl = htb_find (classid,sch);
1664 HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt);
1665 /*if (cl && !cl->level) return 0;
1666 The line above used to be there to prevent attaching filters to
1667 leaves. But at least tc_index filter uses this just to get class
1668	for other reasons, so we have to allow for it.
1669 ----
1670 19.6.2002 As Werner explained it is ok - bind filter is just
1671 another way to "lock" the class - unlike "get" this lock can
1672 be broken by class during destroy IIUC.
1673 */
1674 if (cl)
1675 cl->filter_cnt++;
1676 else
1677 q->filter_cnt++;
1678 return (unsigned long)cl;
1679}
1680
1681static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1682{
1683 struct htb_sched *q = qdisc_priv(sch);
1684 struct htb_class *cl = (struct htb_class *)arg;
1685 HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt);
1686 if (cl)
1687 cl->filter_cnt--;
1688 else
1689 q->filter_cnt--;
1690}
1691
1692static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1693{
1694 struct htb_sched *q = qdisc_priv(sch);
1695 int i;
1696
1697 if (arg->stop)
1698 return;
1699
1700 for (i = 0; i < HTB_HSIZE; i++) {
1701 struct list_head *p;
1702 list_for_each (p,q->hash+i) {
1703 struct htb_class *cl = list_entry(p,struct htb_class,hlist);
1704 if (arg->count < arg->skip) {
1705 arg->count++;
1706 continue;
1707 }
1708 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1709 arg->stop = 1;
1710 return;
1711 }
1712 arg->count++;
1713 }
1714 }
1715}
1716
1717static struct Qdisc_class_ops htb_class_ops = {
1718 .graft = htb_graft,
1719 .leaf = htb_leaf,
1720 .get = htb_get,
1721 .put = htb_put,
1722 .change = htb_change_class,
1723 .delete = htb_delete,
1724 .walk = htb_walk,
1725 .tcf_chain = htb_find_tcf,
1726 .bind_tcf = htb_bind_filter,
1727 .unbind_tcf = htb_unbind_filter,
1728 .dump = htb_dump_class,
1729 .dump_stats = htb_dump_class_stats,
1730};
1731
1732static struct Qdisc_ops htb_qdisc_ops = {
1733 .next = NULL,
1734 .cl_ops = &htb_class_ops,
1735 .id = "htb",
1736 .priv_size = sizeof(struct htb_sched),
1737 .enqueue = htb_enqueue,
1738 .dequeue = htb_dequeue,
1739 .requeue = htb_requeue,
1740 .drop = htb_drop,
1741 .init = htb_init,
1742 .reset = htb_reset,
1743 .destroy = htb_destroy,
1744 .change = NULL /* htb_change */,
1745 .dump = htb_dump,
1746 .owner = THIS_MODULE,
1747};
1748
1749static int __init htb_module_init(void)
1750{
1751 return register_qdisc(&htb_qdisc_ops);
1752}
1753static void __exit htb_module_exit(void)
1754{
1755 unregister_qdisc(&htb_qdisc_ops);
1756}
1757module_init(htb_module_init)
1758module_exit(htb_module_exit)
1759MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
new file mode 100644
index 000000000000..8edc32a6ad2f
--- /dev/null
+++ b/net/sched/sch_ingress.c
@@ -0,0 +1,436 @@
1/* net/sched/sch_ingress.c - Ingress qdisc
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Authors: Jamal Hadi Salim 1999
8 */
9
10#include <linux/config.h>
11#include <linux/module.h>
12#include <linux/types.h>
13#include <linux/skbuff.h>
14#include <linux/netdevice.h>
15#include <linux/rtnetlink.h>
16#include <linux/netfilter_ipv4.h>
17#include <linux/netfilter_ipv6.h>
18#include <linux/netfilter.h>
19#include <linux/smp.h>
20#include <net/pkt_sched.h>
21#include <asm/byteorder.h>
22#include <asm/uaccess.h>
23#include <linux/kmod.h>
24#include <linux/stat.h>
25#include <linux/interrupt.h>
26#include <linux/list.h>
27
28
29#undef DEBUG_INGRESS
30
31#ifdef DEBUG_INGRESS /* control */
32#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
33#else
34#define DPRINTK(format,args...)
35#endif
36
37#if 0 /* data */
38#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
39#else
40#define D2PRINTK(format,args...)
41#endif
42
43
44#define PRIV(sch) qdisc_priv(sch)
45
46
47/* Thanks to Doron Oz for this hack
48*/
49#ifndef CONFIG_NET_CLS_ACT
50#ifdef CONFIG_NETFILTER
51static int nf_registered;
52#endif
53#endif
54
55struct ingress_qdisc_data {
56 struct Qdisc *q;
57 struct tcf_proto *filter_list;
58};
59
60
61/* ------------------------- Class/flow operations ------------------------- */
62
63
64static int ingress_graft(struct Qdisc *sch,unsigned long arg,
65 struct Qdisc *new,struct Qdisc **old)
66{
67#ifdef DEBUG_INGRESS
68 struct ingress_qdisc_data *p = PRIV(sch);
69#endif
70
71 DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n",
72 sch, p, new, old);
73 DPRINTK("\n ingress_graft: You cannot add qdiscs to classes");
74 return 1;
75}
76
77
78static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
79{
80 return NULL;
81}
82
83
84static unsigned long ingress_get(struct Qdisc *sch,u32 classid)
85{
86#ifdef DEBUG_INGRESS
87 struct ingress_qdisc_data *p = PRIV(sch);
88#endif
89 DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid);
90 return TC_H_MIN(classid) + 1;
91}
92
93
94static unsigned long ingress_bind_filter(struct Qdisc *sch,
95 unsigned long parent, u32 classid)
96{
97 return ingress_get(sch, classid);
98}
99
100
101static void ingress_put(struct Qdisc *sch, unsigned long cl)
102{
103}
104
105
106static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent,
107 struct rtattr **tca, unsigned long *arg)
108{
109#ifdef DEBUG_INGRESS
110 struct ingress_qdisc_data *p = PRIV(sch);
111#endif
112 DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x),"
113 "arg 0x%lx\n", sch, p, classid, parent, *arg);
114 DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment");
115 return 0;
116}
117
118
119
120static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker)
121{
122#ifdef DEBUG_INGRESS
123 struct ingress_qdisc_data *p = PRIV(sch);
124#endif
125 DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
126 DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment");
127}
128
129
130static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl)
131{
132 struct ingress_qdisc_data *p = PRIV(sch);
133
134 return &p->filter_list;
135}
136
137
138/* --------------------------- Qdisc operations ---------------------------- */
139
140
141static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
142{
143 struct ingress_qdisc_data *p = PRIV(sch);
144 struct tcf_result res;
145 int result;
146
147 D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
148 result = tc_classify(skb, p->filter_list, &res);
149 D2PRINTK("result %d class 0x%04x\n", result, res.classid);
150 /*
151 * Unlike normal "enqueue" functions, ingress_enqueue returns a
152 * firewall FW_* code.
153 */
154#ifdef CONFIG_NET_CLS_ACT
155 sch->bstats.packets++;
156 sch->bstats.bytes += skb->len;
157 switch (result) {
158 case TC_ACT_SHOT:
159 result = TC_ACT_SHOT;
160 sch->qstats.drops++;
161 break;
162 case TC_ACT_STOLEN:
163 case TC_ACT_QUEUED:
164 result = TC_ACT_STOLEN;
165 break;
166 case TC_ACT_RECLASSIFY:
167 case TC_ACT_OK:
168 case TC_ACT_UNSPEC:
169 default:
170 skb->tc_index = TC_H_MIN(res.classid);
171 result = TC_ACT_OK;
172 break;
173 };
174/* backward compat */
175#else
176#ifdef CONFIG_NET_CLS_POLICE
177 switch (result) {
178 case TC_POLICE_SHOT:
179 result = NF_DROP;
180 sch->qstats.drops++;
181 break;
182 case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */
183 case TC_POLICE_OK:
184 case TC_POLICE_UNSPEC:
185 default:
186 sch->bstats.packets++;
187 sch->bstats.bytes += skb->len;
188 result = NF_ACCEPT;
189 break;
190 };
191
192#else
193 D2PRINTK("Overriding result to ACCEPT\n");
194 result = NF_ACCEPT;
195 sch->bstats.packets++;
196 sch->bstats.bytes += skb->len;
197#endif
198#endif
199
200 return result;
201}
202
203
204static struct sk_buff *ingress_dequeue(struct Qdisc *sch)
205{
206/*
207 struct ingress_qdisc_data *p = PRIV(sch);
208 D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p));
209*/
210 return NULL;
211}
212
213
214static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch)
215{
216/*
217 struct ingress_qdisc_data *p = PRIV(sch);
218 D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p));
219*/
220 return 0;
221}
222
223static unsigned int ingress_drop(struct Qdisc *sch)
224{
225#ifdef DEBUG_INGRESS
226 struct ingress_qdisc_data *p = PRIV(sch);
227#endif
228 DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p);
229 return 0;
230}
231
232#ifndef CONFIG_NET_CLS_ACT
233#ifdef CONFIG_NETFILTER
234static unsigned int
235ing_hook(unsigned int hook, struct sk_buff **pskb,
236 const struct net_device *indev,
237 const struct net_device *outdev,
238 int (*okfn)(struct sk_buff *))
239{
240
241 struct Qdisc *q;
242 struct sk_buff *skb = *pskb;
243 struct net_device *dev = skb->dev;
244 int fwres=NF_ACCEPT;
245
246 DPRINTK("ing_hook: skb %s dev=%s len=%u\n",
247 skb->sk ? "(owned)" : "(unowned)",
248 skb->dev ? (*pskb)->dev->name : "(no dev)",
249 skb->len);
250
251/*
252revisit later: Use a private lock, since dev->queue_lock is also
253used on the egress (might slow things for an iota)
254*/
255
256 if (dev->qdisc_ingress) {
257 spin_lock(&dev->queue_lock);
258 if ((q = dev->qdisc_ingress) != NULL)
259 fwres = q->enqueue(skb, q);
260 spin_unlock(&dev->queue_lock);
261 }
262
263 return fwres;
264}
265
266/* after ipt_filter */
267static struct nf_hook_ops ing_ops = {
268 .hook = ing_hook,
269 .owner = THIS_MODULE,
270 .pf = PF_INET,
271 .hooknum = NF_IP_PRE_ROUTING,
272 .priority = NF_IP_PRI_FILTER + 1,
273};
274
275static struct nf_hook_ops ing6_ops = {
276 .hook = ing_hook,
277 .owner = THIS_MODULE,
278 .pf = PF_INET6,
279 .hooknum = NF_IP6_PRE_ROUTING,
280 .priority = NF_IP6_PRI_FILTER + 1,
281};
282
283#endif
284#endif
285
286static int ingress_init(struct Qdisc *sch,struct rtattr *opt)
287{
288 struct ingress_qdisc_data *p = PRIV(sch);
289
290/* Make sure either netfilter or preferably CLS_ACT is
291* compiled in */
292#ifndef CONFIG_NET_CLS_ACT
293#ifndef CONFIG_NETFILTER
294 printk("You MUST compile classifier actions into the kernel\n");
295 return -EINVAL;
296#else
297 printk("Ingress scheduler: Classifier actions prefered over netfilter\n");
298#endif
299#endif
300
301#ifndef CONFIG_NET_CLS_ACT
302#ifdef CONFIG_NETFILTER
303 if (!nf_registered) {
304 if (nf_register_hook(&ing_ops) < 0) {
305 printk("ingress qdisc registration error \n");
306 return -EINVAL;
307 }
308 nf_registered++;
309
310 if (nf_register_hook(&ing6_ops) < 0) {
311 printk("IPv6 ingress qdisc registration error, " \
312 "disabling IPv6 support.\n");
313 } else
314 nf_registered++;
315 }
316#endif
317#endif
318
319 DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
320 p->q = &noop_qdisc;
321 return 0;
322}
323
324
325static void ingress_reset(struct Qdisc *sch)
326{
327 struct ingress_qdisc_data *p = PRIV(sch);
328
329 DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p);
330
331/*
332#if 0
333*/
334/* for future use */
335 qdisc_reset(p->q);
336/*
337#endif
338*/
339}
340
341/* ------------------------------------------------------------- */
342
343
344/* ------------------------------------------------------------- */
345
346static void ingress_destroy(struct Qdisc *sch)
347{
348 struct ingress_qdisc_data *p = PRIV(sch);
349 struct tcf_proto *tp;
350
351 DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p);
352 while (p->filter_list) {
353 tp = p->filter_list;
354 p->filter_list = tp->next;
355 tcf_destroy(tp);
356 }
357#if 0
358/* for future use */
359 qdisc_destroy(p->q);
360#endif
361}
362
363
364static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
365{
366 unsigned char *b = skb->tail;
367 struct rtattr *rta;
368
369 rta = (struct rtattr *) b;
370 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
371 rta->rta_len = skb->tail - b;
372 return skb->len;
373
374rtattr_failure:
375 skb_trim(skb, b - skb->data);
376 return -1;
377}
378
379static struct Qdisc_class_ops ingress_class_ops = {
380 .graft = ingress_graft,
381 .leaf = ingress_leaf,
382 .get = ingress_get,
383 .put = ingress_put,
384 .change = ingress_change,
385 .delete = NULL,
386 .walk = ingress_walk,
387 .tcf_chain = ingress_find_tcf,
388 .bind_tcf = ingress_bind_filter,
389 .unbind_tcf = ingress_put,
390 .dump = NULL,
391};
392
393static struct Qdisc_ops ingress_qdisc_ops = {
394 .next = NULL,
395 .cl_ops = &ingress_class_ops,
396 .id = "ingress",
397 .priv_size = sizeof(struct ingress_qdisc_data),
398 .enqueue = ingress_enqueue,
399 .dequeue = ingress_dequeue,
400 .requeue = ingress_requeue,
401 .drop = ingress_drop,
402 .init = ingress_init,
403 .reset = ingress_reset,
404 .destroy = ingress_destroy,
405 .change = NULL,
406 .dump = ingress_dump,
407 .owner = THIS_MODULE,
408};
409
410static int __init ingress_module_init(void)
411{
412 int ret = 0;
413
414 if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) {
415 printk("Unable to register Ingress qdisc\n");
416 return ret;
417 }
418
419 return ret;
420}
421static void __exit ingress_module_exit(void)
422{
423 unregister_qdisc(&ingress_qdisc_ops);
424#ifndef CONFIG_NET_CLS_ACT
425#ifdef CONFIG_NETFILTER
426 if (nf_registered) {
427 nf_unregister_hook(&ing_ops);
428 if (nf_registered > 1)
429 nf_unregister_hook(&ing6_ops);
430 }
431#endif
432#endif
433}
434module_init(ingress_module_init)
435module_exit(ingress_module_exit)
436MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
new file mode 100644
index 000000000000..31c29deb139d
--- /dev/null
+++ b/net/sched/sch_netem.c
@@ -0,0 +1,598 @@
1/*
2 * net/sched/sch_netem.c Network emulator
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
11 *
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/errno.h>
22#include <linux/netdevice.h>
23#include <linux/skbuff.h>
24#include <linux/rtnetlink.h>
25
26#include <net/pkt_sched.h>
27
28/* Network Emulation Queuing algorithm.
29 ====================================
30
31 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
32		 Network Emulation Tool"
33 [2] Luigi Rizzo, DummyNet for FreeBSD
34
35 ----------------------------------------------------------------
36
37 This started out as a simple way to delay outgoing packets to
38 test TCP but has grown to include most of the functionality
39 of a full blown network emulator like NISTnet. It can delay
40 packets and add random jitter (and correlation). The random
41 distribution can be loaded from a table as well to provide
42 normal, Pareto, or experimental curves. Packet loss,
43 duplication, and reordering can also be emulated.
44
45	 This qdisc does not do classification; that can be handled by
46	 layering other disciplines. It does not need to do bandwidth
47 control either since that can be handled by using token
48 bucket or other rate control.
49
50 The simulator is limited by the Linux timer resolution
51 and will create packet bursts on the HZ boundary (1ms).
52*/
53
54struct netem_sched_data {
55 struct Qdisc *qdisc;
56 struct sk_buff_head delayed;
57 struct timer_list timer;
58
59 u32 latency;
60 u32 loss;
61 u32 limit;
62 u32 counter;
63 u32 gap;
64 u32 jitter;
65 u32 duplicate;
66
67 struct crndstate {
68 unsigned long last;
69 unsigned long rho;
70 } delay_cor, loss_cor, dup_cor;
71
72 struct disttable {
73 u32 size;
74 s16 table[0];
75 } *delay_dist;
76};
77
78/* Time stamp put into socket buffer control block */
79struct netem_skb_cb {
80 psched_time_t time_to_send;
81};
82
83/* init_crandom - initialize correlated random number generator
84 * Use entropy source for initial seed.
85 */
86static void init_crandom(struct crndstate *state, unsigned long rho)
87{
88 state->rho = rho;
89 state->last = net_random();
90}
91
92/* get_crandom - correlated random number generator
93 * Next number depends on last value.
94 * rho is scaled to avoid floating point.
95 */
96static unsigned long get_crandom(struct crndstate *state)
97{
98 u64 value, rho;
99 unsigned long answer;
100
101	if (state->rho == 0)	/* no correlation */
102 return net_random();
103
104 value = net_random();
105 rho = (u64)state->rho + 1;
106 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
107 state->last = answer;
108 return answer;
109}
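
A user-space sketch of the same correlated blend, handy for seeing how rho shapes the output; uniform32() is a hypothetical stand-in for net_random(), not a kernel interface:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t uniform32(void)		/* stand-in for net_random() */
{
	return ((uint32_t)rand() << 16) ^ (uint32_t)rand();
}

/* Same arithmetic as get_crandom(): each output is a fixed-point blend
 * of fresh noise and the previous output, weighted by rho. */
static uint32_t crandom_demo(uint32_t *last, uint32_t rho)
{
	uint64_t value = uniform32();
	uint64_t r = (uint64_t)rho + 1;
	uint32_t answer = (value * ((1ull << 32) - r) + *last * r) >> 32;

	*last = answer;
	return answer;
}

int main(void)
{
	uint32_t last = 0;
	int i;

	/* rho close to 2^32 makes successive samples cling to each other */
	for (i = 0; i < 5; i++)
		printf("%u\n", crandom_demo(&last, 0xf0000000u));
	return 0;
}

With rho = 0 the blend degenerates to plain uniform noise, which matches the short-circuit at the top of get_crandom().
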
110
111/* tabledist - return a pseudo-randomly distributed value with mean mu and
112 * std deviation sigma. Uses table lookup to approximate the desired
113 * distribution, and a uniformly-distributed pseudo-random source.
114 */
115static long tabledist(unsigned long mu, long sigma,
116 struct crndstate *state, const struct disttable *dist)
117{
118 long t, x;
119 unsigned long rnd;
120
121 if (sigma == 0)
122 return mu;
123
124 rnd = get_crandom(state);
125
126 /* default uniform distribution */
127 if (dist == NULL)
128 return (rnd % (2*sigma)) - sigma + mu;
129
130 t = dist->table[rnd % dist->size];
131 x = (sigma % NETEM_DIST_SCALE) * t;
132 if (x >= 0)
133 x += NETEM_DIST_SCALE/2;
134 else
135 x -= NETEM_DIST_SCALE/2;
136
137 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
138}
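
As a worked example of the uniform branch (dist == NULL): with mu = 100000us and sigma = 10000us, rnd % (2*sigma) falls in 0..19999, so the returned delay spans 90000..109999us, i.e. mu +/- sigma. A table loaded via TCA_NETEM_DELAY_DIST only replaces that flat spread with the loaded shape (normal, Pareto, and so on, as noted in the header comment).
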
139
140/* Put skb in the private delayed queue. */
141static int delay_skb(struct Qdisc *sch, struct sk_buff *skb)
142{
143 struct netem_sched_data *q = qdisc_priv(sch);
144 struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
145 psched_tdiff_t td;
146 psched_time_t now;
147
148 PSCHED_GET_TIME(now);
149 td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
150 PSCHED_TADD2(now, td, cb->time_to_send);
151
152 /* Always queue at tail to keep packets in order */
153 if (likely(q->delayed.qlen < q->limit)) {
154 __skb_queue_tail(&q->delayed, skb);
155 if (!timer_pending(&q->timer)) {
156 q->timer.expires = jiffies + PSCHED_US2JIFFIE(td);
157 add_timer(&q->timer);
158 }
159 return NET_XMIT_SUCCESS;
160 }
161
162 kfree_skb(skb);
163 return NET_XMIT_DROP;
164}
165
166static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
167{
168 struct netem_sched_data *q = qdisc_priv(sch);
169 struct sk_buff *skb2;
170 int ret;
171
172 pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);
173
174 /* Random packet drop 0 => none, ~0 => all */
175 if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
176 pr_debug("netem_enqueue: random loss\n");
177 sch->qstats.drops++;
178 kfree_skb(skb);
179 return 0; /* lie about loss so TCP doesn't know */
180 }
181
182 /* Random duplication */
183 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)
184 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
185 pr_debug("netem_enqueue: dup %p\n", skb2);
186
187 if (delay_skb(sch, skb2)) {
188 sch->q.qlen++;
189 sch->bstats.bytes += skb2->len;
190 sch->bstats.packets++;
191 } else
192 sch->qstats.drops++;
193 }
194
195	/* If doing simple delay then gap == 0, so all packets
196	 * go into the delayed holding queue.
197	 * Otherwise, when emulating reordering, only "1 out of gap"
198	 * packets will be delayed.
199 */
200 if (q->counter < q->gap) {
201 ++q->counter;
202 ret = q->qdisc->enqueue(skb, q->qdisc);
203 } else {
204 q->counter = 0;
205 ret = delay_skb(sch, skb);
206 }
207
208 if (likely(ret == NET_XMIT_SUCCESS)) {
209 sch->q.qlen++;
210 sch->bstats.bytes += skb->len;
211 sch->bstats.packets++;
212 } else
213 sch->qstats.drops++;
214
215 return ret;
216}
217
218/* Requeue packets but don't change time stamp */
219static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
220{
221 struct netem_sched_data *q = qdisc_priv(sch);
222 int ret;
223
224 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
225 sch->q.qlen++;
226 sch->qstats.requeues++;
227 }
228
229 return ret;
230}
231
232static unsigned int netem_drop(struct Qdisc* sch)
233{
234 struct netem_sched_data *q = qdisc_priv(sch);
235 unsigned int len;
236
237 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
238 sch->q.qlen--;
239 sch->qstats.drops++;
240 }
241 return len;
242}
243
244/* Dequeue packet.
245 * Move all packets that are ready to send from the delay holding
246 * list to the underlying qdisc, then just call dequeue
247 */
248static struct sk_buff *netem_dequeue(struct Qdisc *sch)
249{
250 struct netem_sched_data *q = qdisc_priv(sch);
251 struct sk_buff *skb;
252
253 skb = q->qdisc->dequeue(q->qdisc);
254 if (skb)
255 sch->q.qlen--;
256 return skb;
257}
258
259static void netem_watchdog(unsigned long arg)
260{
261 struct Qdisc *sch = (struct Qdisc *)arg;
262 struct netem_sched_data *q = qdisc_priv(sch);
263 struct net_device *dev = sch->dev;
264 struct sk_buff *skb;
265 psched_time_t now;
266
267 pr_debug("netem_watchdog: fired @%lu\n", jiffies);
268
269 spin_lock_bh(&dev->queue_lock);
270 PSCHED_GET_TIME(now);
271
272 while ((skb = skb_peek(&q->delayed)) != NULL) {
273 const struct netem_skb_cb *cb
274 = (const struct netem_skb_cb *)skb->cb;
275 long delay
276 = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
277 pr_debug("netem_watchdog: skb %p@%lu %ld\n",
278 skb, jiffies, delay);
279
280		/* more time remaining? re-arm the timer and stop */
281 if (delay > 0) {
282 mod_timer(&q->timer, jiffies + delay);
283 break;
284 }
285 __skb_unlink(skb, &q->delayed);
286
287 if (q->qdisc->enqueue(skb, q->qdisc)) {
288 sch->q.qlen--;
289 sch->qstats.drops++;
290 }
291 }
292 qdisc_run(dev);
293 spin_unlock_bh(&dev->queue_lock);
294}
295
296static void netem_reset(struct Qdisc *sch)
297{
298 struct netem_sched_data *q = qdisc_priv(sch);
299
300 qdisc_reset(q->qdisc);
301 skb_queue_purge(&q->delayed);
302
303 sch->q.qlen = 0;
304 del_timer_sync(&q->timer);
305}
306
307static int set_fifo_limit(struct Qdisc *q, int limit)
308{
309 struct rtattr *rta;
310 int ret = -ENOMEM;
311
312 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
313 if (rta) {
314 rta->rta_type = RTM_NEWQDISC;
315 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
316 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
317
318 ret = q->ops->change(q, rta);
319 kfree(rta);
320 }
321 return ret;
322}
323
324/*
325 * Distribution data is a variable size payload containing
326 * signed 16 bit values.
327 */
328static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
329{
330 struct netem_sched_data *q = qdisc_priv(sch);
331 unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
332 const __s16 *data = RTA_DATA(attr);
333 struct disttable *d;
334 int i;
335
336 if (n > 65536)
337 return -EINVAL;
338
339 d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
340 if (!d)
341 return -ENOMEM;
342
343 d->size = n;
344 for (i = 0; i < n; i++)
345 d->table[i] = data[i];
346
347 spin_lock_bh(&sch->dev->queue_lock);
348 d = xchg(&q->delay_dist, d);
349 spin_unlock_bh(&sch->dev->queue_lock);
350
351 kfree(d);
352 return 0;
353}
354
355static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
356{
357 struct netem_sched_data *q = qdisc_priv(sch);
358 const struct tc_netem_corr *c = RTA_DATA(attr);
359
360 if (RTA_PAYLOAD(attr) != sizeof(*c))
361 return -EINVAL;
362
363 init_crandom(&q->delay_cor, c->delay_corr);
364 init_crandom(&q->loss_cor, c->loss_corr);
365 init_crandom(&q->dup_cor, c->dup_corr);
366 return 0;
367}
368
369static int netem_change(struct Qdisc *sch, struct rtattr *opt)
370{
371 struct netem_sched_data *q = qdisc_priv(sch);
372 struct tc_netem_qopt *qopt;
373 int ret;
374
375 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
376 return -EINVAL;
377
378 qopt = RTA_DATA(opt);
379 ret = set_fifo_limit(q->qdisc, qopt->limit);
380 if (ret) {
381 pr_debug("netem: can't set fifo limit\n");
382 return ret;
383 }
384
385 q->latency = qopt->latency;
386 q->jitter = qopt->jitter;
387 q->limit = qopt->limit;
388 q->gap = qopt->gap;
389 q->loss = qopt->loss;
390 q->duplicate = qopt->duplicate;
391
392 /* Handle nested options after initial queue options.
393	 * Should have put all options in nested format, but it is too late now.
394 */
395 if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
396 struct rtattr *tb[TCA_NETEM_MAX];
397 if (rtattr_parse(tb, TCA_NETEM_MAX,
398 RTA_DATA(opt) + sizeof(*qopt),
399 RTA_PAYLOAD(opt) - sizeof(*qopt)))
400 return -EINVAL;
401
402 if (tb[TCA_NETEM_CORR-1]) {
403 ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
404 if (ret)
405 return ret;
406 }
407
408 if (tb[TCA_NETEM_DELAY_DIST-1]) {
409 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
410 if (ret)
411 return ret;
412 }
413 }
414
415
416 return 0;
417}
418
419static int netem_init(struct Qdisc *sch, struct rtattr *opt)
420{
421 struct netem_sched_data *q = qdisc_priv(sch);
422 int ret;
423
424 if (!opt)
425 return -EINVAL;
426
427 skb_queue_head_init(&q->delayed);
428 init_timer(&q->timer);
429 q->timer.function = netem_watchdog;
430 q->timer.data = (unsigned long) sch;
431 q->counter = 0;
432
433 q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
434 if (!q->qdisc) {
435 pr_debug("netem: qdisc create failed\n");
436 return -ENOMEM;
437 }
438
439 ret = netem_change(sch, opt);
440 if (ret) {
441 pr_debug("netem: change failed\n");
442 qdisc_destroy(q->qdisc);
443 }
444 return ret;
445}
446
447static void netem_destroy(struct Qdisc *sch)
448{
449 struct netem_sched_data *q = qdisc_priv(sch);
450
451 del_timer_sync(&q->timer);
452 qdisc_destroy(q->qdisc);
453 kfree(q->delay_dist);
454}
455
456static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
457{
458 const struct netem_sched_data *q = qdisc_priv(sch);
459 unsigned char *b = skb->tail;
460 struct rtattr *rta = (struct rtattr *) b;
461 struct tc_netem_qopt qopt;
462 struct tc_netem_corr cor;
463
464 qopt.latency = q->latency;
465 qopt.jitter = q->jitter;
466 qopt.limit = q->limit;
467 qopt.loss = q->loss;
468 qopt.gap = q->gap;
469 qopt.duplicate = q->duplicate;
470 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
471
472 cor.delay_corr = q->delay_cor.rho;
473 cor.loss_corr = q->loss_cor.rho;
474 cor.dup_corr = q->dup_cor.rho;
475 RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
476 rta->rta_len = skb->tail - b;
477
478 return skb->len;
479
480rtattr_failure:
481 skb_trim(skb, b - skb->data);
482 return -1;
483}
484
485static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
486 struct sk_buff *skb, struct tcmsg *tcm)
487{
488 struct netem_sched_data *q = qdisc_priv(sch);
489
490 if (cl != 1) /* only one class */
491 return -ENOENT;
492
493 tcm->tcm_handle |= TC_H_MIN(1);
494 tcm->tcm_info = q->qdisc->handle;
495
496 return 0;
497}
498
499static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
500 struct Qdisc **old)
501{
502 struct netem_sched_data *q = qdisc_priv(sch);
503
504 if (new == NULL)
505 new = &noop_qdisc;
506
507 sch_tree_lock(sch);
508 *old = xchg(&q->qdisc, new);
509 qdisc_reset(*old);
510 sch->q.qlen = 0;
511 sch_tree_unlock(sch);
512
513 return 0;
514}
515
516static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
517{
518 struct netem_sched_data *q = qdisc_priv(sch);
519 return q->qdisc;
520}
521
522static unsigned long netem_get(struct Qdisc *sch, u32 classid)
523{
524 return 1;
525}
526
527static void netem_put(struct Qdisc *sch, unsigned long arg)
528{
529}
530
531static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
532 struct rtattr **tca, unsigned long *arg)
533{
534 return -ENOSYS;
535}
536
537static int netem_delete(struct Qdisc *sch, unsigned long arg)
538{
539 return -ENOSYS;
540}
541
542static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
543{
544 if (!walker->stop) {
545 if (walker->count >= walker->skip)
546 if (walker->fn(sch, 1, walker) < 0) {
547 walker->stop = 1;
548 return;
549 }
550 walker->count++;
551 }
552}
553
554static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
555{
556 return NULL;
557}
558
559static struct Qdisc_class_ops netem_class_ops = {
560 .graft = netem_graft,
561 .leaf = netem_leaf,
562 .get = netem_get,
563 .put = netem_put,
564 .change = netem_change_class,
565 .delete = netem_delete,
566 .walk = netem_walk,
567 .tcf_chain = netem_find_tcf,
568 .dump = netem_dump_class,
569};
570
571static struct Qdisc_ops netem_qdisc_ops = {
572 .id = "netem",
573 .cl_ops = &netem_class_ops,
574 .priv_size = sizeof(struct netem_sched_data),
575 .enqueue = netem_enqueue,
576 .dequeue = netem_dequeue,
577 .requeue = netem_requeue,
578 .drop = netem_drop,
579 .init = netem_init,
580 .reset = netem_reset,
581 .destroy = netem_destroy,
582 .change = netem_change,
583 .dump = netem_dump,
584 .owner = THIS_MODULE,
585};
586
587
588static int __init netem_module_init(void)
589{
590 return register_qdisc(&netem_qdisc_ops);
591}
592static void __exit netem_module_exit(void)
593{
594 unregister_qdisc(&netem_qdisc_ops);
595}
596module_init(netem_module_init)
597module_exit(netem_module_exit)
598MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
new file mode 100644
index 000000000000..3ac0f495bad0
--- /dev/null
+++ b/net/sched/sch_prio.c
@@ -0,0 +1,444 @@
1/*
2 * net/sched/sch_prio.c Simple 3-band priority "scheduler".
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
11 * Init -- EINVAL when opt undefined
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/if_ether.h>
30#include <linux/inet.h>
31#include <linux/netdevice.h>
32#include <linux/etherdevice.h>
33#include <linux/notifier.h>
34#include <net/ip.h>
35#include <net/route.h>
36#include <linux/skbuff.h>
37#include <net/sock.h>
38#include <net/pkt_sched.h>
39
40
41struct prio_sched_data
42{
43 int bands;
44 struct tcf_proto *filter_list;
45 u8 prio2band[TC_PRIO_MAX+1];
46 struct Qdisc *queues[TCQ_PRIO_BANDS];
47};
48
49
50static struct Qdisc *
51prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
52{
53 struct prio_sched_data *q = qdisc_priv(sch);
54 u32 band = skb->priority;
55 struct tcf_result res;
56
57 *qerr = NET_XMIT_DROP;
58 if (TC_H_MAJ(skb->priority) != sch->handle) {
59#ifdef CONFIG_NET_CLS_ACT
60 switch (tc_classify(skb, q->filter_list, &res)) {
61 case TC_ACT_STOLEN:
62 case TC_ACT_QUEUED:
63 *qerr = NET_XMIT_SUCCESS;
64 case TC_ACT_SHOT:
65 return NULL;
66 };
67
68 if (!q->filter_list ) {
69#else
70 if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) {
71#endif
72 if (TC_H_MAJ(band))
73 band = 0;
74 return q->queues[q->prio2band[band&TC_PRIO_MAX]];
75 }
76 band = res.classid;
77 }
78 band = TC_H_MIN(band) - 1;
79 if (band > q->bands)
80 return q->queues[q->prio2band[0]];
81
82 return q->queues[band];
83}
84
85static int
86prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
87{
88 struct Qdisc *qdisc;
89 int ret;
90
91 qdisc = prio_classify(skb, sch, &ret);
92#ifdef CONFIG_NET_CLS_ACT
93 if (qdisc == NULL) {
94 if (ret == NET_XMIT_DROP)
95 sch->qstats.drops++;
96 kfree_skb(skb);
97 return ret;
98 }
99#endif
100
101 if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) {
102 sch->bstats.bytes += skb->len;
103 sch->bstats.packets++;
104 sch->q.qlen++;
105 return NET_XMIT_SUCCESS;
106 }
107 sch->qstats.drops++;
108 return ret;
109}
110
111
112static int
113prio_requeue(struct sk_buff *skb, struct Qdisc* sch)
114{
115 struct Qdisc *qdisc;
116 int ret;
117
118 qdisc = prio_classify(skb, sch, &ret);
119#ifdef CONFIG_NET_CLS_ACT
120 if (qdisc == NULL) {
121 if (ret == NET_XMIT_DROP)
122 sch->qstats.drops++;
123 kfree_skb(skb);
124 return ret;
125 }
126#endif
127
128 if ((ret = qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) {
129 sch->q.qlen++;
130 sch->qstats.requeues++;
131 return 0;
132 }
133 sch->qstats.drops++;
134 return NET_XMIT_DROP;
135}
136
137
138static struct sk_buff *
139prio_dequeue(struct Qdisc* sch)
140{
141 struct sk_buff *skb;
142 struct prio_sched_data *q = qdisc_priv(sch);
143 int prio;
144 struct Qdisc *qdisc;
145
146 for (prio = 0; prio < q->bands; prio++) {
147 qdisc = q->queues[prio];
148 skb = qdisc->dequeue(qdisc);
149 if (skb) {
150 sch->q.qlen--;
151 return skb;
152 }
153 }
154 return NULL;
155
156}
157
158static unsigned int prio_drop(struct Qdisc* sch)
159{
160 struct prio_sched_data *q = qdisc_priv(sch);
161 int prio;
162 unsigned int len;
163 struct Qdisc *qdisc;
164
165 for (prio = q->bands-1; prio >= 0; prio--) {
166 qdisc = q->queues[prio];
167 if ((len = qdisc->ops->drop(qdisc)) != 0) {
168 sch->q.qlen--;
169 return len;
170 }
171 }
172 return 0;
173}
174
175
176static void
177prio_reset(struct Qdisc* sch)
178{
179 int prio;
180 struct prio_sched_data *q = qdisc_priv(sch);
181
182 for (prio=0; prio<q->bands; prio++)
183 qdisc_reset(q->queues[prio]);
184 sch->q.qlen = 0;
185}
186
187static void
188prio_destroy(struct Qdisc* sch)
189{
190 int prio;
191 struct prio_sched_data *q = qdisc_priv(sch);
192 struct tcf_proto *tp;
193
194 while ((tp = q->filter_list) != NULL) {
195 q->filter_list = tp->next;
196 tcf_destroy(tp);
197 }
198
199 for (prio=0; prio<q->bands; prio++)
200 qdisc_destroy(q->queues[prio]);
201}
202
203static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
204{
205 struct prio_sched_data *q = qdisc_priv(sch);
206 struct tc_prio_qopt *qopt = RTA_DATA(opt);
207 int i;
208
209 if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
210 return -EINVAL;
211 if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
212 return -EINVAL;
213
214 for (i=0; i<=TC_PRIO_MAX; i++) {
215 if (qopt->priomap[i] >= qopt->bands)
216 return -EINVAL;
217 }
218
219 sch_tree_lock(sch);
220 q->bands = qopt->bands;
221 memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
222
223 for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
224 struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
225 if (child != &noop_qdisc)
226 qdisc_destroy(child);
227 }
228 sch_tree_unlock(sch);
229
230 for (i=0; i<=TC_PRIO_MAX; i++) {
231 int band = q->prio2band[i];
232 if (q->queues[band] == &noop_qdisc) {
233 struct Qdisc *child;
234 child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
235 if (child) {
236 sch_tree_lock(sch);
237 child = xchg(&q->queues[band], child);
238
239 if (child != &noop_qdisc)
240 qdisc_destroy(child);
241 sch_tree_unlock(sch);
242 }
243 }
244 }
245 return 0;
246}
247
248static int prio_init(struct Qdisc *sch, struct rtattr *opt)
249{
250 struct prio_sched_data *q = qdisc_priv(sch);
251 int i;
252
253 for (i=0; i<TCQ_PRIO_BANDS; i++)
254 q->queues[i] = &noop_qdisc;
255
256 if (opt == NULL) {
257 return -EINVAL;
258 } else {
259 int err;
260
261 if ((err= prio_tune(sch, opt)) != 0)
262 return err;
263 }
264 return 0;
265}
266
267static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
268{
269 struct prio_sched_data *q = qdisc_priv(sch);
270 unsigned char *b = skb->tail;
271 struct tc_prio_qopt opt;
272
273 opt.bands = q->bands;
274 memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
275 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
276 return skb->len;
277
278rtattr_failure:
279 skb_trim(skb, b - skb->data);
280 return -1;
281}
282
283static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
284 struct Qdisc **old)
285{
286 struct prio_sched_data *q = qdisc_priv(sch);
287 unsigned long band = arg - 1;
288
289 if (band >= q->bands)
290 return -EINVAL;
291
292 if (new == NULL)
293 new = &noop_qdisc;
294
295 sch_tree_lock(sch);
296 *old = q->queues[band];
297 q->queues[band] = new;
298 sch->q.qlen -= (*old)->q.qlen;
299 qdisc_reset(*old);
300 sch_tree_unlock(sch);
301
302 return 0;
303}
304
305static struct Qdisc *
306prio_leaf(struct Qdisc *sch, unsigned long arg)
307{
308 struct prio_sched_data *q = qdisc_priv(sch);
309 unsigned long band = arg - 1;
310
311 if (band >= q->bands)
312 return NULL;
313
314 return q->queues[band];
315}
316
317static unsigned long prio_get(struct Qdisc *sch, u32 classid)
318{
319 struct prio_sched_data *q = qdisc_priv(sch);
320 unsigned long band = TC_H_MIN(classid);
321
322 if (band - 1 >= q->bands)
323 return 0;
324 return band;
325}
326
327static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
328{
329 return prio_get(sch, classid);
330}
331
332
333static void prio_put(struct Qdisc *q, unsigned long cl)
334{
335 return;
336}
337
338static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
339{
340 unsigned long cl = *arg;
341 struct prio_sched_data *q = qdisc_priv(sch);
342
343 if (cl - 1 > q->bands)
344 return -ENOENT;
345 return 0;
346}
347
348static int prio_delete(struct Qdisc *sch, unsigned long cl)
349{
350 struct prio_sched_data *q = qdisc_priv(sch);
351 if (cl - 1 > q->bands)
352 return -ENOENT;
353 return 0;
354}
355
356
357static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
358 struct tcmsg *tcm)
359{
360 struct prio_sched_data *q = qdisc_priv(sch);
361
362 if (cl - 1 > q->bands)
363 return -ENOENT;
364 tcm->tcm_handle |= TC_H_MIN(cl);
365 if (q->queues[cl-1])
366 tcm->tcm_info = q->queues[cl-1]->handle;
367 return 0;
368}
369
370static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
371{
372 struct prio_sched_data *q = qdisc_priv(sch);
373 int prio;
374
375 if (arg->stop)
376 return;
377
378 for (prio = 0; prio < q->bands; prio++) {
379 if (arg->count < arg->skip) {
380 arg->count++;
381 continue;
382 }
383 if (arg->fn(sch, prio+1, arg) < 0) {
384 arg->stop = 1;
385 break;
386 }
387 arg->count++;
388 }
389}
390
391static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl)
392{
393 struct prio_sched_data *q = qdisc_priv(sch);
394
395 if (cl)
396 return NULL;
397 return &q->filter_list;
398}
399
400static struct Qdisc_class_ops prio_class_ops = {
401 .graft = prio_graft,
402 .leaf = prio_leaf,
403 .get = prio_get,
404 .put = prio_put,
405 .change = prio_change,
406 .delete = prio_delete,
407 .walk = prio_walk,
408 .tcf_chain = prio_find_tcf,
409 .bind_tcf = prio_bind,
410 .unbind_tcf = prio_put,
411 .dump = prio_dump_class,
412};
413
414static struct Qdisc_ops prio_qdisc_ops = {
415 .next = NULL,
416 .cl_ops = &prio_class_ops,
417 .id = "prio",
418 .priv_size = sizeof(struct prio_sched_data),
419 .enqueue = prio_enqueue,
420 .dequeue = prio_dequeue,
421 .requeue = prio_requeue,
422 .drop = prio_drop,
423 .init = prio_init,
424 .reset = prio_reset,
425 .destroy = prio_destroy,
426 .change = prio_tune,
427 .dump = prio_dump,
428 .owner = THIS_MODULE,
429};
430
431static int __init prio_module_init(void)
432{
433 return register_qdisc(&prio_qdisc_ops);
434}
435
436static void __exit prio_module_exit(void)
437{
438 unregister_qdisc(&prio_qdisc_ops);
439}
440
441module_init(prio_module_init)
442module_exit(prio_module_exit)
443
444MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
new file mode 100644
index 000000000000..664d0e47374f
--- /dev/null
+++ b/net/sched/sch_red.c
@@ -0,0 +1,459 @@
1/*
2 * net/sched/sch_red.c Random Early Detection queue.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Changes:
12 * J Hadi Salim <hadi@nortel.com> 980914: computation fixes
13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14 * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support
15 */
16
17#include <linux/config.h>
18#include <linux/module.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/interrupt.h>
32#include <linux/if_ether.h>
33#include <linux/inet.h>
34#include <linux/netdevice.h>
35#include <linux/etherdevice.h>
36#include <linux/notifier.h>
37#include <net/ip.h>
38#include <net/route.h>
39#include <linux/skbuff.h>
40#include <net/sock.h>
41#include <net/pkt_sched.h>
42#include <net/inet_ecn.h>
43#include <net/dsfield.h>
44
45
46/* Random Early Detection (RED) algorithm.
47 =======================================
48
49 Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
50 for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
51
52 This file codes a "divisionless" version of RED algorithm
53 as written down in Fig.17 of the paper.
54
55Short description.
56------------------
57
58 When a new packet arrives we calculate the average queue length:
59
60 avg = (1-W)*avg + W*current_queue_len,
61
62 W is the filter time constant (chosen as 2^(-Wlog)); it controls
63 the inertia of the algorithm. To allow larger bursts, W should be
64 decreased.
65
66 if (avg > th_max) -> packet marked (dropped).
67 if (avg < th_min) -> packet passes.
68 if (th_min < avg < th_max) we calculate probability:
69
70 Pb = max_P * (avg - th_min)/(th_max-th_min)
71
72 and mark (drop) packet with this probability.
73 Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
74 max_P should be small (not 1), usually 0.01..0.02 is a good value.
75
76 max_P is chosen as a number such that max_P/(th_max-th_min)
77 is a negative power of two, so that the arithmetic
78 contains only shifts.
79
80
81 Parameters, settable by user:
82 -----------------------------
83
84 limit - bytes (must be > qth_max + burst)
85
86 Hard limit on queue length, should be chosen >qth_max
87 to allow packet bursts. This parameter does not
88 affect the algorithm's behaviour and can be chosen
89 arbitrarily high (well, less than RAM size).
90 Really, this limit will never be reached
91 if RED works correctly.
92
93 qth_min - bytes (should be < qth_max/2)
94 qth_max - bytes (should be at least 2*qth_min and less than limit)
95 Wlog - bits (<32) log(1/W).
96 Plog - bits (<32)
97
98 Plog is related to max_P by formula:
99
100 max_P = (qth_max-qth_min)/2^Plog;
101
102 F.e. if qth_max=128K and qth_min=32K, then Plog=22
103 corresponds to max_P=0.02
104
105 Scell_log
106 Stab
107
108 Lookup table for log((1-W)^(t/t_ave)).
109
110
111NOTES:
112
113Upper bound on W.
114-----------------
115
116 If you want to allow bursts of L packets of size S,
117 you should choose W:
118
119 L + 1 - th_min/S < (1-(1-W)^L)/W
120
121 th_min/S = 32 th_min/S = 4
122
123 log(W) L
124 -1 33
125 -2 35
126 -3 39
127 -4 46
128 -5 57
129 -6 75
130 -7 101
131 -8 135
132 -9 190
133 etc.
134 */
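
A small user-space check of the relations quoted above, using the example figures from the comment (qth_min = 32K, qth_max = 128K, Plog = 22); nothing here is kernel API, it is only the textbook arithmetic:

#include <stdio.h>

/* Textbook marking probability from the description above. */
static double red_pb(double avg, double th_min, double th_max, double max_p)
{
	if (avg <= th_min)
		return 0.0;
	if (avg >= th_max)
		return 1.0;
	return max_p * (avg - th_min) / (th_max - th_min);
}

int main(void)
{
	double qth_min = 32.0 * 1024, qth_max = 128.0 * 1024;
	int plog = 22;
	double max_p = (qth_max - qth_min) / (double)(1UL << plog);

	/* prints ~0.0234, i.e. roughly the 0.02 quoted above */
	printf("max_P = %.4f\n", max_p);
	/* halfway between the thresholds Pb is max_P/2 */
	printf("Pb(mid) = %.4f\n",
	       red_pb((qth_min + qth_max) / 2.0, qth_min, qth_max, max_p));
	return 0;
}
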
135
136struct red_sched_data
137{
138/* Parameters */
139 u32 limit; /* HARD maximal queue length */
140 u32 qth_min; /* Min average length threshold: A scaled */
141 u32 qth_max; /* Max average length threshold: A scaled */
142 u32 Rmask;
143 u32 Scell_max;
144 unsigned char flags;
145 char Wlog; /* log(W) */
146 char Plog; /* random number bits */
147 char Scell_log;
148 u8 Stab[256];
149
150/* Variables */
151 unsigned long qave; /* Average queue length: A scaled */
152 int qcount; /* Packets since last random number generation */
153 u32 qR; /* Cached random number */
154
155 psched_time_t qidlestart; /* Start of idle period */
156 struct tc_red_xstats st;
157};
158
159static int red_ecn_mark(struct sk_buff *skb)
160{
161 if (skb->nh.raw + 20 > skb->tail)
162 return 0;
163
164 switch (skb->protocol) {
165 case __constant_htons(ETH_P_IP):
166 if (INET_ECN_is_not_ect(skb->nh.iph->tos))
167 return 0;
168 IP_ECN_set_ce(skb->nh.iph);
169 return 1;
170 case __constant_htons(ETH_P_IPV6):
171 if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
172 return 0;
173 IP6_ECN_set_ce(skb->nh.ipv6h);
174 return 1;
175 default:
176 return 0;
177 }
178}
179
180static int
181red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
182{
183 struct red_sched_data *q = qdisc_priv(sch);
184
185 psched_time_t now;
186
187 if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
188 long us_idle;
189 int shift;
190
191 PSCHED_GET_TIME(now);
192 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
193 PSCHED_SET_PASTPERFECT(q->qidlestart);
194
195/*
196	The problem: ideally, average queue length recalculation should
197	be done over constant clock intervals. This is too expensive, so
198	the calculation is driven by outgoing packets instead.
199 When the queue is idle we have to model this clock by hand.
200
201 SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
202 dummy packets as a burst after idle time, i.e.
203
204 q->qave *= (1-W)^m
205
206	This is an apparently overcomplicated solution (f.e. we have to precompute
207	a table to make this calculation in reasonable time).
208	I believe that a simpler model may be used here,
209	but it is a field for experiments.
210*/
211 shift = q->Stab[us_idle>>q->Scell_log];
212
213 if (shift) {
214 q->qave >>= shift;
215 } else {
216 /* Approximate initial part of exponent
217 with linear function:
218 (1-W)^m ~= 1-mW + ...
219
220			   This seems to be the best solution to the
221			   problem of too coarse exponent tabulation.
222 */
223
224 us_idle = (q->qave * us_idle)>>q->Scell_log;
225 if (us_idle < q->qave/2)
226 q->qave -= us_idle;
227 else
228 q->qave >>= 1;
229 }
230 } else {
231 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
232 /* NOTE:
233		   q->qave is a fixed point number with the point at Wlog.
234		   The formula above is equivalent to the floating point
235 version:
236
237 qave = qave*(1-W) + sch->qstats.backlog*W;
238 --ANK (980924)
239 */
240 }
241
242 if (q->qave < q->qth_min) {
243 q->qcount = -1;
244enqueue:
245 if (sch->qstats.backlog + skb->len <= q->limit) {
246 __skb_queue_tail(&sch->q, skb);
247 sch->qstats.backlog += skb->len;
248 sch->bstats.bytes += skb->len;
249 sch->bstats.packets++;
250 return NET_XMIT_SUCCESS;
251 } else {
252 q->st.pdrop++;
253 }
254 kfree_skb(skb);
255 sch->qstats.drops++;
256 return NET_XMIT_DROP;
257 }
258 if (q->qave >= q->qth_max) {
259 q->qcount = -1;
260 sch->qstats.overlimits++;
261mark:
262 if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
263 q->st.early++;
264 goto drop;
265 }
266 q->st.marked++;
267 goto enqueue;
268 }
269
270 if (++q->qcount) {
271 /* The formula used below causes questions.
272
273		   OK. qR is a random number in the interval 0..Rmask,
274		   i.e. 0..(2^Plog - 1). If we used floating point
275		   arithmetic, it would be: (2^Plog)*rnd_num,
276		   where rnd_num is less than 1.
277
278		   Taking into account that qave has a fixed
279		   point at Wlog, and that Plog is related to max_P by
280		   max_P = (qth_max-qth_min)/2^Plog, the two lines
281 below have the following floating point equivalent:
282
283 max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
284
285 Any questions? --ANK (980924)
286 */
287 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
288 goto enqueue;
289 q->qcount = 0;
290 q->qR = net_random()&q->Rmask;
291 sch->qstats.overlimits++;
292 goto mark;
293 }
294 q->qR = net_random()&q->Rmask;
295 goto enqueue;
296
297drop:
298 kfree_skb(skb);
299 sch->qstats.drops++;
300 return NET_XMIT_CN;
301}
302
303static int
304red_requeue(struct sk_buff *skb, struct Qdisc* sch)
305{
306 struct red_sched_data *q = qdisc_priv(sch);
307
308 PSCHED_SET_PASTPERFECT(q->qidlestart);
309
310 __skb_queue_head(&sch->q, skb);
311 sch->qstats.backlog += skb->len;
312 sch->qstats.requeues++;
313 return 0;
314}
315
316static struct sk_buff *
317red_dequeue(struct Qdisc* sch)
318{
319 struct sk_buff *skb;
320 struct red_sched_data *q = qdisc_priv(sch);
321
322 skb = __skb_dequeue(&sch->q);
323 if (skb) {
324 sch->qstats.backlog -= skb->len;
325 return skb;
326 }
327 PSCHED_GET_TIME(q->qidlestart);
328 return NULL;
329}
330
331static unsigned int red_drop(struct Qdisc* sch)
332{
333 struct sk_buff *skb;
334 struct red_sched_data *q = qdisc_priv(sch);
335
336 skb = __skb_dequeue_tail(&sch->q);
337 if (skb) {
338 unsigned int len = skb->len;
339 sch->qstats.backlog -= len;
340 sch->qstats.drops++;
341 q->st.other++;
342 kfree_skb(skb);
343 return len;
344 }
345 PSCHED_GET_TIME(q->qidlestart);
346 return 0;
347}
348
349static void red_reset(struct Qdisc* sch)
350{
351 struct red_sched_data *q = qdisc_priv(sch);
352
353 __skb_queue_purge(&sch->q);
354 sch->qstats.backlog = 0;
355 PSCHED_SET_PASTPERFECT(q->qidlestart);
356 q->qave = 0;
357 q->qcount = -1;
358}
359
360static int red_change(struct Qdisc *sch, struct rtattr *opt)
361{
362 struct red_sched_data *q = qdisc_priv(sch);
363 struct rtattr *tb[TCA_RED_STAB];
364 struct tc_red_qopt *ctl;
365
366 if (opt == NULL ||
367 rtattr_parse_nested(tb, TCA_RED_STAB, opt) ||
368 tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
369 RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
370 RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
371 return -EINVAL;
372
373 ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
374
375 sch_tree_lock(sch);
376 q->flags = ctl->flags;
377 q->Wlog = ctl->Wlog;
378 q->Plog = ctl->Plog;
379 q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
380 q->Scell_log = ctl->Scell_log;
381 q->Scell_max = (255<<q->Scell_log);
382 q->qth_min = ctl->qth_min<<ctl->Wlog;
383 q->qth_max = ctl->qth_max<<ctl->Wlog;
384 q->limit = ctl->limit;
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386
387 q->qcount = -1;
388 if (skb_queue_len(&sch->q) == 0)
389 PSCHED_SET_PASTPERFECT(q->qidlestart);
390 sch_tree_unlock(sch);
391 return 0;
392}
393
394static int red_init(struct Qdisc* sch, struct rtattr *opt)
395{
396 return red_change(sch, opt);
397}
398
399static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
400{
401 struct red_sched_data *q = qdisc_priv(sch);
402 unsigned char *b = skb->tail;
403 struct rtattr *rta;
404 struct tc_red_qopt opt;
405
406 rta = (struct rtattr*)b;
407 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
408 opt.limit = q->limit;
409 opt.qth_min = q->qth_min>>q->Wlog;
410 opt.qth_max = q->qth_max>>q->Wlog;
411 opt.Wlog = q->Wlog;
412 opt.Plog = q->Plog;
413 opt.Scell_log = q->Scell_log;
414 opt.flags = q->flags;
415 RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
416 rta->rta_len = skb->tail - b;
417
418 return skb->len;
419
420rtattr_failure:
421 skb_trim(skb, b - skb->data);
422 return -1;
423}
424
425static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
426{
427 struct red_sched_data *q = qdisc_priv(sch);
428
429 return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
430}
431
432static struct Qdisc_ops red_qdisc_ops = {
433 .next = NULL,
434 .cl_ops = NULL,
435 .id = "red",
436 .priv_size = sizeof(struct red_sched_data),
437 .enqueue = red_enqueue,
438 .dequeue = red_dequeue,
439 .requeue = red_requeue,
440 .drop = red_drop,
441 .init = red_init,
442 .reset = red_reset,
443 .change = red_change,
444 .dump = red_dump,
445 .dump_stats = red_dump_stats,
446 .owner = THIS_MODULE,
447};
448
449static int __init red_module_init(void)
450{
451 return register_qdisc(&red_qdisc_ops);
452}
453static void __exit red_module_exit(void)
454{
455 unregister_qdisc(&red_qdisc_ops);
456}
457module_init(red_module_init)
458module_exit(red_module_exit)
459MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
new file mode 100644
index 000000000000..8734bb7280e3
--- /dev/null
+++ b/net/sched/sch_sfq.c
@@ -0,0 +1,497 @@
1/*
2 * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/jiffies.h>
20#include <linux/string.h>
21#include <linux/mm.h>
22#include <linux/socket.h>
23#include <linux/sockios.h>
24#include <linux/in.h>
25#include <linux/errno.h>
26#include <linux/interrupt.h>
27#include <linux/if_ether.h>
28#include <linux/inet.h>
29#include <linux/netdevice.h>
30#include <linux/etherdevice.h>
31#include <linux/notifier.h>
32#include <linux/init.h>
33#include <net/ip.h>
34#include <linux/ipv6.h>
35#include <net/route.h>
36#include <linux/skbuff.h>
37#include <net/sock.h>
38#include <net/pkt_sched.h>
39
40
41/* Stochastic Fairness Queuing algorithm.
42 =======================================
43
44 Source:
45 Paul E. McKenney "Stochastic Fairness Queuing",
46 IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
47
48 Paul E. McKenney "Stochastic Fairness Queuing",
49 "Interworking: Research and Experience", v.2, 1991, p.113-131.
50
51
52 See also:
53 M. Shreedhar and George Varghese "Efficient Fair
54 Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
55
56
57 This is not the thing that is usually called (W)FQ nowadays.
58 It does not use any timestamp mechanism, but instead
59 processes queues in round-robin order.
60
61 ADVANTAGE:
62
63 - It is very cheap. Both CPU and memory requirements are minimal.
64
65 DRAWBACKS:
66
67 - "Stochastic" -> It is not 100% fair.
68 When hash collisions occur, several flows are considered as one.
69
70 - "Round-robin" -> It introduces larger delays than virtual clock
71 based schemes, and should not be used for isolating interactive
72 traffic from non-interactive. It means that this scheduler
73 should be used as a leaf of CBQ or P3, which puts interactive
74 traffic into a higher priority band.
75
76 We still need true WFQ for top level CSZ, but using WFQ
77 for the best effort traffic is absolutely pointless:
78 SFQ is superior for this purpose.
79
80 IMPLEMENTATION:
81 This implementation limits maximal queue length to 128;
82 maximal mtu to 2^15-1; number of hash buckets to 1024.
83 The only goal of these restrictions was that all the data
84 fit into one 4K page :-). Struct sfq_sched_data is
85 organized in an anti-cache manner: all the data for a bucket
86 are scattered over different locations. This is not good,
87 but it allowed me to put it into 4K.
88
89 It is easy to increase these values, but not in flight. */
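
A toy user-space illustration of the "stochastic" caveat above: flows are hashed into a limited number of buckets, and two distinct flows that collide share one queue until the perturbation reshuffles the mapping. The bucket count and hash below are simplified stand-ins, not the kernel's sfq_hash():

#include <stdint.h>
#include <stdio.h>

#define DEMO_BUCKETS 16			/* stand-in for SFQ_HASH_DIVISOR */

/* Fold a (saddr, daddr, proto) flow key into a bucket, mixing in a
 * perturbation value so the mapping can be reshuffled periodically. */
static unsigned demo_bucket(uint32_t saddr, uint32_t daddr, uint8_t proto,
			    unsigned perturb)
{
	uint32_t h = daddr ^ ((saddr ^ proto) << perturb);

	h ^= h >> 10;
	return h % DEMO_BUCKETS;
}

int main(void)
{
	/* Two flows to the same destination: with few buckets they may
	 * land in the same slot, i.e. be treated as a single flow. */
	printf("flow1 -> %u, flow2 -> %u\n",
	       demo_bucket(0x0a000001, 0x0a000002, 6, 3),
	       demo_bucket(0x0a000003, 0x0a000002, 17, 3));
	return 0;
}
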
90
91#define SFQ_DEPTH 128
92#define SFQ_HASH_DIVISOR 1024
93
94/* This type should contain at least SFQ_DEPTH*2 values */
95typedef unsigned char sfq_index;
96
97struct sfq_head
98{
99 sfq_index next;
100 sfq_index prev;
101};
102
103struct sfq_sched_data
104{
105/* Parameters */
106 int perturb_period;
107 unsigned quantum; /* Allotment per round: MUST BE >= MTU */
108 int limit;
109
110/* Variables */
111 struct timer_list perturb_timer;
112 int perturbation;
113 sfq_index tail; /* Index of current slot in round */
114 sfq_index max_depth; /* Maximal depth */
115
116 sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
117 sfq_index next[SFQ_DEPTH]; /* Active slots link */
118 short allot[SFQ_DEPTH]; /* Current allotment per slot */
119 unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
120 struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
121 struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */
122};
123
124static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
125{
126 int pert = q->perturbation;
127
128 /* Have we any rotation primitives? If not, WHY? */
129 h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
130 h ^= h>>10;
131 return h & 0x3FF;
132}
133
134static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
135{
136 u32 h, h2;
137
138 switch (skb->protocol) {
139 case __constant_htons(ETH_P_IP):
140 {
141 struct iphdr *iph = skb->nh.iph;
142 h = iph->daddr;
143 h2 = iph->saddr^iph->protocol;
144 if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
145 (iph->protocol == IPPROTO_TCP ||
146 iph->protocol == IPPROTO_UDP ||
147 iph->protocol == IPPROTO_ESP))
148 h2 ^= *(((u32*)iph) + iph->ihl);
149 break;
150 }
151 case __constant_htons(ETH_P_IPV6):
152 {
153 struct ipv6hdr *iph = skb->nh.ipv6h;
154 h = iph->daddr.s6_addr32[3];
155 h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
156 if (iph->nexthdr == IPPROTO_TCP ||
157 iph->nexthdr == IPPROTO_UDP ||
158 iph->nexthdr == IPPROTO_ESP)
159 h2 ^= *(u32*)&iph[1];
160 break;
161 }
162 default:
163 h = (u32)(unsigned long)skb->dst^skb->protocol;
164 h2 = (u32)(unsigned long)skb->sk;
165 }
166 return sfq_fold_hash(q, h, h2);
167}
168
169static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
170{
171 sfq_index p, n;
172 int d = q->qs[x].qlen + SFQ_DEPTH;
173
174 p = d;
175 n = q->dep[d].next;
176 q->dep[x].next = n;
177 q->dep[x].prev = p;
178 q->dep[p].next = q->dep[n].prev = x;
179}
180
181static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
182{
183 sfq_index p, n;
184
185 n = q->dep[x].next;
186 p = q->dep[x].prev;
187 q->dep[p].next = n;
188 q->dep[n].prev = p;
189
190 if (n == p && q->max_depth == q->qs[x].qlen + 1)
191 q->max_depth--;
192
193 sfq_link(q, x);
194}
195
196static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
197{
198 sfq_index p, n;
199 int d;
200
201 n = q->dep[x].next;
202 p = q->dep[x].prev;
203 q->dep[p].next = n;
204 q->dep[n].prev = p;
205 d = q->qs[x].qlen;
206 if (q->max_depth < d)
207 q->max_depth = d;
208
209 sfq_link(q, x);
210}
211
212static unsigned int sfq_drop(struct Qdisc *sch)
213{
214 struct sfq_sched_data *q = qdisc_priv(sch);
215 sfq_index d = q->max_depth;
216 struct sk_buff *skb;
217 unsigned int len;
218
219 /* Queue is full! Find the longest slot and
220 drop a packet from it */
221
222 if (d > 1) {
223 sfq_index x = q->dep[d+SFQ_DEPTH].next;
224 skb = q->qs[x].prev;
225 len = skb->len;
226 __skb_unlink(skb, &q->qs[x]);
227 kfree_skb(skb);
228 sfq_dec(q, x);
229 sch->q.qlen--;
230 sch->qstats.drops++;
231 return len;
232 }
233
234 if (d == 1) {
235 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
236 d = q->next[q->tail];
237 q->next[q->tail] = q->next[d];
238 q->allot[q->next[d]] += q->quantum;
239 skb = q->qs[d].prev;
240 len = skb->len;
241 __skb_unlink(skb, &q->qs[d]);
242 kfree_skb(skb);
243 sfq_dec(q, d);
244 sch->q.qlen--;
245 q->ht[q->hash[d]] = SFQ_DEPTH;
246 sch->qstats.drops++;
247 return len;
248 }
249
250 return 0;
251}
252
253static int
254sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
255{
256 struct sfq_sched_data *q = qdisc_priv(sch);
257 unsigned hash = sfq_hash(q, skb);
258 sfq_index x;
259
260 x = q->ht[hash];
261 if (x == SFQ_DEPTH) {
262 q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
263 q->hash[x] = hash;
264 }
265 __skb_queue_tail(&q->qs[x], skb);
266 sfq_inc(q, x);
267 if (q->qs[x].qlen == 1) { /* The flow is new */
268 if (q->tail == SFQ_DEPTH) { /* It is the first flow */
269 q->tail = x;
270 q->next[x] = x;
271 q->allot[x] = q->quantum;
272 } else {
273 q->next[x] = q->next[q->tail];
274 q->next[q->tail] = x;
275 q->tail = x;
276 }
277 }
278 if (++sch->q.qlen < q->limit-1) {
279 sch->bstats.bytes += skb->len;
280 sch->bstats.packets++;
281 return 0;
282 }
283
284 sfq_drop(sch);
285 return NET_XMIT_CN;
286}
287
288static int
289sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
290{
291 struct sfq_sched_data *q = qdisc_priv(sch);
292 unsigned hash = sfq_hash(q, skb);
293 sfq_index x;
294
295 x = q->ht[hash];
296 if (x == SFQ_DEPTH) {
297 q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
298 q->hash[x] = hash;
299 }
300 __skb_queue_head(&q->qs[x], skb);
301 sfq_inc(q, x);
302 if (q->qs[x].qlen == 1) { /* The flow is new */
303 if (q->tail == SFQ_DEPTH) { /* It is the first flow */
304 q->tail = x;
305 q->next[x] = x;
306 q->allot[x] = q->quantum;
307 } else {
308 q->next[x] = q->next[q->tail];
309 q->next[q->tail] = x;
310 q->tail = x;
311 }
312 }
313 if (++sch->q.qlen < q->limit - 1) {
314 sch->qstats.requeues++;
315 return 0;
316 }
317
318 sch->qstats.drops++;
319 sfq_drop(sch);
320 return NET_XMIT_CN;
321}
322
323
324
325
326static struct sk_buff *
327sfq_dequeue(struct Qdisc* sch)
328{
329 struct sfq_sched_data *q = qdisc_priv(sch);
330 struct sk_buff *skb;
331 sfq_index a, old_a;
332
333 /* No active slots */
334 if (q->tail == SFQ_DEPTH)
335 return NULL;
336
337 a = old_a = q->next[q->tail];
338
339 /* Grab packet */
340 skb = __skb_dequeue(&q->qs[a]);
341 sfq_dec(q, a);
342 sch->q.qlen--;
343
344 /* Is the slot empty? */
345 if (q->qs[a].qlen == 0) {
346 q->ht[q->hash[a]] = SFQ_DEPTH;
347 a = q->next[a];
348 if (a == old_a) {
349 q->tail = SFQ_DEPTH;
350 return skb;
351 }
352 q->next[q->tail] = a;
353 q->allot[a] += q->quantum;
354 } else if ((q->allot[a] -= skb->len) <= 0) {
355 q->tail = a;
356 a = q->next[a];
357 q->allot[a] += q->quantum;
358 }
359 return skb;
360}
361
362static void
363sfq_reset(struct Qdisc* sch)
364{
365 struct sk_buff *skb;
366
367 while ((skb = sfq_dequeue(sch)) != NULL)
368 kfree_skb(skb);
369}
370
371static void sfq_perturbation(unsigned long arg)
372{
373 struct Qdisc *sch = (struct Qdisc*)arg;
374 struct sfq_sched_data *q = qdisc_priv(sch);
375
376 q->perturbation = net_random()&0x1F;
377
378 if (q->perturb_period) {
379 q->perturb_timer.expires = jiffies + q->perturb_period;
380 add_timer(&q->perturb_timer);
381 }
382}
383
384static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
385{
386 struct sfq_sched_data *q = qdisc_priv(sch);
387 struct tc_sfq_qopt *ctl = RTA_DATA(opt);
388
389 if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
390 return -EINVAL;
391
392 sch_tree_lock(sch);
393 q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
394 q->perturb_period = ctl->perturb_period*HZ;
395 if (ctl->limit)
396 q->limit = min_t(u32, ctl->limit, SFQ_DEPTH);
397
398 while (sch->q.qlen >= q->limit-1)
399 sfq_drop(sch);
400
401 del_timer(&q->perturb_timer);
402 if (q->perturb_period) {
403 q->perturb_timer.expires = jiffies + q->perturb_period;
404 add_timer(&q->perturb_timer);
405 }
406 sch_tree_unlock(sch);
407 return 0;
408}
409
410static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
411{
412 struct sfq_sched_data *q = qdisc_priv(sch);
413 int i;
414
415 init_timer(&q->perturb_timer);
416 q->perturb_timer.data = (unsigned long)sch;
417 q->perturb_timer.function = sfq_perturbation;
418
419 for (i=0; i<SFQ_HASH_DIVISOR; i++)
420 q->ht[i] = SFQ_DEPTH;
421 for (i=0; i<SFQ_DEPTH; i++) {
422 skb_queue_head_init(&q->qs[i]);
423 q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
424 q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
425 }
426 q->limit = SFQ_DEPTH;
427 q->max_depth = 0;
428 q->tail = SFQ_DEPTH;
429 if (opt == NULL) {
430 q->quantum = psched_mtu(sch->dev);
431 q->perturb_period = 0;
432 } else {
433 int err = sfq_change(sch, opt);
434 if (err)
435 return err;
436 }
437 for (i=0; i<SFQ_DEPTH; i++)
438 sfq_link(q, i);
439 return 0;
440}
441
442static void sfq_destroy(struct Qdisc *sch)
443{
444 struct sfq_sched_data *q = qdisc_priv(sch);
445 del_timer(&q->perturb_timer);
446}
447
448static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
449{
450 struct sfq_sched_data *q = qdisc_priv(sch);
451 unsigned char *b = skb->tail;
452 struct tc_sfq_qopt opt;
453
454 opt.quantum = q->quantum;
455 opt.perturb_period = q->perturb_period/HZ;
456
457 opt.limit = q->limit;
458 opt.divisor = SFQ_HASH_DIVISOR;
459 opt.flows = q->limit;
460
461 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
462
463 return skb->len;
464
465rtattr_failure:
466 skb_trim(skb, b - skb->data);
467 return -1;
468}
469
470static struct Qdisc_ops sfq_qdisc_ops = {
471 .next = NULL,
472 .cl_ops = NULL,
473 .id = "sfq",
474 .priv_size = sizeof(struct sfq_sched_data),
475 .enqueue = sfq_enqueue,
476 .dequeue = sfq_dequeue,
477 .requeue = sfq_requeue,
478 .drop = sfq_drop,
479 .init = sfq_init,
480 .reset = sfq_reset,
481 .destroy = sfq_destroy,
482 .change = NULL,
483 .dump = sfq_dump,
484 .owner = THIS_MODULE,
485};
486
487static int __init sfq_module_init(void)
488{
489 return register_qdisc(&sfq_qdisc_ops);
490}
491static void __exit sfq_module_exit(void)
492{
493 unregister_qdisc(&sfq_qdisc_ops);
494}
495module_init(sfq_module_init)
496module_exit(sfq_module_exit)
497MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
new file mode 100644
index 000000000000..cb9711ea8c6c
--- /dev/null
+++ b/net/sched/sch_tbf.c
@@ -0,0 +1,543 @@
1/*
2 * net/sched/sch_tbf.c Token Bucket Filter queue.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
11 * original idea by Martin Devera
12 *
13 */
14
15#include <linux/config.h>
16#include <linux/module.h>
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <linux/bitops.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/jiffies.h>
23#include <linux/string.h>
24#include <linux/mm.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/in.h>
28#include <linux/errno.h>
29#include <linux/interrupt.h>
30#include <linux/if_ether.h>
31#include <linux/inet.h>
32#include <linux/netdevice.h>
33#include <linux/etherdevice.h>
34#include <linux/notifier.h>
35#include <net/ip.h>
36#include <net/route.h>
37#include <linux/skbuff.h>
38#include <net/sock.h>
39#include <net/pkt_sched.h>
40
41
42/* Simple Token Bucket Filter.
43 =======================================
44
45 SOURCE.
46 -------
47
48 None.
49
50 Description.
51 ------------
52
53 A data flow obeys TBF with rate R and depth B, if for any
54 time interval t_i...t_f the number of transmitted bits
55 does not exceed B + R*(t_f-t_i).
56
57 Packetized version of this definition:
58 The sequence of packets of sizes s_i served at moments t_i
59 obeys TBF, if for any i<=k:
60
61 s_i+....+s_k <= B + R*(t_k - t_i)
62
63 Algorithm.
64 ----------
65
66 Let N(t_i) be B/R initially and N(t) grow continuously with time as:
67
68 N(t+delta) = min{B/R, N(t) + delta}
69
70 If the first packet in queue has length S, it may be
71 transmitted only at the time t_* when S/R <= N(t_*),
72 and in this case N(t) jumps:
73
74 N(t_* + 0) = N(t_* - 0) - S/R.
75
76
77
78 Actually, QoS requires two TBFs to be applied to a data stream.
79 One of them controls steady state burst size, another
80 one with rate P (peak rate) and depth M (equal to link MTU)
81 limits bursts at a smaller time scale.
82
83 It is easy to see that P>R, and B>M. If P is infinity, this double
84 TBF is equivalent to a single one.
85
86 When TBF works in reshaping mode, latency is estimated as:
87
88 lat = max ((L-B)/R, (L-M)/P)
89
90
91 NOTES.
92 ------
93
94 If TBF throttles, it starts a watchdog timer, which will wake it up
95 when it is ready to transmit.
96 Note that the minimal timer resolution is 1/HZ.
97 If no new packets arrive during this period,
98 or if the device is not awakened by EOI for some previous packet,
99 TBF can stop its activity for 1/HZ.
100
101
102 This means that with depth B, the maximal rate is
103
104 R_crit = B*HZ
105
106 F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
107
108 Note that the peak rate TBF is much tougher: with MTU 1500
109 P_crit = 150Kbytes/sec. So, if you need greater peak
110 rates, use alpha with HZ=1000 :-)
111
112 With classful TBF, limit is just kept for backwards compatibility.
113 It is passed to the default bfifo qdisc - if the inner qdisc is
114 changed the limit is not effective anymore.
115*/
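
A user-space sketch of the continuous-time definition above: N grows at one second per second up to B/R, and a packet of size S may leave once S/R <= N, consuming S/R. Units and the demo values are illustrative only, not taken from this patch:

#include <stdio.h>

struct demo_tbf {
	double rate;	/* R, bits per second */
	double depth;	/* B, bits            */
	double n;	/* N(t), seconds      */
	double last;	/* last update time   */
};

static int demo_tbf_send(struct demo_tbf *t, double now, double size)
{
	t->n += now - t->last;			/* N grows with time...    */
	if (t->n > t->depth / t->rate)
		t->n = t->depth / t->rate;	/* ...but saturates at B/R */
	t->last = now;

	if (size / t->rate > t->n)
		return 0;			/* would exceed the bucket */
	t->n -= size / t->rate;			/* N jumps down by S/R     */
	return 1;				/* conforms, transmit      */
}

int main(void)
{
	/* 1 Mbit/s rate with a 10 kbit bucket, initially full */
	struct demo_tbf t = { .rate = 1e6, .depth = 1e4, .n = 1e4 / 1e6 };
	int first, second;

	first = demo_tbf_send(&t, 0.0, 8000.0);		/* fits in the burst */
	second = demo_tbf_send(&t, 0.0, 8000.0);	/* throttled */
	printf("%d %d\n", first, second);		/* prints "1 0" */
	return 0;
}

The R_crit note above follows from the same picture: with HZ=100 the watchdog can sleep for 10ms, so a 10 Mbit/s rate needs a bucket of at least about 100 kbit (roughly the ~10 Kbytes quoted) to ride out one tick.
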
116
117struct tbf_sched_data
118{
119/* Parameters */
120 u32 limit; /* Maximal length of backlog: bytes */
121 u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
122 u32 mtu;
123 u32 max_size;
124 struct qdisc_rate_table *R_tab;
125 struct qdisc_rate_table *P_tab;
126
127/* Variables */
128 long tokens; /* Current number of B tokens */
129 long ptokens; /* Current number of P tokens */
130 psched_time_t t_c; /* Time check-point */
131 struct timer_list wd_timer; /* Watchdog timer */
132 struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
133};
134
135#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
136#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
137
138static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
139{
140 struct tbf_sched_data *q = qdisc_priv(sch);
141 int ret;
142
143 if (skb->len > q->max_size) {
144 sch->qstats.drops++;
145#ifdef CONFIG_NET_CLS_POLICE
146 if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
147#endif
148 kfree_skb(skb);
149
150 return NET_XMIT_DROP;
151 }
152
153 if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
154 sch->qstats.drops++;
155 return ret;
156 }
157
158 sch->q.qlen++;
159 sch->bstats.bytes += skb->len;
160 sch->bstats.packets++;
161 return 0;
162}
163
164static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
165{
166 struct tbf_sched_data *q = qdisc_priv(sch);
167 int ret;
168
169 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
170 sch->q.qlen++;
171 sch->qstats.requeues++;
172 }
173
174 return ret;
175}
176
177static unsigned int tbf_drop(struct Qdisc* sch)
178{
179 struct tbf_sched_data *q = qdisc_priv(sch);
180 unsigned int len;
181
182 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
183 sch->q.qlen--;
184 sch->qstats.drops++;
185 }
186 return len;
187}
188
189static void tbf_watchdog(unsigned long arg)
190{
191 struct Qdisc *sch = (struct Qdisc*)arg;
192
193 sch->flags &= ~TCQ_F_THROTTLED;
194 netif_schedule(sch->dev);
195}
196
197static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
198{
199 struct tbf_sched_data *q = qdisc_priv(sch);
200 struct sk_buff *skb;
201
202 skb = q->qdisc->dequeue(q->qdisc);
203
204 if (skb) {
205 psched_time_t now;
206 long toks, delay;
207 long ptoks = 0;
208 unsigned int len = skb->len;
209
210 PSCHED_GET_TIME(now);
211
212 toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer);
213
214 if (q->P_tab) {
215 ptoks = toks + q->ptokens;
216 if (ptoks > (long)q->mtu)
217 ptoks = q->mtu;
218 ptoks -= L2T_P(q, len);
219 }
220 toks += q->tokens;
221 if (toks > (long)q->buffer)
222 toks = q->buffer;
223 toks -= L2T(q, len);
224
225 if ((toks|ptoks) >= 0) {
226 q->t_c = now;
227 q->tokens = toks;
228 q->ptokens = ptoks;
229 sch->q.qlen--;
230 sch->flags &= ~TCQ_F_THROTTLED;
231 return skb;
232 }
233
234 delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks));
235
236 if (delay == 0)
237 delay = 1;
238
239 mod_timer(&q->wd_timer, jiffies+delay);
240
241 /* Maybe we have a shorter packet in the queue,
242 which can be sent now. It sounds cool,
243	   but this is wrong in principle.
244 We MUST NOT reorder packets under these circumstances.
245
246 Really, if we split the flow into independent
247 subflows, it would be a very good solution.
248 This is the main idea of all FQ algorithms
249 (cf. CSZ, HPFQ, HFSC)
250 */
251
252 if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
253 /* When requeue fails skb is dropped */
254 sch->q.qlen--;
255 sch->qstats.drops++;
256 }
257
258 sch->flags |= TCQ_F_THROTTLED;
259 sch->qstats.overlimits++;
260 }
261 return NULL;
262}
263
264static void tbf_reset(struct Qdisc* sch)
265{
266 struct tbf_sched_data *q = qdisc_priv(sch);
267
268 qdisc_reset(q->qdisc);
269 sch->q.qlen = 0;
270 PSCHED_GET_TIME(q->t_c);
271 q->tokens = q->buffer;
272 q->ptokens = q->mtu;
273 sch->flags &= ~TCQ_F_THROTTLED;
274 del_timer(&q->wd_timer);
275}
276
277static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit)
278{
279 struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops);
280 struct rtattr *rta;
281 int ret;
282
283 if (q) {
284 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
285 if (rta) {
286 rta->rta_type = RTM_NEWQDISC;
287 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
288 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
289
290 ret = q->ops->change(q, rta);
291 kfree(rta);
292
293 if (ret == 0)
294 return q;
295 }
296 qdisc_destroy(q);
297 }
298
299 return NULL;
300}
301
302static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
303{
304 int err = -EINVAL;
305 struct tbf_sched_data *q = qdisc_priv(sch);
306 struct rtattr *tb[TCA_TBF_PTAB];
307 struct tc_tbf_qopt *qopt;
308 struct qdisc_rate_table *rtab = NULL;
309 struct qdisc_rate_table *ptab = NULL;
310 struct Qdisc *child = NULL;
311 int max_size,n;
312
313 if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) ||
314 tb[TCA_TBF_PARMS-1] == NULL ||
315 RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
316 goto done;
317
318 qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
319 rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
320 if (rtab == NULL)
321 goto done;
322
323 if (qopt->peakrate.rate) {
324 if (qopt->peakrate.rate > qopt->rate.rate)
325 ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
326 if (ptab == NULL)
327 goto done;
328 }
329
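	/* rtab->data[n] holds the time needed to transmit a packet of
	 * size (n << cell_log) at the configured rate, so max_size is
	 * the largest packet that can be sent within one full burst
	 * (and, if a peak rate is set, within the configured mtu's
	 * worth of peak tokens).
	 */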
330 for (n = 0; n < 256; n++)
331 if (rtab->data[n] > qopt->buffer) break;
332 max_size = (n << qopt->rate.cell_log)-1;
333 if (ptab) {
334 int size;
335
336 for (n = 0; n < 256; n++)
337 if (ptab->data[n] > qopt->mtu) break;
338 size = (n << qopt->peakrate.cell_log)-1;
339 if (size < max_size) max_size = size;
340 }
341 if (max_size < 0)
342 goto done;
343
344 if (q->qdisc == &noop_qdisc) {
345 if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL)
346 goto done;
347 }
348
349 sch_tree_lock(sch);
350 if (child) q->qdisc = child;
351 q->limit = qopt->limit;
352 q->mtu = qopt->mtu;
353 q->max_size = max_size;
354 q->buffer = qopt->buffer;
355 q->tokens = q->buffer;
356 q->ptokens = q->mtu;
357 rtab = xchg(&q->R_tab, rtab);
358 ptab = xchg(&q->P_tab, ptab);
359 sch_tree_unlock(sch);
360 err = 0;
361done:
362 if (rtab)
363 qdisc_put_rtab(rtab);
364 if (ptab)
365 qdisc_put_rtab(ptab);
366 return err;
367}
368
369static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
370{
371 struct tbf_sched_data *q = qdisc_priv(sch);
372
373 if (opt == NULL)
374 return -EINVAL;
375
376 PSCHED_GET_TIME(q->t_c);
377 init_timer(&q->wd_timer);
378 q->wd_timer.function = tbf_watchdog;
379 q->wd_timer.data = (unsigned long)sch;
380
381 q->qdisc = &noop_qdisc;
382
383 return tbf_change(sch, opt);
384}
385
386static void tbf_destroy(struct Qdisc *sch)
387{
388 struct tbf_sched_data *q = qdisc_priv(sch);
389
390 del_timer(&q->wd_timer);
391
392 if (q->P_tab)
393 qdisc_put_rtab(q->P_tab);
394 if (q->R_tab)
395 qdisc_put_rtab(q->R_tab);
396
397 qdisc_destroy(q->qdisc);
398}
399
400static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
401{
402 struct tbf_sched_data *q = qdisc_priv(sch);
403 unsigned char *b = skb->tail;
404 struct rtattr *rta;
405 struct tc_tbf_qopt opt;
406
407 rta = (struct rtattr*)b;
408 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
409
410 opt.limit = q->limit;
411 opt.rate = q->R_tab->rate;
412 if (q->P_tab)
413 opt.peakrate = q->P_tab->rate;
414 else
415 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
416 opt.mtu = q->mtu;
417 opt.buffer = q->buffer;
418 RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
419 rta->rta_len = skb->tail - b;
420
421 return skb->len;
422
423rtattr_failure:
424 skb_trim(skb, b - skb->data);
425 return -1;
426}
427
428static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
429 struct sk_buff *skb, struct tcmsg *tcm)
430{
431 struct tbf_sched_data *q = qdisc_priv(sch);
432
433 if (cl != 1) /* only one class */
434 return -ENOENT;
435
436 tcm->tcm_handle |= TC_H_MIN(1);
437 tcm->tcm_info = q->qdisc->handle;
438
439 return 0;
440}
441
442static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
443 struct Qdisc **old)
444{
445 struct tbf_sched_data *q = qdisc_priv(sch);
446
447 if (new == NULL)
448 new = &noop_qdisc;
449
450 sch_tree_lock(sch);
451 *old = xchg(&q->qdisc, new);
452 qdisc_reset(*old);
453 sch->q.qlen = 0;
454 sch_tree_unlock(sch);
455
456 return 0;
457}
458
459static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
460{
461 struct tbf_sched_data *q = qdisc_priv(sch);
462 return q->qdisc;
463}
464
465static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
466{
467 return 1;
468}
469
470static void tbf_put(struct Qdisc *sch, unsigned long arg)
471{
472}
473
474static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
475 struct rtattr **tca, unsigned long *arg)
476{
477 return -ENOSYS;
478}
479
480static int tbf_delete(struct Qdisc *sch, unsigned long arg)
481{
482 return -ENOSYS;
483}
484
485static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
486{
487 if (!walker->stop) {
488 if (walker->count >= walker->skip)
489 if (walker->fn(sch, 1, walker) < 0) {
490 walker->stop = 1;
491 return;
492 }
493 walker->count++;
494 }
495}
496
497static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
498{
499 return NULL;
500}
501
502static struct Qdisc_class_ops tbf_class_ops =
503{
504 .graft = tbf_graft,
505 .leaf = tbf_leaf,
506 .get = tbf_get,
507 .put = tbf_put,
508 .change = tbf_change_class,
509 .delete = tbf_delete,
510 .walk = tbf_walk,
511 .tcf_chain = tbf_find_tcf,
512 .dump = tbf_dump_class,
513};
514
515static struct Qdisc_ops tbf_qdisc_ops = {
516 .next = NULL,
517 .cl_ops = &tbf_class_ops,
518 .id = "tbf",
519 .priv_size = sizeof(struct tbf_sched_data),
520 .enqueue = tbf_enqueue,
521 .dequeue = tbf_dequeue,
522 .requeue = tbf_requeue,
523 .drop = tbf_drop,
524 .init = tbf_init,
525 .reset = tbf_reset,
526 .destroy = tbf_destroy,
527 .change = tbf_change,
528 .dump = tbf_dump,
529 .owner = THIS_MODULE,
530};
531
532static int __init tbf_module_init(void)
533{
534 return register_qdisc(&tbf_qdisc_ops);
535}
536
537static void __exit tbf_module_exit(void)
538{
539 unregister_qdisc(&tbf_qdisc_ops);
540}
541module_init(tbf_module_init)
542module_exit(tbf_module_exit)
543MODULE_LICENSE("GPL");
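For reference, the fields parsed by tbf_change above map directly onto
the tc(8) token bucket filter options (rate, burst/buffer, limit,
peakrate, mtu).  A minimal usage sketch; the device name and the
numbers are illustrative only:

  # tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb limit 30kb
  # tc qdisc change dev eth0 root tbf rate 1mbit burst 10kb limit 30kb \
        peakrate 2mbit mtu 1500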
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
new file mode 100644
index 000000000000..6cf0342706b5
--- /dev/null
+++ b/net/sched/sch_teql.c
@@ -0,0 +1,511 @@
1/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU General Public License
5 * as published by the Free Software Foundation; either version
6 * 2 of the License, or (at your option) any later version.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 */
10
11#include <linux/module.h>
12#include <asm/uaccess.h>
13#include <asm/system.h>
14#include <linux/bitops.h>
15#include <linux/types.h>
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/string.h>
19#include <linux/mm.h>
20#include <linux/socket.h>
21#include <linux/sockios.h>
22#include <linux/in.h>
23#include <linux/errno.h>
24#include <linux/interrupt.h>
25#include <linux/if_ether.h>
26#include <linux/inet.h>
27#include <linux/netdevice.h>
28#include <linux/etherdevice.h>
29#include <linux/notifier.h>
30#include <linux/init.h>
31#include <net/ip.h>
32#include <net/route.h>
33#include <linux/skbuff.h>
34#include <linux/moduleparam.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37
38/*
39 How to set it up.
40 ----------------
41
42 After loading this module you will find a new device teqlN
43 and a new qdisc with the same name. To join a slave to the
44 equalizer, simply attach this qdisc to that device, e.g.
45
46 # tc qdisc add dev eth0 root teql0
47 # tc qdisc add dev eth1 root teql0
48
49 That's all. Full PnP 8)
50
51 Applicability.
52 --------------
53
54 1. Slave devices MUST be active devices, i.e., they must raise the tbusy
55 signal and generate EOI events. If you want to equalize virtual devices
56 like tunnels, use a normal eql device.
57 2. This device puts no limitations on physical slave characteristics,
58    e.g. it will happily equalize a 9600 baud line and 100Mb ethernet :-)
59    Certainly, a large difference in link speeds will make the resulting
60    equalized link unusable because of heavy packet reordering;
61    roughly a factor of ~10 is the useful upper bound.
62 3. If the slave requires address resolution, only protocols using
63 neighbour cache (IPv4/IPv6) will work over the equalized link.
64 Other protocols are still allowed to use the slave device directly,
65 which will not break load balancing, though native slave
66 traffic will have the highest priority. */
67
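/* Each equalizer is a virtual "teqlN" net_device paired with its own
 * Qdisc_ops.  The slave qdiscs attached to real devices are linked
 * into a circular list through teql_sched_data->next; master->slaves
 * points at the slave where the next transmit attempt will start.
 */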
68struct teql_master
69{
70 struct Qdisc_ops qops;
71 struct net_device *dev;
72 struct Qdisc *slaves;
73 struct list_head master_list;
74 struct net_device_stats stats;
75};
76
77struct teql_sched_data
78{
79 struct Qdisc *next;
80 struct teql_master *m;
81 struct neighbour *ncache;
82 struct sk_buff_head q;
83};
84
85#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next)
86
87#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST)
88
89/* "teql*" qdisc routines */
90
91static int
92teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
93{
94 struct net_device *dev = sch->dev;
95 struct teql_sched_data *q = qdisc_priv(sch);
96
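	/* The per-slave backlog is bounded by the slave device's
	 * tx_queue_len; anything beyond that is dropped on the spot. */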
97 __skb_queue_tail(&q->q, skb);
98 if (q->q.qlen <= dev->tx_queue_len) {
99 sch->bstats.bytes += skb->len;
100 sch->bstats.packets++;
101 return 0;
102 }
103
104 __skb_unlink(skb, &q->q);
105 kfree_skb(skb);
106 sch->qstats.drops++;
107 return NET_XMIT_DROP;
108}
109
110static int
111teql_requeue(struct sk_buff *skb, struct Qdisc* sch)
112{
113 struct teql_sched_data *q = qdisc_priv(sch);
114
115 __skb_queue_head(&q->q, skb);
116 sch->qstats.requeues++;
117 return 0;
118}
119
120static struct sk_buff *
121teql_dequeue(struct Qdisc* sch)
122{
123 struct teql_sched_data *dat = qdisc_priv(sch);
124 struct sk_buff *skb;
125
126 skb = __skb_dequeue(&dat->q);
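	/* Local queue ran dry: if the master device has a real qdisc
	 * attached (i.e. it is up), make this slave the next one to be
	 * served and wake the master queue so it can refill us. */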
127 if (skb == NULL) {
128 struct net_device *m = dat->m->dev->qdisc->dev;
129 if (m) {
130 dat->m->slaves = sch;
131 netif_wake_queue(m);
132 }
133 }
134 sch->q.qlen = dat->q.qlen + dat->m->dev->qdisc->q.qlen;
135 return skb;
136}
137
138static __inline__ void
139teql_neigh_release(struct neighbour *n)
140{
141 if (n)
142 neigh_release(n);
143}
144
145static void
146teql_reset(struct Qdisc* sch)
147{
148 struct teql_sched_data *dat = qdisc_priv(sch);
149
150 skb_queue_purge(&dat->q);
151 sch->q.qlen = 0;
152 teql_neigh_release(xchg(&dat->ncache, NULL));
153}
154
155static void
156teql_destroy(struct Qdisc* sch)
157{
158 struct Qdisc *q, *prev;
159 struct teql_sched_data *dat = qdisc_priv(sch);
160 struct teql_master *master = dat->m;
161
162 if ((prev = master->slaves) != NULL) {
163 do {
164 q = NEXT_SLAVE(prev);
165 if (q == sch) {
166 NEXT_SLAVE(prev) = NEXT_SLAVE(q);
167 if (q == master->slaves) {
168 master->slaves = NEXT_SLAVE(q);
169 if (q == master->slaves) {
170 master->slaves = NULL;
171 spin_lock_bh(&master->dev->queue_lock);
172 qdisc_reset(master->dev->qdisc);
173 spin_unlock_bh(&master->dev->queue_lock);
174 }
175 }
176 skb_queue_purge(&dat->q);
177 teql_neigh_release(xchg(&dat->ncache, NULL));
178 break;
179 }
180
181 } while ((prev = q) != master->slaves);
182 }
183}
184
185static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt)
186{
187 struct net_device *dev = sch->dev;
188 struct teql_master *m = (struct teql_master*)sch->ops;
189 struct teql_sched_data *q = qdisc_priv(sch);
190
191 if (dev->hard_header_len > m->dev->hard_header_len)
192 return -EINVAL;
193
194 if (m->dev == dev)
195 return -ELOOP;
196
197 q->m = m;
198
199 skb_queue_head_init(&q->q);
200
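	/* Joining an existing set of slaves: if the master is up, the
	 * new slave must not weaken it (flags, mtu); if the master is
	 * down, shrink its flags and mtu to what the new slave supports. */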
201 if (m->slaves) {
202 if (m->dev->flags & IFF_UP) {
203 if ((m->dev->flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT))
204 || (m->dev->flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST))
205 || (m->dev->flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST))
206 || dev->mtu < m->dev->mtu)
207 return -EINVAL;
208 } else {
209 if (!(dev->flags&IFF_POINTOPOINT))
210 m->dev->flags &= ~IFF_POINTOPOINT;
211 if (!(dev->flags&IFF_BROADCAST))
212 m->dev->flags &= ~IFF_BROADCAST;
213 if (!(dev->flags&IFF_MULTICAST))
214 m->dev->flags &= ~IFF_MULTICAST;
215 if (dev->mtu < m->dev->mtu)
216 m->dev->mtu = dev->mtu;
217 }
218 q->next = NEXT_SLAVE(m->slaves);
219 NEXT_SLAVE(m->slaves) = sch;
220 } else {
221 q->next = sch;
222 m->slaves = sch;
223 m->dev->mtu = dev->mtu;
224 m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
225 }
226 return 0;
227}
228
229/* "teql*" netdevice routines */
230
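/* Build the link-layer header for skb on the chosen slave, reusing the
 * cached neighbour entry when it matches skb's destination.  Returns 0
 * when the header has been filled in, 1 when skb_res was handed to the
 * neighbour layer pending address resolution, and a negative errno
 * (-EAGAIN if resolution is simply not ready and skb_res is NULL)
 * otherwise.
 */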
231static int
232__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
233{
234 struct teql_sched_data *q = qdisc_priv(dev->qdisc);
235 struct neighbour *mn = skb->dst->neighbour;
236 struct neighbour *n = q->ncache;
237
238 if (mn->tbl == NULL)
239 return -EINVAL;
240 if (n && n->tbl == mn->tbl &&
241 memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
242 atomic_inc(&n->refcnt);
243 } else {
244 n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
245 if (IS_ERR(n))
246 return PTR_ERR(n);
247 }
248 if (neigh_event_send(n, skb_res) == 0) {
249 int err;
250 read_lock(&n->lock);
251 err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len);
252 read_unlock(&n->lock);
253 if (err < 0) {
254 neigh_release(n);
255 return -EINVAL;
256 }
257 teql_neigh_release(xchg(&q->ncache, n));
258 return 0;
259 }
260 neigh_release(n);
261 return (skb_res == NULL) ? -EAGAIN : 1;
262}
263
264static __inline__ int
265teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
266{
267 if (dev->hard_header == NULL ||
268 skb->dst == NULL ||
269 skb->dst->neighbour == NULL)
270 return 0;
271 return __teql_resolve(skb, skb_res, dev);
272}
273
274static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
275{
276 struct teql_master *master = (void*)dev->priv;
277 struct Qdisc *start, *q;
278 int busy;
279 int nores;
280 int len = skb->len;
281 struct sk_buff *skb_res = NULL;
282
283 start = master->slaves;
284
285restart:
286 nores = 0;
287 busy = 0;
288
289 if ((q = start) == NULL)
290 goto drop;
291
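	/* Round-robin over the circular slave list starting at
	 * master->slaves.  resolve == 0: header built, try to grab the
	 * slave's xmit lock and send immediately; == 1: the neighbour
	 * layer took the packet for address resolution, we are done;
	 * otherwise note the failure and, after a full pass, retry once
	 * with skb_res = skb so one of the slaves can take the packet
	 * for resolution.
	 */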
292 do {
293 struct net_device *slave = q->dev;
294
295 if (slave->qdisc_sleeping != q)
296 continue;
297 if (netif_queue_stopped(slave) || ! netif_running(slave)) {
298 busy = 1;
299 continue;
300 }
301
302 switch (teql_resolve(skb, skb_res, slave)) {
303 case 0:
304 if (spin_trylock(&slave->xmit_lock)) {
305 slave->xmit_lock_owner = smp_processor_id();
306 if (!netif_queue_stopped(slave) &&
307 slave->hard_start_xmit(skb, slave) == 0) {
308 slave->xmit_lock_owner = -1;
309 spin_unlock(&slave->xmit_lock);
310 master->slaves = NEXT_SLAVE(q);
311 netif_wake_queue(dev);
312 master->stats.tx_packets++;
313 master->stats.tx_bytes += len;
314 return 0;
315 }
316 slave->xmit_lock_owner = -1;
317 spin_unlock(&slave->xmit_lock);
318 }
319 if (netif_queue_stopped(dev))
320 busy = 1;
321 break;
322 case 1:
323 master->slaves = NEXT_SLAVE(q);
324 return 0;
325 default:
326 nores = 1;
327 break;
328 }
329 __skb_pull(skb, skb->nh.raw - skb->data);
330 } while ((q = NEXT_SLAVE(q)) != start);
331
332 if (nores && skb_res == NULL) {
333 skb_res = skb;
334 goto restart;
335 }
336
337 if (busy) {
338 netif_stop_queue(dev);
339 return 1;
340 }
341 master->stats.tx_errors++;
342
343drop:
344 master->stats.tx_dropped++;
345 dev_kfree_skb(skb);
346 return 0;
347}
348
349static int teql_master_open(struct net_device *dev)
350{
351 struct Qdisc * q;
352 struct teql_master *m = (void*)dev->priv;
353 int mtu = 0xFFFE;
354 unsigned flags = IFF_NOARP|IFF_MULTICAST;
355
356 if (m->slaves == NULL)
357 return -EUNATCH;
358
359 flags = FMASK;
360
361 q = m->slaves;
362 do {
363 struct net_device *slave = q->dev;
364
365 if (slave == NULL)
366 return -EUNATCH;
367
368 if (slave->mtu < mtu)
369 mtu = slave->mtu;
370 if (slave->hard_header_len > LL_MAX_HEADER)
371 return -EINVAL;
372
373		/* If all the slaves are BROADCAST, the master is BROADCAST;
374		   if all the slaves are PtP, the master is PtP;
375		   otherwise, the master is NBMA.
376		 */
377 if (!(slave->flags&IFF_POINTOPOINT))
378 flags &= ~IFF_POINTOPOINT;
379 if (!(slave->flags&IFF_BROADCAST))
380 flags &= ~IFF_BROADCAST;
381 if (!(slave->flags&IFF_MULTICAST))
382 flags &= ~IFF_MULTICAST;
383 } while ((q = NEXT_SLAVE(q)) != m->slaves);
384
385 m->dev->mtu = mtu;
386 m->dev->flags = (m->dev->flags&~FMASK) | flags;
387 netif_start_queue(m->dev);
388 return 0;
389}
390
391static int teql_master_close(struct net_device *dev)
392{
393 netif_stop_queue(dev);
394 return 0;
395}
396
397static struct net_device_stats *teql_master_stats(struct net_device *dev)
398{
399 struct teql_master *m = (void*)dev->priv;
400 return &m->stats;
401}
402
403static int teql_master_mtu(struct net_device *dev, int new_mtu)
404{
405 struct teql_master *m = (void*)dev->priv;
406 struct Qdisc *q;
407
408 if (new_mtu < 68)
409 return -EINVAL;
410
411 q = m->slaves;
412 if (q) {
413 do {
414 if (new_mtu > q->dev->mtu)
415 return -EINVAL;
416 } while ((q=NEXT_SLAVE(q)) != m->slaves);
417 }
418
419 dev->mtu = new_mtu;
420 return 0;
421}
422
423static __init void teql_master_setup(struct net_device *dev)
424{
425 struct teql_master *master = dev->priv;
426 struct Qdisc_ops *ops = &master->qops;
427
428 master->dev = dev;
429 ops->priv_size = sizeof(struct teql_sched_data);
430
431 ops->enqueue = teql_enqueue;
432 ops->dequeue = teql_dequeue;
433 ops->requeue = teql_requeue;
434 ops->init = teql_qdisc_init;
435 ops->reset = teql_reset;
436 ops->destroy = teql_destroy;
437 ops->owner = THIS_MODULE;
438
439 dev->open = teql_master_open;
440 dev->hard_start_xmit = teql_master_xmit;
441 dev->stop = teql_master_close;
442 dev->get_stats = teql_master_stats;
443 dev->change_mtu = teql_master_mtu;
444 dev->type = ARPHRD_VOID;
445 dev->mtu = 1500;
446 dev->tx_queue_len = 100;
447 dev->flags = IFF_NOARP;
448 dev->hard_header_len = LL_MAX_HEADER;
449 SET_MODULE_OWNER(dev);
450}
451
452static LIST_HEAD(master_dev_list);
453static int max_equalizers = 1;
454module_param(max_equalizers, int, 0);
455MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");
456
457static int __init teql_init(void)
458{
459 int i;
460 int err = -ENODEV;
461
462 for (i = 0; i < max_equalizers; i++) {
463 struct net_device *dev;
464 struct teql_master *master;
465
466 dev = alloc_netdev(sizeof(struct teql_master),
467 "teql%d", teql_master_setup);
468 if (!dev) {
469 err = -ENOMEM;
470 break;
471 }
472
473 if ((err = register_netdev(dev))) {
474 free_netdev(dev);
475 break;
476 }
477
478 master = dev->priv;
479
480 strlcpy(master->qops.id, dev->name, IFNAMSIZ);
481 err = register_qdisc(&master->qops);
482
483 if (err) {
484 unregister_netdev(dev);
485 free_netdev(dev);
486 break;
487 }
488
489 list_add_tail(&master->master_list, &master_dev_list);
490 }
491 return i ? 0 : err;
492}
493
494static void __exit teql_exit(void)
495{
496 struct teql_master *master, *nxt;
497
498 list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {
499
500 list_del(&master->master_list);
501
502 unregister_qdisc(&master->qops);
503 unregister_netdev(master->dev);
504 free_netdev(master->dev);
505 }
506}
507
508module_init(teql_init);
509module_exit(teql_exit);
510
511MODULE_LICENSE("GPL");